package uk.ac.cam.ch.wwmm.oscartokeniser;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import nu.xom.Element;
import org.apache.commons.cli.HelpFormatter;
import org.xmlcml.euclid.EuclidConstants;
import uk.ac.cam.ch.wwmm.oscar.document.IProcessingDocument;
import uk.ac.cam.ch.wwmm.oscar.document.ITokeniser;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscar.terms.TermSets;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;
import uk.ac.cam.ch.wwmm.oscar.types.BioTag;
import uk.ac.cam.ch.wwmm.oscar.types.BioType;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscartokeniser/Tokeniser.class */
public final class Tokeniser implements ITokeniser {
    // Matches a token that consists solely of a parenthesised "o" or a short
    // roman-numeral-like run of i/v/x characters, e.g. "(ii)" or "(iv)".
    // The flag value 2 is Pattern.CASE_INSENSITIVE.
    private static Pattern oxidationStatePattern = Pattern.compile("\\((o|i{1,4}|i{0,3}[xv]|[xv]i{0,4})\\)", 2);
    // Same alternation as above, but allowing any text before the parenthesised
    // part. Not referenced by any method of this class.
    private static Pattern oxidationStateEndPattern = Pattern.compile(".*\\((o|i{1,4}|i{0,3}[xv]|[xv]i{0,4})\\)", 2);
    // Matches at least one character followed by "(TM)", "(R)", "((TM))" or
    // "((R))" at the end of the input; group 1 is that trailing marker.
    private static Pattern trademarkPattern = Pattern.compile(".+?(\\((TM|R)\\)|\\(\\((TM|R)\\)\\))");
    private static Pattern tokenPattern = Pattern.compile("[^\\s \u0085 \u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006 \u2008\u2009\u200a\u2028\u2029 \u205f\u3000]+");
    // Zero or more apostrophe, backtick or prime characters.
    private static String primesRe = "['`′″‴]*";
    // One prefix element, followed by optional primes. The alternatives are:
    //  - digits, optional primes, and an optional single letter from RSEZDLH;
    //  - a parenthesised single letter/sign, a hyphen followed by one of DLRSEZ,
    //    or an empty pair of parentheses;
    //  - C, N, O, S or Se, optionally followed by digits;
    //  - optional digits followed by one character from StringTools.lowerGreek;
    //  - one of the fixed words listed literally (cis, trans, ortho, ... cine).
    private static String locantRe = "(\\d+" + primesRe + "[RSEZDLH]?|\\(([RSEZDLH±]|\\+|" + StringTools.hyphensRegex + "[DLRSEZ]|)\\)|([CNOS]|Se)\\d*|\\d*[" + StringTools.lowerGreek + "]|cis|trans|o(rtho)?|m(eta)?|p(ara)?|asym|sym|sec|tert|catena|closo|enantio|ent|endo|exo|fac|mer|gluco|nido|aci|erythro|threo|arachno|meso|syn|anti|tele|cine)" + primesRe;
    // A comma-separated list of one or more locantRe elements, wrapped in a group.
    private static String prefixRe = "(" + locantRe + "(," + locantRe + ")*)";
    // prefixRe, then one character from a class built from StringTools.hyphens,
    // then any run of non-whitespace. NOTE(review): this relies on
    // EuclidConstants.S_LSQUARE being "[" so that it opens the character class
    // closed by the literal "]" -- confirm.
    private static Pattern prefixPattern = Pattern.compile(prefixRe + EuclidConstants.S_LSQUARE + StringTools.hyphens + "](\\S*)");
    // Lazily created shared instance; see getDefaultInstance().
    private static Tokeniser defaultInstance;
    // Classifier consulted by rawSplitToken for the "bondRegex" check.
    private TokenClassifier tokenClassifier;

    /**
     * Returns the shared {@code Tokeniser}, creating it on first use from
     * {@link TokenClassifier#getDefaultInstance()}.
     *
     * <p>The method is {@code synchronized}, so the instance is created at most once.
     *
     * @return the shared tokeniser instance
     */
    public static synchronized Tokeniser getDefaultInstance() {
        Tokeniser shared = defaultInstance;
        if (shared != null) {
            return shared;
        }
        shared = new Tokeniser(TokenClassifier.getDefaultInstance());
        defaultInstance = shared;
        return shared;
    }

    /**
     * Creates a tokeniser that uses the given classifier.
     *
     * @param tokenClassifier classifier stored on this instance; it is consulted
     *        when deciding whether a hyphenated token should be left unsplit
     */
    public Tokeniser(TokenClassifier tokenClassifier) {
        this.tokenClassifier = tokenClassifier;
    }

    /**
     * Tokenises a standalone string: no owning document, offset 0 and no
     * source element.
     *
     * @param str the text to tokenise
     * @return the resulting token sequence
     */
    public TokenSequence tokenise(String str) {
        final IProcessingDocument noDocument = null;
        final Element noElement = null;
        final int noOffset = 0;
        return tokenise(str, noDocument, noOffset, noElement);
    }

    /**
     * Tokenises {@code str} into a {@link TokenSequence}.
     *
     * <p>The text is first cut into whitespace-delimited chunks using
     * {@code tokenPattern}. Each chunk is then repeatedly offered to
     * {@code splitToken}; whenever a split is produced, the pieces replace the
     * chunk and are themselves re-examined, left to right. Tokens whose surface
     * is {@code null} or empty are dropped before the sequence is built.
     *
     * @param str the text to tokenise
     * @param iProcessingDocument document passed to every created token and to
     *        the sequence; may be {@code null}
     * @param i offset added to every match position to obtain token start/end
     * @param element element attached to the resulting sequence; may be {@code null}
     * @return the token sequence built by {@link #indexTokensAndMakeTokenSequence}
     */
    @Override // uk.ac.cam.ch.wwmm.oscar.document.ITokeniser
    public TokenSequence tokenise(String str, IProcessingDocument iProcessingDocument, int i, Element element) {
        // Work queue of tokens still to be examined, in document order.
        LinkedList<Token> pending = new LinkedList<Token>();
        Matcher matcher = tokenPattern.matcher(str);
        while (matcher.find()) {
            pending.add(new Token(matcher.group(), matcher.start() + i, matcher.end() + i, iProcessingDocument, new BioType(BioTag.O), null));
        }

        List<Token> tokens = new ArrayList<Token>();
        while (!pending.isEmpty()) {
            // Always consume from the head: this avoids the O(n) positional
            // lookups an index-based loop over a LinkedList would incur.
            Token candidate = pending.removeFirst();
            List<Token> pieces = splitToken(candidate);
            if (pieces != null) {
                // Put the pieces back at the front so they are re-examined next,
                // preserving left-to-right order.
                pending.addAll(0, pieces);
                continue;
            }
            String surface = candidate.getSurface();
            if (surface != null && !"".equals(surface)) {
                tokens.add(candidate);
            }
        }
        return indexTokensAndMakeTokenSequence(str, iProcessingDocument, i, element, tokens);
    }

    /**
     * Numbers the given tokens, optionally registers them with the document,
     * and wraps them in a {@link TokenSequence}.
     *
     * <p>Steps, in order: each token receives its list position as index; if
     * the document is non-null and its start-offset map is non-null, each token
     * is put into the document's maps keyed by start and by end offset; a
     * sequence is created over the list; each token is pointed at that
     * sequence; finally the element is set on the sequence.
     *
     * @param str the source text passed to the sequence
     * @param iProcessingDocument owning document, or {@code null}
     * @param i offset passed to the sequence
     * @param element element to attach to the sequence
     * @param list tokens to index and wrap
     * @return the newly created sequence
     */
    public TokenSequence indexTokensAndMakeTokenSequence(String str, IProcessingDocument iProcessingDocument, int i, Element element, List<Token> list) {
        int position = 0;
        for (Token current : list) {
            current.setIndex(position);
            position += 1;
        }

        boolean registerWithDocument = iProcessingDocument != null && iProcessingDocument.getTokensByStart() != null;
        if (registerWithDocument) {
            Iterator<Token> cursor = list.iterator();
            while (cursor.hasNext()) {
                Token current = cursor.next();
                iProcessingDocument.getTokensByStart().put(Integer.valueOf(current.getStart()), current);
                iProcessingDocument.getTokensByEnd().put(Integer.valueOf(current.getEnd()), current);
            }
        }

        TokenSequence sequence = new TokenSequence(str, i, iProcessingDocument, list);
        for (Token current : list) {
            current.setTokenSequence(sequence);
        }
        sequence.setElem(element);
        return sequence;
    }

    /**
     * Splits a token via {@code rawSplitToken}, accepting the result only if
     * it is a real split.
     *
     * @param token the token to examine
     * @return the pieces when more than one of them has a positive
     *         (end - start) length; otherwise {@code null}
     */
    private List<Token> splitToken(Token token) {
        final List<Token> pieces = rawSplitToken(token);
        if (pieces == null) {
            return null;
        }
        int nonEmptyPieces = 0;
        Iterator<Token> cursor = pieces.iterator();
        while (cursor.hasNext()) {
            Token piece = cursor.next();
            int length = piece.getEnd() - piece.getStart();
            if (length > 0) {
                nonEmptyPieces += 1;
            }
        }
        // A "split" with at most one non-empty piece changes nothing; reject it.
        return nonEmptyPieces > 1 ? pieces : null;
    }

    /**
     * Applies the splitting rules to a single token and returns the first
     * split that applies.
     *
     * <p>The rules are tried strictly in the order written; the first one whose
     * condition holds decides the result. The returned pieces may include an
     * empty one (for example when the split point is the token's first
     * character); the caller, {@code splitToken}, rejects splits that do not
     * yield more than one non-empty piece.
     *
     * @param token the token to examine
     * @return a list of two or three pieces, or {@code null} when no rule applies
     */
    private List<Token> rawSplitToken(Token token) {
        // Tokens listed as abbreviations (compared in lower case) are never split.
        if (TermSets.getDefaultInstance().getAbbreviations().contains(token.getSurface().toLowerCase())) {
            return null;
        }
        // The surface without its final character, or "" when the surface has at
        // most two characters. The contains() checks below use this, so a
        // separator in last position does not trigger them.
        String substring = token.getSurface().length() > 2 ? token.getSurface().substring(0, token.getSurface().length() - 1) : "";
        // Never split: tokens starting with "$", tokens spanning fewer than two
        // offsets, a token equal to HelpFormatter.DEFAULT_LONG_OPT_PREFIX
        // (presumably "--" -- confirm), or a token that is entirely an oxidation state.
        if (token.getSurface().startsWith("$") || token.getEnd() - token.getStart() < 2 || token.getSurface().equals(HelpFormatter.DEFAULT_LONG_OPT_PREFIX) || oxidationStatePattern.matcher(token.getSurface()).matches()) {
            return null;
        }
        // Leading opening bracket: split it off when StringTools reports the token
        // as bracketed or as lacking a close bracket.
        if ("([{".indexOf(token.getSurface().codePointAt(0)) != -1 && (StringTools.isBracketed(token.getSurface()) || StringTools.isLackingCloseBracket(token.getSurface()))) {
            return splitAt(token, token.getStart() + 1);
        }
        // Trailing closing bracket: split it off when StringTools reports the token
        // as bracketed or as lacking an open bracket.
        if (")]}".indexOf(token.getSurface().codePointAt(token.getSurface().length() - 1)) != -1 && (StringTools.isBracketed(token.getSurface()) || StringTools.isLackingOpenBracket(token.getSurface()))) {
            return splitAt(token, token.getEnd() - 1);
        }
        // Leading relation sign or quotation mark: split off the first character.
        if ("=<>≠≡≢≣≤≥≦≧≨≩≪≫\"'‘’‚‛“”„‟".indexOf(token.getSurface().codePointAt(0)) != -1) {
            return splitAt(token, token.getStart() + 1);
        }
        // Trailing punctuation, trademark sign, hyphen or quotation mark: split off
        // the last character -- unless that character equals EuclidConstants.S_APOS
        // and the whole token has the shape "[A-Z][a-z]+s'" (e.g. "Charles'").
        if (".,;:!?™®-\"'‘’‚‛“”„‟".indexOf(token.getSurface().codePointAt(token.getSurface().length() - 1)) != -1 && (!token.getSurface().substring(token.getSurface().length() - 1).equals(EuclidConstants.S_APOS) || !token.getSurface().matches("[A-Z][a-z]+s'"))) {
            return splitAt(token, token.getEnd() - 1);
        }
        // Trailing "(TM)" / "(R)" style marker: split just before the marker.
        Matcher matcher = trademarkPattern.matcher(token.getSurface());
        if (matcher.matches() && matcher.start(1) > 0) {
            return splitAt(token, token.getStart() + matcher.start(1));
        }
        // Single-character separators below are isolated as their own middle
        // piece, using the first occurrence in the surface.
        // NOTE(review): S_LANGLE / S_RANGLE / S_COLON are presumably "<", ">" and
        // ":" -- confirm against EuclidConstants.
        if (substring.contains(EuclidConstants.S_LANGLE)) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf(EuclidConstants.S_LANGLE), token.getStart() + token.getSurface().indexOf(EuclidConstants.S_LANGLE) + 1);
        }
        if (substring.contains(EuclidConstants.S_RANGLE)) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf(EuclidConstants.S_RANGLE), token.getStart() + token.getSurface().indexOf(EuclidConstants.S_RANGLE) + 1);
        }
        if (substring.contains("/")) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf("/"), token.getStart() + token.getSurface().indexOf("/") + 1);
        }
        // The S_COLON separator is only split on when brackets are balanced both in
        // the whole token and in the part after the separator.
        if (substring.contains(EuclidConstants.S_COLON) && StringTools.bracketsAreBalanced(token.getSurface()) && StringTools.bracketsAreBalanced(token.getSurface().substring(token.getSurface().indexOf(EuclidConstants.S_COLON) + 1))) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf(EuclidConstants.S_COLON), token.getStart() + token.getSurface().indexOf(EuclidConstants.S_COLON) + 1);
        }
        if (substring.contains("+")) {
            int indexOf = token.getSurface().indexOf("+");
            // "+" immediately followed by a hyphen (with more text after it):
            // isolate the hyphen, leaving the "+" attached to the left piece.
            if (indexOf < token.getSurface().length() - 2 && StringTools.isHyphen(token.getSurface().substring(indexOf + 1, indexOf + 2))) {
                return splitAt(token, token.getStart() + indexOf + 1, token.getStart() + indexOf + 2);
            }
            // "+" at the very start or very end: isolate it unconditionally.
            if (indexOf <= 0 || indexOf >= token.getSurface().length() - 1) {
                return splitAt(token, token.getStart() + indexOf, token.getStart() + indexOf + 1);
            }
            // Interior "+": isolate it only if the token does not end in "-" and
            // brackets are balanced in the whole token and in the part after "+".
            if (!token.getSurface().endsWith("-") && StringTools.bracketsAreBalanced(token.getSurface()) && StringTools.bracketsAreBalanced(token.getSurface().substring(indexOf + 1))) {
                return splitAt(token, token.getStart() + indexOf, token.getStart() + indexOf + 1);
            }
            // Otherwise fall through to the remaining rules.
        }
        // StringTools.midElipsis is isolated as a one-character middle piece.
        if (substring.contains(StringTools.midElipsis)) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf(StringTools.midElipsis), token.getStart() + token.getSurface().indexOf(StringTools.midElipsis) + 1);
        }
        // DEFAULT_LONG_OPT_PREFIX is isolated as a two-character middle piece.
        if (substring.contains(HelpFormatter.DEFAULT_LONG_OPT_PREFIX)) {
            return splitAt(token, token.getStart() + token.getSurface().indexOf(HelpFormatter.DEFAULT_LONG_OPT_PREFIX), token.getStart() + token.getSurface().indexOf(HelpFormatter.DEFAULT_LONG_OPT_PREFIX) + 2);
        }
        // Finally, look for a hyphen that HyphenTokeniser considers splittable.
        int indexOfSplittableHyphen = HyphenTokeniser.indexOfSplittableHyphen(token.getSurface());
        // Veto the hyphen split when the token has no two consecutive lower-case
        // letters, has at least one upper-case letter, and the classifier reports
        // a token-level match for "bondRegex".
        if (indexOfSplittableHyphen != -1 && !token.getSurface().matches(".*[a-z][a-z].*") && token.getSurface().matches(".*[A-Z].*") && this.tokenClassifier.isTokenLevelRegexMatch(token.getSurface(), "bondRegex")) {
            indexOfSplittableHyphen = -1;
        }
        if (indexOfSplittableHyphen == -1) {
            return null;
        }
        // Token matching prefixPattern (and not ending in "NMR"): two-way split
        // after the hyphen, so the hyphen stays attached to the left piece.
        if (!token.getSurface().endsWith("NMR") && prefixPattern.matcher(token.getSurface()).matches()) {
            return splitAt(token, token.getStart() + indexOfSplittableHyphen + 1);
        }
        // Default: three-way split with the hyphen as its own piece.
        return splitAt(token, token.getStart() + indexOfSplittableHyphen, token.getStart() + indexOfSplittableHyphen + 1);
    }

    /**
     * Cuts a token into two pieces at the given offset.
     *
     * <p>Both pieces copy the original token's document, bio type and
     * named-entity element.
     *
     * @param token the token to cut
     * @param i the cut position, in the same offset coordinates as
     *        {@code token.getStart()} / {@code token.getEnd()}
     * @return a two-element list: the piece before {@code i} and the piece from
     *         {@code i} onwards
     */
    public List<Token> splitAt(Token token, int i) {
        final int cut = i - token.getStart();
        final String surface = token.getSurface();

        Token head = new Token(surface.substring(0, cut), token.getStart(), i, token.getDoc(), token.getBioType(), token.getNeElem());
        Token tail = new Token(surface.substring(cut), i, token.getEnd(), token.getDoc(), token.getBioType(), token.getNeElem());

        List<Token> pieces = new LinkedList<Token>();
        pieces.add(head);
        pieces.add(tail);
        return pieces;
    }

    /**
     * Cuts a token into three pieces at the two given offsets.
     *
     * <p>All pieces copy the original token's document, bio type and
     * named-entity element.
     *
     * @param token the token to cut
     * @param i start of the middle piece, in the same offset coordinates as
     *        {@code token.getStart()}
     * @param i2 end (exclusive) of the middle piece, in the same coordinates
     * @return a three-element list: text before {@code i}, text in
     *         {@code [i, i2)}, and text from {@code i2} onwards
     */
    private List<Token> splitAt(Token token, int i, int i2) {
        final int firstCut = i - token.getStart();
        final int secondCut = i2 - token.getStart();
        final String surface = token.getSurface();

        Token left = new Token(surface.substring(0, firstCut), token.getStart(), i, token.getDoc(), token.getBioType(), token.getNeElem());
        Token middle = new Token(surface.substring(firstCut, secondCut), i, i2, token.getDoc(), token.getBioType(), token.getNeElem());
        Token right = new Token(surface.substring(secondCut), i2, token.getEnd(), token.getDoc(), token.getBioType(), token.getNeElem());

        List<Token> pieces = new LinkedList<Token>();
        pieces.add(left);
        pieces.add(middle);
        pieces.add(right);
        return pieces;
    }
}
