package uk.ac.cam.ch.wwmm.oscartokeniser;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xmlcml.euclid.EuclidConstants;
import uk.ac.cam.ch.wwmm.oscar.ont.OntologyTerms;
import uk.ac.cam.ch.wwmm.oscar.terms.TermSets;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscartokeniser/HyphenTokeniser.class */
/**
 * Decides whether, and where, a hyphenated token should be split at a hyphen.
 *
 * <p>The single shared instance is created lazily by {@link #getInstance()} and can be
 * rebuilt with {@link #reinitialise()}. The lazy creation is not synchronised.
 *
 * <p>The split-suffix and no-split-prefix term lists are read from
 * {@code TermSets.getDefaultInstance()} when the instance is constructed; the
 * hyphen-tokenisable term list is read from {@code OntologyTerms.getDefaultInstance()}
 * on each lookup.
 */
public final class HyphenTokeniser {
    private static final Logger LOG = LoggerFactory.getLogger(HyphenTokeniser.class);
    private static HyphenTokeniser myInstance = null;

    /** Prefixes that are skipped at the start of a suffix before it is compared with the split suffixes. */
    private static final Pattern suffixPrefixPattern = Pattern.compile("mono|di|tri|tetra|penta|hexa|hepta|un|de|re|pre");
    /** One or more capitalised words (optionally Mc/Mac-prefixed, optionally possessive) joined by hyphen characters. */
    static Pattern propernounHyphenPattern = Pattern.compile("((Mc|Mac)?[A-Z]\\p{Ll}\\p{Ll}\\p{Ll}+(s'|'s)?(?:-|‐|‑|‒|–|—|―))+(Mc|Mac)?[A-Z]\\p{Ll}\\p{Ll}\\p{Ll}+(s'|'s)?");
    /** Two lowercase ASCII letters, a hyphen character, then one or more lowercase ASCII letters. */
    private static final Pattern lowercaseEitherSidePattern = Pattern.compile("[a-z][a-z][-‐‑‒–—―][a-z]+");

    private final Set<String> splitSuffixes;
    private final Set<String> noSplitPrefixes;
    /** Shortest length found among {@link #noSplitPrefixes}; stays at 1000 when that set is empty. */
    private final int minPrefixLength;
    /** Longest length found among {@link #noSplitPrefixes}; stays at 0 when that set is empty. */
    private final int maxPrefixLength;

    /**
     * Discards the current shared instance and builds a new one, so the term lists are
     * read again from {@code TermSets.getDefaultInstance()}.
     */
    public static void reinitialise() {
        myInstance = null;
        getInstance();
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @return the shared tokeniser instance
     */
    static HyphenTokeniser getInstance() {
        if (myInstance == null) {
            myInstance = new HyphenTokeniser();
        }
        return myInstance;
    }

    /**
     * Copies the split suffixes, keeps a reference to the no-split prefixes, and records
     * the shortest and longest no-split prefix length so that
     * {@link #precededByNoSplitPrefix(String, int)} only tries candidate lengths in that range.
     */
    private HyphenTokeniser() {
        LOG.debug("Initialising hyphen tokeniser... ");
        this.splitSuffixes = new HashSet<String>(TermSets.getDefaultInstance().getSplitSuffixes());
        this.noSplitPrefixes = TermSets.getDefaultInstance().getNoSplitPrefixes();
        int shortest = 1000;
        int longest = 0;
        for (String prefix : this.noSplitPrefixes) {
            shortest = Math.min(shortest, prefix.length());
            longest = Math.max(longest, prefix.length());
        }
        this.minPrefixLength = shortest;
        this.maxPrefixLength = longest;
        LOG.debug("hyphen tokeniser initialised");
    }

    /**
     * Finds the rightmost hyphen in {@code str} at which the token should be split.
     *
     * @param str the token to examine
     * @return the index of the hyphen to split at, or -1 if no hyphen qualifies
     */
    public static int indexOfSplittableHyphen(String str) {
        return getInstance().indexOfSplittableHyphenInternal(str);
    }

    /**
     * Scans {@code str} from right to left and returns the first hyphen position that
     * satisfies the splitting rules.
     *
     * <p>For each hyphen character (as listed in {@code StringTools.hyphens}) the rules are
     * applied in this order:
     * <ol>
     *   <li>if the whole term has balanced brackets but the text after the hyphen does not,
     *       the hyphen is skipped;</li>
     *   <li>split if the hyphen is an en dash or em dash, if the suffix is (or starts with)
     *       a split suffix, or if the whole term matches the proper-noun pattern;</li>
     *   <li>otherwise skip a hyphen at index 1 or one preceded by a no-split prefix;</li>
     *   <li>otherwise split only if the two halves form a hyphen-tokenisable term or the
     *       hyphen has lowercase letters on both sides.</li>
     * </ol>
     *
     * @param str the token to examine
     * @return the index of the hyphen to split at, or -1 if no hyphen qualifies
     */
    private int indexOfSplittableHyphenInternal(String str) {
        boolean wholeTermBalanced = StringTools.bracketsAreBalanced(str);
        // Index 0 and the final two characters are never considered as split points.
        // A for-loop guarantees the index moves on every "continue"; a while-loop with the
        // decrement at the bottom would spin forever on the skip rules below.
        for (int i = str.length() - 3; i > 0; i--) {
            if (StringTools.hyphens.indexOf(str.codePointAt(i)) == -1) {
                continue;
            }
            if (wholeTermBalanced && !StringTools.bracketsAreBalanced(str.substring(i + 1))) {
                // Splitting here would leave an unbalanced bracket in the right-hand part.
                continue;
            }
            if (checkEnDash(str, i)
                    || checkEmDash(str, i)
                    || suffixContainedInSplitSuffix(str, i)
                    || termMatchesPropernounPattern(str)
                    || suffixStartsWithSplitSuffix(str, i)) {
                return i;
            }
            if (i == 1) {
                // Also keeps lowercaseEitherSideOfHyphen (which looks back two characters) in range.
                continue;
            }
            if (precededByNoSplitPrefix(str, i)) {
                continue;
            }
            // Without a positive signal, leave this hyphen alone and try the next one to the left.
            if (termContainedInHyphTokable(str, i) || lowercaseEitherSideOfHyphen(str, i)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Tests whether the text from two characters before the hyphen to the end of the string
     * is exactly two lowercase letters, a hyphen, and one or more lowercase letters.
     *
     * @param str the token
     * @param i   index of the hyphen; must be at least 2
     * @return true if the tail of the token matches that shape
     */
    boolean lowercaseEitherSideOfHyphen(String str, int i) {
        return lowercaseEitherSidePattern.matcher(str.substring(i - 2)).matches();
    }

    /**
     * Tests whether the characters immediately before the hyphen form one of the no-split
     * prefixes (compared in lower case).
     *
     * @param str the token
     * @param i   index of the hyphen
     * @return true if some no-split prefix ends exactly at the hyphen
     */
    private boolean precededByNoSplitPrefix(String str, int i) {
        for (int len = this.minPrefixLength; len <= this.maxPrefixLength && len <= i; len++) {
            if (this.noSplitPrefixes.contains(str.substring(i - len, i).toLowerCase())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether the text after the hyphen begins with a split suffix, after skipping
     * one leading prefix matched by {@link #suffixPrefixPattern} if present.
     *
     * <p>The candidate is shortened from the right one character at a time and looked up
     * in the split suffixes until fewer than three characters remain.
     *
     * @param str the token
     * @param i   index of the hyphen
     * @return true if a leading portion (three or more characters) of the suffix is a split suffix
     */
    boolean suffixStartsWithSplitSuffix(String str, int i) {
        String candidate = str.substring(i + 1).toLowerCase();
        Matcher prefixMatcher = suffixPrefixPattern.matcher(candidate);
        if (prefixMatcher.lookingAt()) {
            candidate = candidate.substring(prefixMatcher.end());
        }
        while (candidate.length() >= 3) {
            if (this.splitSuffixes.contains(candidate)) {
                return true;
            }
            candidate = candidate.substring(0, candidate.length() - 1);
        }
        return false;
    }

    /**
     * Tests whether the whole token matches {@link #propernounHyphenPattern}.
     *
     * @param str the token
     * @return true if the entire token matches the pattern
     */
    boolean termMatchesPropernounPattern(String str) {
        return propernounHyphenPattern.matcher(str).matches();
    }

    /**
     * Tests whether the complete text after the hyphen (lower-cased) is a split suffix.
     *
     * @param str the token
     * @param i   index of the hyphen
     * @return true if the suffix is in the split-suffix set
     */
    boolean suffixContainedInSplitSuffix(String str, int i) {
        return this.splitSuffixes.contains(str.substring(i + 1).toLowerCase());
    }

    /**
     * Tests whether the token, with the hyphen replaced by {@code EuclidConstants.S_SPACE}
     * and both halves passed through {@code StringTools.normaliseName}, is in the
     * hyphen-tokenisable term set of {@code OntologyTerms.getDefaultInstance()}.
     *
     * @param str the token
     * @param i   index of the hyphen
     * @return true if the rejoined, normalised term is in that set
     */
    private boolean termContainedInHyphTokable(String str, int i) {
        StringBuilder joined = new StringBuilder(str.length());
        joined.append(StringTools.normaliseName(str.substring(0, i)));
        joined.append(EuclidConstants.S_SPACE);
        joined.append(StringTools.normaliseName(str.substring(i + 1)));
        return OntologyTerms.getDefaultInstance().getHyphTokable().contains(joined.toString());
    }

    /**
     * Tests whether the character at {@code i} equals {@code StringTools.emDash}.
     *
     * @param str the token
     * @param i   index to test
     * @return true if the one-character substring at {@code i} equals the em dash constant
     */
    private boolean checkEmDash(String str, int i) {
        return str.substring(i, i + 1).equals(StringTools.emDash);
    }

    /**
     * Tests whether the character at {@code i} equals {@code StringTools.enDash}.
     *
     * @param str the token
     * @param i   index to test
     * @return true if the one-character substring at {@code i} equals the en dash constant
     */
    private boolean checkEnDash(String str, int i) {
        return str.substring(i, i + 1).equals(StringTools.enDash);
    }
}
