package edu.northwestern.at.morphadorner;

import edu.northwestern.at.morphadorner.corpuslinguistics.adornedword.AdornedWord;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.LexiconFactory;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.UsesLexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.namerecognizer.Names;
import edu.northwestern.at.morphadorner.corpuslinguistics.namestandardizer.NameStandardizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.namestandardizer.NameStandardizerFactory;
import edu.northwestern.at.morphadorner.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.InvalidRuleException;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.PartOfSpeechTagger;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.transitionmatrix.TransitionMatrix;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingmapper.SpellingMapper;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingmapper.SpellingMapperFactory;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingstandardizer.SpellingStandardizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingstandardizer.SpellingStandardizerFactory;
import edu.northwestern.at.morphadorner.xgtagger.XGOptions;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.Formatters;
import edu.northwestern.at.utils.RomanNumeralUtils;
import edu.northwestern.at.utils.SingleTagTaggedStrings;
import edu.northwestern.at.utils.StringUtils;
import edu.northwestern.at.utils.TaggedStrings;
import edu.northwestern.at.utils.TextFile;
import edu.northwestern.at.utils.UTF8Properties;
import edu.northwestern.at.utils.UnicodeReader;
import edu.northwestern.at.utils.logger.UsesLogger;
import edu.northwestern.at.utils.xml.DOMUtils;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/* loaded from: input_file:edu/northwestern/at/morphadorner/MorphAdornerUtils.class */
/**
 * Static helper methods used while adorning texts: loading lexicons, tagger
 * rules and spelling data, patching up XML documents, and computing lemmata,
 * standardized spellings and simple counts.
 *
 * <p>All methods are static; the constructor is protected so the class is not
 * instantiated directly.</p>
 */
public class MorphAdornerUtils {
    /** Matches a leading underscore followed by two ASCII capital letters. */
    protected static Pattern underlineCapCapPattern = Pattern.compile("^_([ABCDEFGHIJKLMNOPQRSTUVWXYZ])([ABCDEFGHIJKLMNOPQRSTUVWXYZ])");

    /**
     * Shared matcher for {@link #underlineCapCapPattern}.
     *
     * <p>Retained only so existing subclasses keep compiling. A
     * {@link Matcher} is not safe for concurrent use, so
     * {@link #fixSpelling(String)} creates its own matcher per call instead
     * of resetting this one.</p>
     */
    protected static final Matcher underlineCapCapMatcher = underlineCapCapPattern.matcher("");

    /** Runtime used by {@link #logMemoryUsage} to read memory figures. */
    protected static Runtime runTime = Runtime.getRuntime();

    /** Superscript texts that get the marker when the preceding text ends in " y". */
    private static final String[] SUPERSCRIPTS_AFTER_Y = {"e", "t", "c", "en", "ere", "f", "i", "m", "n", "o", "u"};

    /** Superscript texts that get the marker when the preceding text ends in " w". */
    private static final String[] SUPERSCRIPTS_AFTER_W = {"ch", "t", "th"};

    /**
     * Counts the {@code pb} elements in a document.
     *
     * @param document the document to inspect
     * @return the number of elements whose tag name is {@code pb}
     */
    public static int countPageBreaks(Document document) {
        return document.getElementsByTagName("pb").getLength();
    }

    /**
     * Creates a spelling mapper from the given properties.
     *
     * @param uTF8Properties properties handed to {@link SpellingMapperFactory}
     * @return whatever the factory produces
     * @throws IOException declared for callers; propagated from the factory if it raises one
     */
    public static SpellingMapper createSpellingMapper(UTF8Properties uTF8Properties) throws IOException {
        return SpellingMapperFactory.newSpellingMapper(uTF8Properties);
    }

    /**
     * Creates a name standardizer, optionally seeds it with names from a
     * lexicon, and attaches the logger when the standardizer supports one.
     *
     * @param lexicon              lexicon to load names from; skipped when {@code null}
     * @param morphAdornerSettings settings supplying the factory properties and message strings
     * @param morphAdornerLogger   logger used for the "Loaded_names" message
     * @return the standardizer, or {@code null} when the factory returns {@code null}
     * @throws IOException declared for callers
     */
    public static NameStandardizer createNameStandardizer(Lexicon lexicon, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws IOException {
        NameStandardizer standardizer = NameStandardizerFactory.newNameStandardizer(morphAdornerSettings.properties);
        if (standardizer == null) {
            return null;
        }
        if (lexicon != null) {
            long startTime = System.currentTimeMillis();
            standardizer.loadNamesFromLexicon(lexicon);
            morphAdornerLogger.println("Loaded_names", new Object[]{Formatters.formatIntegerWithCommas(standardizer.getNumberOfNames()), durationString(morphAdornerSettings, startTime)});
        }
        if (standardizer instanceof UsesLogger) {
            ((UsesLogger) standardizer).setLogger(morphAdornerLogger.getLogger());
        }
        return standardizer;
    }

    /**
     * Creates a lexicon and loads the word lexicon into it when a URL is configured.
     *
     * @param morphAdornerSettings settings supplying {@code properties} and {@code wordLexiconURL}
     * @param morphAdornerLogger   logger used for the "Loaded_word_lexicon" message
     * @return the new lexicon
     * @throws IOException if loading the lexicon fails
     */
    public static Lexicon loadWordLexicon(MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws IOException {
        long startTime = System.currentTimeMillis();
        Lexicon lexicon = LexiconFactory.newLexicon(morphAdornerSettings.properties);
        if (morphAdornerSettings.wordLexiconURL != null) {
            lexicon.loadLexicon(morphAdornerSettings.wordLexiconURL, "utf-8");
        }
        morphAdornerLogger.println("Loaded_word_lexicon", new Object[]{Formatters.formatIntegerWithCommas(lexicon.getLexiconSize()), durationString(morphAdornerSettings, startTime)});
        // Guarded like the standardizer factories in this class, so a lexicon
        // without logger support no longer triggers a ClassCastException.
        if (lexicon instanceof UsesLogger) {
            ((UsesLogger) lexicon).setLogger(morphAdornerLogger.getLogger());
        }
        return lexicon;
    }

    /**
     * Creates a lexicon and loads the suffix lexicon into it when a URL is configured.
     *
     * @param morphAdornerSettings settings supplying {@code properties} and {@code suffixLexiconURL}
     * @param morphAdornerLogger   logger used for the "Loaded_suffix_lexicon" message
     * @return the new lexicon
     * @throws IOException if loading the lexicon fails
     */
    public static Lexicon loadSuffixLexicon(MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws IOException {
        long startTime = System.currentTimeMillis();
        Lexicon lexicon = LexiconFactory.newLexicon(morphAdornerSettings.properties);
        if (morphAdornerSettings.suffixLexiconURL != null) {
            lexicon.loadLexicon(morphAdornerSettings.suffixLexiconURL, "utf-8");
        }
        morphAdornerLogger.println("Loaded_suffix_lexicon", new Object[]{Formatters.formatIntegerWithCommas(lexicon.getLexiconSize()), durationString(morphAdornerSettings, startTime)});
        // Same guard as loadWordLexicon: only attach a logger when supported.
        if (lexicon instanceof UsesLogger) {
            ((UsesLogger) lexicon).setLogger(morphAdornerLogger.getLogger());
        }
        return lexicon;
    }

    /**
     * Creates a transition matrix and, when a URL is configured and the tagger
     * uses transition probabilities, loads it and installs it in the tagger.
     *
     * @param partOfSpeechTagger   tagger that may receive the matrix
     * @param morphAdornerSettings settings supplying {@code transitionMatrixURL}
     * @param morphAdornerLogger   logger used for the "Loaded_transition_matrix" message
     * @return the matrix; it is freshly constructed and not loaded when the conditions above are not met
     * @throws IOException if loading the matrix fails
     */
    public static TransitionMatrix loadTransitionMatrix(PartOfSpeechTagger partOfSpeechTagger, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws IOException {
        TransitionMatrix matrix = new TransitionMatrix();
        if (morphAdornerSettings.transitionMatrixURL != null && partOfSpeechTagger.usesTransitionProbabilities()) {
            long startTime = System.currentTimeMillis();
            matrix.loadTransitionMatrix(morphAdornerSettings.transitionMatrixURL, "utf-8", '\t');
            partOfSpeechTagger.setTransitionMatrix(matrix);
            morphAdornerLogger.println("Loaded_transition_matrix", new Object[]{durationString(morphAdornerSettings, startTime)});
            matrix.setLogger(morphAdornerLogger.getLogger());
        }
        return matrix;
    }

    /**
     * Loads context rules and lexical rules into the tagger, each only when
     * its URL is configured and the tagger reports that it uses that rule kind.
     *
     * @param partOfSpeechTagger   tagger receiving the rules
     * @param morphAdornerSettings settings supplying {@code contextRulesURL} and {@code lexicalRulesURL}
     * @param morphAdornerLogger   not used by this method
     * @throws InvalidRuleException declared for callers; propagated from the tagger
     * @throws IOException          declared for callers
     */
    public static void loadTaggerRules(PartOfSpeechTagger partOfSpeechTagger, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws InvalidRuleException, IOException {
        if (morphAdornerSettings.contextRulesURL != null && partOfSpeechTagger.usesContextRules()) {
            partOfSpeechTagger.setContextRules(new TextFile(morphAdornerSettings.contextRulesURL, "utf-8").toArray());
        }
        if (morphAdornerSettings.lexicalRulesURL != null && partOfSpeechTagger.usesLexicalRules()) {
            partOfSpeechTagger.setLexicalRules(new TextFile(morphAdornerSettings.lexicalRulesURL, "utf-8").toArray());
        }
    }

    /**
     * Creates a spelling standardizer and fills it with standard spellings,
     * names, and alternate spellings according to the settings.
     *
     * <p>After each alternate-spellings source is loaded, the number of
     * entries added by that source (current total minus previous total) is
     * logged.</p>
     *
     * @param lexicon              lexicon given to the standardizer when it implements {@link UsesLexicon}
     * @param names                source of first names, surnames and place names added as standard spellings
     * @param morphAdornerSettings settings supplying the factory properties and data URLs
     * @param morphAdornerLogger   logger used for progress messages
     * @return the standardizer, or {@code null} when the factory returns {@code null}
     * @throws IOException if loading spelling data fails
     */
    public static SpellingStandardizer createSpellingStandardizer(Lexicon lexicon, Names names, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) throws IOException {
        SpellingStandardizer standardizer = SpellingStandardizerFactory.newSpellingStandardizer(morphAdornerSettings.properties);
        if (standardizer == null) {
            return null;
        }
        long startTime = System.currentTimeMillis();
        if (standardizer instanceof UsesLexicon) {
            ((UsesLexicon) standardizer).setLexicon(lexicon);
        }
        if (morphAdornerSettings.spellingsURL != null) {
            standardizer.loadStandardSpellings(morphAdornerSettings.spellingsURL, "utf-8");
            morphAdornerLogger.println("Loaded_standard_spellings", new Object[]{Formatters.formatIntegerWithCommas(standardizer.getNumberOfStandardSpellings()), durationString(morphAdornerSettings, startTime)});
        }
        standardizer.addStandardSpellings(names.getFirstNames());
        standardizer.addStandardSpellings(names.getSurnames());
        standardizer.addStandardSpellings(names.getPlaceNames().keySet());
        if (morphAdornerSettings.alternateSpellingsURLs != null) {
            int previousTotal = 0;
            for (int k = 0; k < morphAdornerSettings.alternateSpellingsURLs.length; k++) {
                long sourceStart = System.currentTimeMillis();
                standardizer.loadAlternativeSpellings(morphAdornerSettings.alternateSpellingsURLs[k], "utf-8", "\t");
                morphAdornerLogger.println("Loaded_alternate_spellings", new Object[]{Formatters.formatIntegerWithCommas(standardizer.getNumberOfAlternateSpellings() - previousTotal), durationString(morphAdornerSettings, sourceStart)});
                previousTotal = standardizer.getNumberOfAlternateSpellings();
            }
        }
        if (morphAdornerSettings.alternateSpellingsByWordClassURLs != null) {
            int[] previousTotals = {0, 0};
            for (int k = 0; k < morphAdornerSettings.alternateSpellingsByWordClassURLs.length; k++) {
                long sourceStart = System.currentTimeMillis();
                standardizer.loadAlternativeSpellingsByWordClass(morphAdornerSettings.alternateSpellingsByWordClassURLs[k], "utf-8");
                int[] totals = standardizer.getNumberOfAlternateSpellingsByWordClass();
                // The message receives the delta of element [1] first, then of element [0].
                morphAdornerLogger.println("Loaded_alternate_spellings_by_word_class", new Object[]{Formatters.formatIntegerWithCommas(totals[1] - previousTotals[1]), Formatters.formatIntegerWithCommas(totals[0] - previousTotals[0]), durationString(morphAdornerSettings, sourceStart)});
                previousTotals[0] = totals[0];
                previousTotals[1] = totals[1];
            }
        }
        if (standardizer instanceof UsesLogger) {
            ((UsesLogger) standardizer).setLogger(morphAdornerLogger.getLogger());
        }
        return standardizer;
    }

    /**
     * Formats the time elapsed since a start instant as whole seconds, rounded up.
     *
     * @param morphAdornerSettings settings used to look up the "second"/"seconds" text
     * @param j                    start time in milliseconds, as from {@link System#currentTimeMillis()}
     * @return the count (or {@code "< 1"} when it is below one), a space, the looked-up unit text, and a period
     */
    public static String durationString(MorphAdornerSettings morphAdornerSettings, long j) {
        // Adding 999 before dividing rounds any partial second up.
        long seconds = ((System.currentTimeMillis() - j) + 999) / 1000;
        String count = seconds < 1 ? "< 1" : Formatters.formatLongWithCommas(seconds);
        String unitKey = seconds > 1 ? "seconds" : "second";
        StringBuilder text = new StringBuilder();
        text.append(count);
        text.append(" ");
        text.append(morphAdornerSettings.getString(unitKey));
        text.append(".");
        return text.toString();
    }

    /**
     * Gives placeholder text to soft-tag nodes that have no text.
     *
     * <p>An empty {@code gap} receives {@code " \ue500 "}; an empty {@code pb}
     * is left alone; every other empty soft tag receives a single space.</p>
     *
     * @param xGOptions options deciding which node names are soft tags
     * @param document  document whose descendants are examined and modified in place
     */
    public static void fixEmptySoftTags(XGOptions xGOptions, Document document) {
        List<Node> descendants = DOMUtils.getDescendants(document);
        for (Node node : descendants) {
            String nodeName = node.getNodeName();
            if (!xGOptions.isSoftTag(nodeName) || DOMUtils.getText(node).length() != 0) {
                continue;
            }
            if (nodeName.equals("gap")) {
                DOMUtils.setText(node, " \ue500 ");
            } else if (!nodeName.equals("pb")) {
                DOMUtils.setText(node, " ");
            }
        }
    }

    /**
     * Removes marker characters from a spelling.
     *
     * <p>Vertical bars, braces and plus signs are deleted, except that a
     * spelling consisting solely of one such character is kept as is for that
     * character. Afterwards, a spelling that starts with an underscore and two
     * ASCII capitals loses the underscore and has the second capital lowered.</p>
     *
     * @param str the spelling to clean; must not be {@code null}
     * @return the cleaned spelling
     */
    public static String fixSpelling(String str) {
        String spelling = str;
        if (!spelling.equals(CharUtils.VERTICAL_BAR_STRING)) {
            spelling = StringUtils.replaceAll(spelling, CharUtils.VERTICAL_BAR_STRING, "");
        }
        if (!spelling.equals("{")) {
            spelling = StringUtils.replaceAll(spelling, "{", "");
        }
        if (!spelling.equals("}")) {
            spelling = StringUtils.replaceAll(spelling, "}", "");
        }
        if (!spelling.equals("+")) {
            spelling = StringUtils.replaceAll(spelling, "+", "");
        }
        if (spelling.length() > 1 && spelling.charAt(0) == '_') {
            // A fresh matcher per call: the shared static matcher is not safe
            // when several threads fix spellings at once.
            if (underlineCapCapPattern.matcher(spelling).find()) {
                // A match guarantees at least three characters.
                StringBuilder fixed = new StringBuilder(spelling.length() - 1);
                fixed.append(spelling.charAt(1));
                fixed.append(Character.toLowerCase(spelling.charAt(2)));
                fixed.append(spelling.substring(3));
                spelling = fixed.toString();
            }
        }
        return spelling;
    }

    /**
     * Prefixes selected superscript abbreviations with the superscript marker.
     *
     * <p>Only {@code hi} elements with {@code rend="superscript"} whose text
     * does not already start with {@code "^"} are considered. The marker
     * ({@code CharUtils.CHAR_SUP_TEXT_MARKER_STRING}) is prepended when the
     * previous sibling's text ends in {@code " y"} or {@code " w"} and the
     * superscript text is one of the abbreviations listed for that letter.
     * The comparison ignores case and does not depend on the default locale.</p>
     *
     * @param document document modified in place
     */
    public static void fixSupTags(Document document) {
        NodeList hiElements = document.getElementsByTagName("hi");
        for (int k = 0; k < hiElements.getLength(); k++) {
            Element element = (Element) hiElements.item(k);
            if (!"superscript".equals(element.getAttribute("rend"))) {
                continue;
            }
            String text = DOMUtils.getText(element);
            if (text.startsWith("^")) {
                continue;
            }
            Node previousSibling = element.getPreviousSibling();
            if (previousSibling == null) {
                continue;
            }
            String precedingText = previousSibling.getTextContent();
            if (precedingText == null) {
                // getTextContent() is null for some node types (e.g. a doctype).
                continue;
            }
            String[] abbreviations;
            if (precedingText.endsWith(" y")) {
                abbreviations = SUPERSCRIPTS_AFTER_Y;
            } else if (precedingText.endsWith(" w")) {
                abbreviations = SUPERSCRIPTS_AFTER_W;
            } else {
                continue;
            }
            if (equalsAnyIgnoreCase(text, abbreviations)) {
                DOMUtils.setText(element, CharUtils.CHAR_SUP_TEXT_MARKER_STRING + text);
            }
        }
    }

    /**
     * Computes the lemma for a spelling given its part of speech tag.
     *
     * <p>When the spelling can be lemmatized and its lemma word class is not
     * {@code PartOfSpeechTags.NONE}: a compound tag is first tried as a whole
     * with the "compound" class; otherwise, for a compound tag whose spelling
     * splits into several words, each word is lemmatized with the class of
     * the matching tag part and the results are joined with the adorner's
     * lemma separator (the result is empty when the number of tag parts and
     * words differ). Non-compound spellings are lemmatized with their lemma
     * word class, or, when that class is empty, as "compound" and then without
     * a class if that changed nothing.</p>
     *
     * <p>Otherwise, a number-tagged loose Roman numeral has one leading and
     * one trailing period removed; anything else is returned unchanged.</p>
     *
     * @param morphAdorner adorner supplying the tag set, lemmatizer, tokenizer and separator
     * @param str          the spelling
     * @param str2         the part of speech tag
     * @return the lemma
     */
    public static String getLemma(MorphAdorner morphAdorner, String str, String str2) {
        String lemma = str;
        String lemmaWordClass = morphAdorner.partOfSpeechTags.getLemmaWordClass(str2);
        if (!morphAdorner.lemmatizer.cantLemmatize(str) && !lemmaWordClass.equals(PartOfSpeechTags.NONE)) {
            boolean compoundTag = morphAdorner.partOfSpeechTags.isCompoundTag(str2);
            if (compoundTag) {
                String wholeLemma = morphAdorner.lemmatizer.lemmatize(str, "compound");
                if (morphAdorner.lemmatizer.isCompoundLemma(wholeLemma)) {
                    return wholeLemma;
                }
            }
            List<String> words = morphAdorner.spellingTokenizer.extractWords(str);
            if (compoundTag && words.size() != 1) {
                StringBuilder joined = new StringBuilder();
                String[] tagParts = morphAdorner.partOfSpeechTags.splitTag(str2);
                if (tagParts.length == words.size()) {
                    for (int k = 0; k < words.size(); k++) {
                        if (k > 0) {
                            joined.append(morphAdorner.lemmaSeparator);
                        }
                        joined.append(morphAdorner.lemmatizer.lemmatize(words.get(k), morphAdorner.partOfSpeechTags.getLemmaWordClass(tagParts[k])));
                    }
                }
                lemma = joined.toString();
            } else if (lemmaWordClass.length() == 0) {
                lemma = morphAdorner.lemmatizer.lemmatize(str, "compound");
                if (lemma.equals(str)) {
                    lemma = morphAdorner.lemmatizer.lemmatize(str);
                }
            } else {
                lemma = morphAdorner.lemmatizer.lemmatize(str, lemmaWordClass);
            }
        } else if (morphAdorner.partOfSpeechTags.isNumberTag(str2) && RomanNumeralUtils.isLooseRomanNumeral(lemma)) {
            lemma = stripEnclosingPeriods(lemma);
        }
        return lemma;
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Computes the standardized spelling for a spelling given its part of speech tag.
     *
     * <ul>
     *   <li>Proper nouns go through the adorner's name standardizer.</li>
     *   <li>Nouns with internal capitals and foreign words are returned unchanged.</li>
     *   <li>Number-tagged loose Roman numerals lose one leading and one
     *       trailing period; other numbers are returned unchanged.</li>
     *   <li>Everything else goes through the spelling standardizer with the
     *       tag's major word class; if the result differs from the input only
     *       in case, the input is returned.</li>
     * </ul>
     *
     * @param morphAdorner adorner supplying the tag set and standardizers
     * @param str          the corrected spelling to standardize
     * @param str2         not used by this method
     * @param str3         the part of speech tag
     * @return the standardized spelling
     */
    public static String getStandardizedSpelling(MorphAdorner morphAdorner, String str, String str2, String str3) {
        if (morphAdorner.partOfSpeechTags.isProperNounTag(str3)) {
            return morphAdorner.nameStandardizer.standardizeProperName(str);
        }
        if ((morphAdorner.partOfSpeechTags.isNounTag(str3) && CharUtils.hasInternalCaps(str)) || morphAdorner.partOfSpeechTags.isForeignWordTag(str3)) {
            return str;
        }
        if (morphAdorner.partOfSpeechTags.isNumberTag(str3)) {
            return RomanNumeralUtils.isLooseRomanNumeral(str) ? stripEnclosingPeriods(str) : str;
        }
        String standardized = morphAdorner.spellingStandardizer.standardizeSpelling(str, morphAdorner.partOfSpeechTags.getMajorWordClass(str3));
        return standardized.equalsIgnoreCase(str) ? str : standardized;
    }

    /**
     * Builds a keyword-in-context triple for one word of a list.
     *
     * <p>The per-side budget is {@code ((i2 - 4) - keywordLength) / 2}.
     * Preceding tokens are prepended, space separated, while the running
     * total of {@code tokenLength + 1} stays below the budget. Following
     * tokens are appended, each followed by a space, while the text built so
     * far is shorter than the budget, so the right context ends with a space
     * when it is not empty.</p>
     *
     * @param list the words
     * @param i    index of the keyword in {@code list}
     * @param i2   overall width used to derive the per-side budget
     * @return a three-element array: left context, keyword token, right context
     */
    public static String[] getKWIC(List<AdornedWord> list, int i, int i2) {
        String[] kwic = new String[3];
        AdornedWord keyword = list.get(i);
        int sideBudget = ((i2 - 4) - keyword.getToken().length()) / 2;

        StringBuilder context = new StringBuilder();
        int leftUsed = 0;
        for (int k = i - 1; leftUsed < sideBudget && k >= 0; k--) {
            AdornedWord word = list.get(k);
            if (context.length() > 0) {
                context.insert(0, " ");
            }
            context.insert(0, word.getToken());
            leftUsed += word.getToken().length() + 1;
        }
        kwic[0] = context.toString();
        kwic[1] = keyword.getToken();

        context.setLength(0);
        int wordTotal = list.size();
        for (int k = i + 1; context.length() < sideBudget && k < wordTotal; k++) {
            context.append(list.get(k).getToken());
            context.append(" ");
        }
        kwic[2] = context.toString();
        return kwic;
    }

    /**
     * Counts the words in a list of sentences.
     *
     * @param list sentences, each a list of words
     * @return the sum of the sizes of all inner lists
     */
    public static int getWordCount(List<List<String>> list) {
        int total = 0;
        for (List<String> sentence : list) {
            total += sentence.size();
        }
        return total;
    }

    /**
     * Counts sentences and words, ignoring trailing end-of-text-section markers.
     *
     * <p>For each sentence, trailing entries equal to
     * {@code CharUtils.CHAR_END_OF_TEXT_SECTION_STRING} are not counted as
     * words. A sentence counts only when at least one word remains. Empty
     * sentences contribute nothing.</p>
     *
     * @param list sentences, each a list of words
     * @return a two-element array: element 0 is the sentence count, element 1 the word count
     */
    public static int[] getWordAndSentenceCounts(List<List<String>> list) {
        int[] counts = {0, 0};
        for (List<String> sentence : list) {
            int wordCount = sentence.size();
            // Checking wordCount first keeps an empty sentence from indexing at -1.
            while (wordCount > 0 && sentence.get(wordCount - 1).equals(CharUtils.CHAR_END_OF_TEXT_SECTION_STRING)) {
                wordCount--;
            }
            if (wordCount > 0) {
                counts[0]++;
            }
            counts[1] += wordCount;
        }
        return counts;
    }

    /**
     * Reads a word list from a class-path resource and tags every entry with one tag.
     *
     * @param str                  resource name, resolved relative to this class
     * @param str2                 tag applied to every string
     * @param str3                 message key logged with the string count and load time
     * @param morphAdornerSettings settings used for the duration text
     * @param morphAdornerLogger   logger receiving the message
     * @return the tagged strings
     */
    public static TaggedStrings getWordList(String str, String str2, String str3, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) {
        long startTime = System.currentTimeMillis();
        String[] words = new TextFile(MorphAdornerUtils.class.getResourceAsStream(str), "utf-8").toArray();
        SingleTagTaggedStrings taggedWords = new SingleTagTaggedStrings(words, str2);
        morphAdornerLogger.println(str3, new Object[]{Formatters.formatIntegerWithCommas(taggedWords.getStringCount()), durationString(morphAdornerSettings, startTime)});
        return taggedWords;
    }

    /**
     * Loads an extra word list from a class-path resource, best effort.
     *
     * <p>Load failures are deliberately ignored: the properties are returned
     * with whatever was loaded (possibly nothing). The message is logged only
     * when at least one entry was loaded. The resource stream is always
     * closed.</p>
     *
     * @param str                  resource name, resolved relative to this class
     * @param str2                 second argument passed to {@code UTF8Properties.load}
     * @param str3                 message key logged with the string count and load time
     * @param morphAdornerSettings settings used for the duration text
     * @param morphAdornerLogger   logger receiving the message
     * @return the loaded properties; {@code null} only if they could not even be constructed
     */
    public static TaggedStrings getExtraWordsList(String str, String str2, String str3, MorphAdornerSettings morphAdornerSettings, MorphAdornerLogger morphAdornerLogger) {
        long startTime = System.currentTimeMillis();
        UTF8Properties extraWords = null;
        InputStream resource = null;
        try {
            extraWords = new UTF8Properties();
            resource = MorphAdornerUtils.class.getResourceAsStream(str);
            extraWords.load(resource, str2);
        } catch (Exception ignored) {
            // Best effort: a missing or unreadable resource yields an empty list.
        } finally {
            closeQuietly(resource);
        }
        if (extraWords != null && extraWords.size() > 0) {
            morphAdornerLogger.println(str3, new Object[]{Formatters.formatIntegerWithCommas(extraWords.getStringCount()), durationString(morphAdornerSettings, startTime)});
        }
        return extraWords;
    }

    /**
     * Checks whether a file looks adorned by scanning its first lines.
     *
     * <p>At most {@code i} lines are examined. The file counts as adorned as
     * soon as a line containing {@code "<w "} yields a non-empty third match
     * group from the word-id pattern. Any I/O or parsing problem results in
     * {@code false}. The file is always closed.</p>
     *
     * @param str path of the file to check
     * @param i   maximum number of lines to examine
     * @return {@code true} if an adorned word was found, {@code false} otherwise
     */
    public static boolean isAdorned(String str, int i) {
        boolean adorned = false;
        FileInputStream fileStream = null;
        BufferedReader reader = null;
        try {
            fileStream = new FileInputStream(str);
            reader = new BufferedReader(new UnicodeReader(fileStream, "utf-8"));
            int linesRead = 0;
            for (String line = reader.readLine(); line != null && linesRead < i; line = reader.readLine()) {
                linesRead++;
                if (line.indexOf("<w ") >= 0 && hasWordId(line)) {
                    adorned = true;
                    break;
                }
            }
        } catch (Exception ignored) {
            // An unreadable file is reported as not adorned.
        } finally {
            closeQuietly(reader);
            // Also covers the case where wrapping the stream failed.
            closeQuietly(fileStream);
        }
        return adorned;
    }

    /**
     * Logs the JVM's free and total memory.
     *
     * @param morphAdornerLogger logger receiving the "Memory_used" message
     * @param str                label passed as the first message argument
     */
    public static void logMemoryUsage(MorphAdornerLogger morphAdornerLogger, String str) {
        String freeMemory = Formatters.formatLongWithCommas(runTime.freeMemory());
        String totalMemory = Formatters.formatLongWithCommas(runTime.totalMemory());
        morphAdornerLogger.println("Memory_used", new Object[]{str, freeMemory, totalMemory});
    }

    /** Tells whether a line's word element carries a non-empty id match group. */
    private static boolean hasWordId(String line) {
        try {
            String[] wordGroups = WordAttributePatterns.wReplacer.matchGroups(line);
            if (wordGroups == null || wordGroups.length <= 2) {
                return false;
            }
            String[] idGroups = WordAttributePatterns.idReplacer.matchGroups(wordGroups[2]);
            return idGroups != null && idGroups.length > 2 && idGroups[2] != null && idGroups[2].length() > 0;
        } catch (Exception ignored) {
            // A line the patterns cannot handle is treated as having no id.
            return false;
        }
    }

    /** Removes one leading and one trailing period, if present. */
    private static String stripEnclosingPeriods(String numeral) {
        String stripped = numeral;
        if (stripped.length() > 0 && stripped.charAt(0) == '.') {
            stripped = stripped.substring(1);
        }
        if (stripped.length() > 0 && stripped.charAt(stripped.length() - 1) == '.') {
            stripped = stripped.substring(0, stripped.length() - 1);
        }
        return stripped;
    }

    /** Tells whether a value equals any candidate, ignoring case. */
    private static boolean equalsAnyIgnoreCase(String value, String[] candidates) {
        for (String candidate : candidates) {
            if (candidate.equalsIgnoreCase(value)) {
                return true;
            }
        }
        return false;
    }

    /** Closes a resource, ignoring {@code null} and close failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close here.
        }
    }

    /** Not meant to be instantiated directly; all members are static. */
    protected MorphAdornerUtils() {
    }
}
