package edu.northwestern.at.morphadorner.tools.punktabbreviationdetector;

import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.ICU4JBreakIteratorWordTokenizer;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.FileNameUtils;
import edu.northwestern.at.utils.FileUtils;
import edu.northwestern.at.utils.SetUtils;
import edu.northwestern.at.utils.StringUtils;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;

/* loaded from: input_file:edu/northwestern/at/morphadorner/tools/punktabbreviationdetector/PunktAbbreviationDetector.class */
/**
 * Command-line tool that tokenizes a set of UTF-8 text files, feeds every
 * token to a {@link PunktTokenCounter}, and writes the abbreviations the
 * counter reports to an output file.
 *
 * <p>Command-line arguments, in order:
 * <ol>
 *   <li>language code used to build the tokenizer's {@link Locale};</li>
 *   <li>name of the output file that receives the sorted abbreviations;</li>
 *   <li>zero or more input file names, which may contain wildcards.</li>
 * </ol>
 */
public class PunktAbbreviationDetector {
    /** Number of leading arguments (language code, output file) that precede the input file names. */
    protected static final int INITPARAMS = 2;

    /** UTF-8 print stream over standard output; assigned at the start of {@link #main}. */
    protected static PrintStream printStream;

    /**
     * Runs the abbreviation detector.
     *
     * @param strArr command-line arguments: language code, output file name,
     *               then the input file names/wildcards
     * @throws IOException if the UTF-8 output stream cannot be created or a
     *                     file operation fails
     */
    public static void main(String[] strArr) throws IOException {
        long startTime = System.currentTimeMillis();
        printStream = new PrintStream((OutputStream) new BufferedOutputStream(System.out), true, "utf-8");

        // Without this guard, missing arguments surface as a bare
        // ArrayIndexOutOfBoundsException with no hint about correct usage.
        if (strArr.length < INITPARAMS) {
            printStream.println(
                "Usage: PunktAbbreviationDetector languagecode outputfile inputfile [inputfile ...]");
            return;
        }

        String languageCode = strArr[0];
        printStream.println("Language code: " + languageCode);
        Locale locale = languageCodeToLocale(languageCode);
        printStream.println("Language: " + locale.getDisplayLanguage());

        // Whitespace tokens are kept (merged into runs) and periods are left
        // attached to their words, so each token reaches the counter intact.
        ICU4JBreakIteratorWordTokenizer tokenizer = new ICU4JBreakIteratorWordTokenizer(locale);
        tokenizer.setStoreWhitespaceTokens(true);
        tokenizer.setMergeWhitespaceTokens(true);
        tokenizer.setSplitAroundPeriods(false);

        String outputFileName = strArr[1];

        // NOTE(review): the meaning of the 0.3 and false constructor arguments
        // is defined by PunktTokenCounter and is not visible here -- confirm.
        PunktTokenCounter punktTokenCounter = new PunktTokenCounter(0.3d, false);

        // Everything after the first INITPARAMS arguments is an input file spec.
        String[] inputFileSpecs = new String[strArr.length - INITPARAMS];
        System.arraycopy(strArr, INITPARAMS, inputFileSpecs, 0, inputFileSpecs.length);

        String[] inputFileNames = FileNameUtils.expandFileNameWildcards(inputFileSpecs);
        printStream.println("There are " + StringUtils.formatNumberWithCommas(inputFileNames.length) + " files to process.");

        long tokenCount = 0;
        for (String inputFileName : inputFileNames) {
            List<String> words = tokenizer.extractWords(FileUtils.readTextFile(inputFileName, "utf-8"));
            for (String word : words) {
                punktTokenCounter.count(makePunktToken(word));
                tokenCount++;
            }
        }

        // Adding 999 before dividing rounds the elapsed time up to whole seconds.
        long elapsedSeconds = ((System.currentTimeMillis() - startTime) + 999) / 1000;
        printStream.println("\nProcessing completed in " + StringUtils.formatNumberWithCommas(elapsedSeconds) + " seconds.");
        printStream.println("\n" + StringUtils.formatNumberWithCommas(tokenCount) + " tokens extracted.");
        printStream.println();
        printStream.println("There were " + StringUtils.formatNumberWithCommas(punktTokenCounter.getCandidates().size()) + " candidates.");
        printStream.println("There are " + StringUtils.formatNumberWithCommas(punktTokenCounter.getAbbreviations().size()) + " abbreviations.");

        // A TreeSet puts the abbreviations in sorted order before saving. It is
        // left raw because the element type of getAbbreviations() is not
        // visible from this file.
        SetUtils.saveSet(new TreeSet(punktTokenCounter.getAbbreviations()), outputFileName, "utf-8");
    }

    /**
     * Builds a {@link Locale} from a language code.
     *
     * @param str the language code passed to the {@code Locale} constructor
     * @return a locale for the given language code
     */
    public static Locale languageCodeToLocale(String str) {
        return new Locale(str);
    }

    /**
     * Wraps a token string in a {@link PunktToken} with a type chosen from its
     * content: whitespace if the first character is whitespace, number if the
     * first character is a digit, non-word if
     * {@code CharUtils.isPunctuationOrSymbol} accepts the whole string, and
     * word otherwise. The checks are applied in that order.
     *
     * @param str the token text; must be non-empty because the first character
     *            is inspected
     * @return the classified token
     * @throws StringIndexOutOfBoundsException if {@code str} is empty
     */
    public static PunktToken makePunktToken(String str) {
        char firstChar = str.charAt(0);
        if (Character.isWhitespace(firstChar)) {
            return new PunktToken(str, PunktTokenType.WHITESPACE);
        }
        if (Character.isDigit(firstChar)) {
            return new PunktToken(str, PunktTokenType.NUMBER);
        }
        if (CharUtils.isPunctuationOrSymbol(str)) {
            return new PunktToken(str, PunktTokenType.NONWORD);
        }
        return new PunktToken(str, PunktTokenType.WORD);
    }

    /** Not instantiable from outside; all functionality is static. */
    protected PunktAbbreviationDetector() {
    }
}
