package edu.northwestern.at.morphadorner.tools.taggertrainer.ngram;

import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.BaseLexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.transitionmatrix.TransitionMatrix;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.UnicodeReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

/* loaded from: input_file:edu/northwestern/at/morphadorner/tools/taggertrainer/ngram/NGramTaggerTrainer.class */
/**
 * Command-line tool that reads tagged training data and writes tag n-gram
 * counts to a transition matrix file.
 *
 * <p>Usage:
 * {@code java edu.northwestern.at.morphadorner.tools.taggertrainer.ngram.NGramTaggerTrainer
 * trainingdata wordlexicon outputtransitionmatrix}</p>
 *
 * <p>Each non-blank training line is split on {@link #sepChars}; the first
 * token is taken as the word and the second as its part-of-speech tag.</p>
 */
public class NGramTaggerTrainer {
    /** Word lexicon loaded from the file named by the second program argument. */
    protected static Lexicon wordLexicon;

    /** Name of the training data file (first program argument). */
    protected static String trainingDataFileName;

    /** Number of non-blank training lines processed so far. */
    protected static int trainingDataCount = 0;

    /** Accumulates the tag counts gathered from the training data. */
    protected static TransitionMatrix transitionMatrix = new TransitionMatrix();

    /** Name of the output transition matrix file (third program argument). */
    protected static String transitionMatrixFileName = null;

    /** Delimiter characters separating the word from its tag on a training line. */
    protected static String sepChars = "\t";

    /**
     * Reads the program arguments and loads the word lexicon.
     *
     * <p>Prints the usage message and exits when fewer than three arguments
     * are supplied.</p>
     *
     * @param strArr program arguments: training data file, word lexicon
     *               file, output transition matrix file
     * @throws IOException if the lexicon cannot be loaded
     */
    protected static void getProgramParameters(String[] strArr) throws IOException {
        if (strArr.length < 3) {
            // help() terminates the JVM, so execution does not continue past here.
            help();
        }
        trainingDataFileName = strArr[0];
        File lexiconFile = new File(strArr[1]);
        wordLexicon = new BaseLexicon();
        wordLexicon.loadLexicon(lexiconFile.toURI().toURL(), "utf-8");
        transitionMatrixFileName = strArr[2];
    }

    /**
     * Reads the training data file and accumulates tag counts in
     * {@link #transitionMatrix}.
     *
     * <p>For every non-blank line the current tag is counted on its own,
     * together with the previous tag, and together with the previous two
     * tags (presumably unigram, bigram and trigram counts; confirm against
     * {@code TransitionMatrix.incrementCount}). The two "previous" tags
     * start out as {@code "."}.</p>
     *
     * <p>A line with no tag field uses the word itself as the tag when the
     * word is punctuation or a symbol. Any other line lacking a tag is
     * reported and still counted with an empty tag.</p>
     *
     * @throws IOException if the training data cannot be opened or read
     */
    protected static void loadTrainingData() throws IOException {
        long startTime = System.currentTimeMillis();
        trainingDataCount = 0;
        String previousTag = ".";
        String tagBeforePrevious = ".";

        // try-with-resources guarantees the reader (and the underlying file
        // stream) is closed on both normal completion and failure.
        try (BufferedReader reader = new BufferedReader(
                new UnicodeReader(new FileInputStream(trainingDataFileName), "utf-8"))) {
            String rawLine;
            while ((rawLine = reader.readLine()) != null) {
                String line = rawLine.trim();
                if (line.length() == 0) {
                    continue;
                }

                StringTokenizer tokenizer = new StringTokenizer(line, sepChars);
                String word = tokenizer.hasMoreTokens() ? tokenizer.nextToken().trim() : "";
                String posTag = "";
                if (tokenizer.hasMoreTokens()) {
                    posTag = tokenizer.nextToken().trim();
                } else if (CharUtils.isPunctuationOrSymbol(word)) {
                    // Punctuation may appear without a tag; it tags itself.
                    posTag = word;
                } else {
                    System.err.println("Missing part-of-speech tag in training data line.");
                    System.out.println("line=" + line);
                }

                transitionMatrix.incrementCount(posTag, 1);
                transitionMatrix.incrementCount(previousTag, posTag, 1);
                transitionMatrix.incrementCount(tagBeforePrevious, previousTag, posTag, 1);
                trainingDataCount++;

                tagBeforePrevious = previousTag;
                previousTag = posTag;
            }
        }

        // Adding 999 before dividing rounds the elapsed time up to whole seconds.
        long elapsedSeconds = ((System.currentTimeMillis() - startTime) + 999) / 1000;
        System.out.println("Training data loaded in " + elapsedSeconds + " seconds.");
    }

    /**
     * Program entry point: reads the arguments, loads the training data and
     * saves the resulting transition matrix. Any exception is reported by
     * printing its stack trace.
     *
     * @param strArr program arguments, see {@link #getProgramParameters(String[])}
     */
    public static void main(String[] strArr) {
        try {
            getProgramParameters(strArr);
            loadTrainingData();
            transitionMatrix.saveTransitionMatrix(transitionMatrixFileName, "utf-8", '\t');
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a string to a URL, treating it as a file path when it is not
     * a well-formed URL.
     *
     * @param str URL text or file path
     * @return the URL, or {@code null} if neither interpretation succeeds
     */
    protected static URL getURL(String str) {
        try {
            return new URL(str);
        } catch (MalformedURLException notAUrl) {
            // Not a URL: fall back to interpreting the text as a file path.
            try {
                return new File(str).toURI().toURL();
            } catch (Exception ignored) {
                return null;
            }
        }
    }

    /**
     * Prints the command-line usage message and terminates the JVM with
     * exit status 1.
     */
    protected static void help() {
        System.out.println("java edu.northwestern.at.morphadorner.tools.taggertrainer.ngram.NGramTaggerTrainer trainingdata wordlexicon outputtransitionmatrix");
        System.exit(1);
    }
}
