package edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import edu.northwestern.at.morphadorner.corpuslinguistics.abbreviations.Abbreviations;
import edu.northwestern.at.utils.FileUtils;
import edu.northwestern.at.utils.ListFactory;
import edu.northwestern.at.utils.StringUtils;
import edu.northwestern.at.utils.UnicodeReader;
import java.util.List;
import java.util.Locale;

/* loaded from: input_file:edu/northwestern/at/morphadorner/corpuslinguistics/tokenizer/ICU4JBreakIteratorWordTokenizer.class */
/**
 * Word tokenizer driven by an ICU4J {@link BreakIterator}.
 *
 * <p>On construction the tokenizer tries to build a {@link RuleBasedBreakIterator}
 * from the rules resource named by {@link #wordBreakRulesFileName}, substituting
 * the {@code %abbreviations%} placeholder with the pattern returned by
 * {@code Abbreviations.createAbbreviationsPattern(null)}. If that fails for any
 * reason, the standard ICU word iterator for {@link #locale} is used instead.
 *
 * <p>Whitespace segments are dropped unless {@link #storeWhitespaceTokens} is set;
 * when {@link #mergeWhitespaceTokens} is also set, adjacent whitespace segments
 * are concatenated into one token.
 */
public class ICU4JBreakIteratorWordTokenizer extends AbstractWordTokenizer implements WordTokenizer, CanTokenizeWhitespace, CanSplitAroundPeriods {
    /** Default class-relative resource holding the custom word break rules. */
    private static final String DEFAULT_WORD_BREAK_RULES_FILE_NAME = "resources/wordbreakrules.txt";

    /** Placeholder in the rules text that is replaced by the abbreviations pattern. */
    private static final String ABBREVIATIONS_PLACEHOLDER = "%abbreviations%";

    /** Locale used for the fallback ICU word iterator. */
    protected Locale locale = Locale.US;

    /** Whether whitespace segments are emitted as tokens. */
    protected boolean storeWhitespaceTokens = false;

    /** Whether adjacent whitespace tokens are merged (only relevant when whitespace is stored). */
    protected boolean mergeWhitespaceTokens = false;

    /** Whether each non-whitespace token is further passed through {@code splitToken}. */
    protected boolean splitAroundPeriods = true;

    /** Break iterator that segments the text; set by {@link #createWordIterator()}. */
    protected BreakIterator wordIterator = null;

    /** Resource name (resolved relative to this class) of the word break rules. */
    protected String wordBreakRulesFileName = DEFAULT_WORD_BREAK_RULES_FILE_NAME;

    /**
     * Creates a tokenizer for {@link Locale#US}.
     */
    public ICU4JBreakIteratorWordTokenizer() {
        this(Locale.US);
    }

    /**
     * Creates a tokenizer for the given locale.
     *
     * @param locale locale used when the fallback ICU word iterator is needed
     */
    public ICU4JBreakIteratorWordTokenizer(Locale locale) {
        this.locale = locale;
        // NOTE(review): overridable method invoked from a constructor; kept because
        // subclasses may rely on the iterator being built here.
        createWordIterator();
    }

    /**
     * @return {@code true} if whitespace segments are emitted as tokens
     */
    @Override
    public boolean getStoreWhitespaceTokens() {
        return this.storeWhitespaceTokens;
    }

    /**
     * @param z {@code true} to emit whitespace segments as tokens
     */
    @Override
    public void setStoreWhitespaceTokens(boolean z) {
        this.storeWhitespaceTokens = z;
    }

    /**
     * @return {@code true} if adjacent whitespace tokens are merged into one
     */
    @Override
    public boolean getMergeWhitespaceTokens() {
        return this.mergeWhitespaceTokens;
    }

    /**
     * @param z {@code true} to merge adjacent whitespace tokens into one
     */
    @Override
    public void setMergeWhitespaceTokens(boolean z) {
        this.mergeWhitespaceTokens = z;
    }

    /**
     * @return {@code true} if tokens are additionally split via {@code splitToken}
     */
    @Override
    public boolean getSplitAroundPeriods() {
        return this.splitAroundPeriods;
    }

    /**
     * @param z {@code true} to additionally split tokens via {@code splitToken}
     */
    @Override
    public void setSplitAroundPeriods(boolean z) {
        this.splitAroundPeriods = z;
    }

    /**
     * Installs a no-op pretokenizer and builds {@link #wordIterator}.
     *
     * <p>The custom rules resource is read as UTF-8 and compiled into a
     * {@link RuleBasedBreakIterator}. A missing resource, a read error, or a
     * rule compilation error all lead to the locale's standard word iterator
     * being used instead; the failure itself is deliberately not reported,
     * since the fallback is the intended best-effort behavior.
     */
    protected void createWordIterator() {
        this.preTokenizer = new NoopPreTokenizer();
        String abbreviationsPattern = Abbreviations.createAbbreviationsPattern(null);

        BreakIterator customIterator = null;
        java.io.InputStream rulesStream = null;
        UnicodeReader rulesReader = null;
        try {
            rulesStream = ICU4JBreakIteratorWordTokenizer.class.getResourceAsStream(this.wordBreakRulesFileName);
            // getResourceAsStream returns null when the resource does not exist.
            if (rulesStream != null) {
                rulesReader = new UnicodeReader(rulesStream, "utf-8");
                String rulesText = FileUtils.readTextFile(rulesReader);
                customIterator = new RuleBasedBreakIterator(
                        StringUtils.replaceAll(rulesText, ABBREVIATIONS_PLACEHOLDER, abbreviationsPattern));
            }
        } catch (Exception ignored) {
            // Any failure to load or compile the custom rules selects the fallback below.
            customIterator = null;
        } finally {
            // Close exactly once; a close failure must not discard rules already loaded.
            try {
                if (rulesReader != null) {
                    // presumably also closes the wrapped stream — TODO confirm in UnicodeReader
                    rulesReader.close();
                } else if (rulesStream != null) {
                    // Reader construction failed, so the raw stream is still ours to close.
                    rulesStream.close();
                }
            } catch (Exception ignored) {
                // Nothing useful can be done about a failed close here.
            }
        }

        this.wordIterator = customIterator != null ? customIterator : BreakIterator.getWordInstance(this.locale);
    }

    /**
     * Splits text into tokens using the configured break iterator.
     *
     * <p>The text is first run through the pretokenizer, then cut at every
     * boundary reported by {@link #wordIterator}. Segments starting with a
     * whitespace character are treated as whitespace tokens; all others are
     * treated as word tokens.
     *
     * @param str text to tokenize
     * @return list of tokens in order of appearance
     */
    @Override
    public List<String> extractWords(String str) {
        List<String> tokens = ListFactory.createNewList();
        String text = this.preTokenizer.pretokenize(str);
        this.wordIterator.setText(text);

        int start = this.wordIterator.first();
        for (int end = this.wordIterator.next(); end != BreakIterator.DONE; end = this.wordIterator.next()) {
            String segment = text.substring(start, end);
            if (Character.isWhitespace(segment.charAt(0))) {
                if (this.storeWhitespaceTokens) {
                    storeWhitespaceSegment(tokens, segment);
                }
            } else {
                storeWordSegment(tokens, segment);
            }
            start = end;
        }
        return tokens;
    }

    /** Preprocesses a non-whitespace segment and appends the resulting token(s), skipping empty ones. */
    private void storeWordSegment(List<String> tokens, String segment) {
        String token = preprocessToken(segment, tokens);
        if (token.length() == 0) {
            return;
        }
        if (!this.splitAroundPeriods) {
            addWordToSentence(tokens, token);
            return;
        }
        for (String piece : splitToken(token)) {
            if (piece.length() > 0) {
                addWordToSentence(tokens, piece);
            }
        }
    }

    /** Appends a whitespace segment, or merges it into a preceding whitespace token when merging is on. */
    private void storeWhitespaceSegment(List<String> tokens, String whitespace) {
        // Merge whenever ANY previous token exists; the former "size() > 1" test
        // skipped the merge when the only previous token was itself whitespace.
        if (this.mergeWhitespaceTokens && !tokens.isEmpty()) {
            int lastIndex = tokens.size() - 1;
            String previous = tokens.get(lastIndex);
            if (previous != null && previous.length() > 0 && Character.isWhitespace(previous.charAt(0))) {
                tokens.set(lastIndex, previous + whitespace);
                return;
            }
        }
        addWordToSentence(tokens, whitespace);
    }
}
