package edu.northwestern.at.morphadorner.gate;

import edu.northwestern.at.morphadorner.corpuslinguistics.adornedword.AdornedWord;
import edu.northwestern.at.morphadorner.corpuslinguistics.lemmatizer.DefaultLemmatizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.lemmatizer.Lemmatizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.DefaultPartOfSpeechTagger;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.PartOfSpeechTagger;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingstandardizer.ExtendedSimpleSpellingStandardizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.spellingstandardizer.SpellingStandardizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.PennTreebankTokenizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.WordTokenizer;
import edu.northwestern.at.utils.CharUtils;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Resource;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.GateRuntimeException;
import gate.util.OffsetComparator;
import java.io.File;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;

/* loaded from: input_file:edu/northwestern/at/morphadorner/gate/PosTaggerGateWrapper.class */
/**
 * GATE wrapper that runs a MorphAdorner part-of-speech tagger over the
 * "Sentence" and "Token" annotations of a document and stores, on each token,
 * its part of speech ("category"), lemma, original spelling and standardized
 * spelling.
 */
public class PosTaggerGateWrapper extends MorphAdornerGateWrapperBase {
    /** Tagger used to assign parts of speech to the words of a sentence. */
    protected PartOfSpeechTagger partOfSpeechTagger = null;
    /** Tag set obtained from the word lexicon during {@link #init()}. */
    protected PartOfSpeechTags partOfSpeechTags = null;
    /** Lemmatizer used when the word lexicon has no lemma for a word. */
    protected Lemmatizer lemmatizer = null;
    /** Spelling standardizer loaded from the configured spelling files. */
    protected SpellingStandardizer standardizer = null;
    /** Tokenizer used to split a spelling into words when lemmatizing compounds. */
    protected WordTokenizer spellingTokenizer = new PennTreebankTokenizer();

    /**
     * Creates the tagger, lemmatizer and spelling standardizer, loads the
     * standard and alternate spelling lists, and fetches the tag set from the
     * word lexicon before delegating to the superclass initializer.
     *
     * @return the resource returned by {@code super.init()}
     * @throws ResourceInstantiationException if any step fails; the original
     *         exception is attached as the cause
     */
    @Override // edu.northwestern.at.morphadorner.gate.MorphAdornerGateWrapperBase
    public Resource init() throws ResourceInstantiationException {
        commonInit();
        try {
            this.partOfSpeechTagger = new DefaultPartOfSpeechTagger();
            this.lemmatizer = new DefaultLemmatizer();
            this.standardizer = new ExtendedSimpleSpellingStandardizer();
            this.standardizer.loadStandardSpellings(new File(this.spellingsURL).toURI().toURL(), "utf-8");
            this.standardizer.loadAlternativeSpellings(new File(this.alternateSpellingsURL).toURI().toURL(), "utf-8", "\t");
            this.partOfSpeechTags = this.wordLexicon.getPartOfSpeechTags();
            return super.init();
        } catch (Exception e) {
            // Keep the same message as before, but preserve the cause so the
            // real failure (missing file, bad URL, ...) stays diagnosable.
            throw new ResourceInstantiationException(e.getMessage(), e);
        }
    }

    /**
     * Tags every sentence of the current document and writes the adornments
     * onto the tokens that fall inside each sentence.
     *
     * <p>Tokens and sentences are both sorted by offset; tokens are assigned
     * to the first sentence whose end offset is not before the token's end
     * offset.
     *
     * @throws ExecutionException if the input annotation set yields no
     *         "Token" or no "Sentence" annotation set
     * @throws GateRuntimeException if there is no document or no base
     *         sentence annotation type has been configured
     */
    @Override // edu.northwestern.at.morphadorner.gate.MorphAdornerGateWrapperBase
    public void execute() throws ExecutionException {
        if (this.document == null) {
            throw new GateRuntimeException("There is no document to process.");
        }
        if (this.inputASName != null && this.inputASName.length() == 0) {
            this.inputASName = null;
        }
        if (this.baseSentenceAnnotationType == null || this.baseSentenceAnnotationType.trim().length() == 0) {
            throw new GateRuntimeException("No base Sentence Annotation Type provided.");
        }
        AnnotationSet inputAS = this.inputASName == null
                ? this.document.getAnnotations()
                : this.document.getAnnotations(this.inputASName);
        if (this.outputASName != null && this.outputASName.length() == 0) {
            this.outputASName = null;
        }
        // NOTE(review): the output set is looked up but never written to; the
        // adornments go onto the input tokens. The lookup is retained in case
        // obtaining the set has side effects - confirm before removing.
        AnnotationSet outputAS = this.outputASName == null
                ? this.document.getAnnotations()
                : this.document.getAnnotations(this.outputASName);

        // NOTE(review): the annotation types are hard-coded even though
        // baseSentenceAnnotationType is validated above - confirm intent.
        AnnotationSet tokenSet = inputAS == null ? null : inputAS.get("Token");
        if (tokenSet == null) {
            throw new ExecutionException("You need to run a Tokenizer first!");
        }
        this.document.getFeatures().put("Number of tokens", String.valueOf(tokenSet.size()));

        AnnotationSet sentenceSet = inputAS.get("Sentence");
        if (sentenceSet == null) {
            throw new ExecutionException("You need to run a Sentence Splitter first.");
        }
        this.document.getFeatures().put("Number of sentences", String.valueOf(sentenceSet.size()));

        List<Annotation> sentences = sortedByOffset(sentenceSet);
        List<Annotation> tokens = sortedByOffset(tokenSet);

        Iterator<Annotation> tokenIterator = tokens.iterator();
        // An empty token set is legal (e.g. an empty document); do not call
        // next() unconditionally.
        Annotation currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;

        List<Annotation> sentenceTokens = new ArrayList<Annotation>();
        List<String> sentenceWords = new ArrayList<String>();

        fireStatusChanged("Adorning " + sentenceSet.size() + " sentences in " + this.document.getName());
        fireProgressChanged(0);
        long startTime = System.currentTimeMillis();

        for (Annotation sentence : sentences) {
            sentenceTokens.clear();
            sentenceWords.clear();
            Long sentenceEnd = sentence.getEndNode().getOffset();
            // Consume every remaining token that ends at or before the end of
            // this sentence.
            while (currentToken != null && currentToken.getEndNode().getOffset().compareTo(sentenceEnd) <= 0) {
                sentenceTokens.add(currentToken);
                sentenceWords.add((String) currentToken.getFeatures().get("string"));
                currentToken = tokenIterator.hasNext() ? tokenIterator.next() : null;
            }
            if (sentenceTokens.isEmpty()) {
                // Nothing to tag in this sentence.
                continue;
            }
            Iterator<Annotation> sentenceTokenIterator = sentenceTokens.iterator();
            for (AdornedWord adornedWord : this.partOfSpeechTagger.tagSentence(sentenceWords)) {
                adornToken(sentenceTokenIterator.next(), adornedWord);
            }
        }

        // Signal completion once, after all sentences have been processed.
        fireProcessFinished();
        fireStatusChanged(this.document.getName() + " adorned in "
                + NumberFormat.getInstance().format((System.currentTimeMillis() - startTime) / 1000.0d)
                + " seconds.");
        fireStatusChanged("Adornment complete.");
        fireProgressChanged(0);
    }

    /** Returns the annotations of {@code annotationSet} as a list sorted with {@link OffsetComparator}. */
    @SuppressWarnings("unchecked")
    private static List<Annotation> sortedByOffset(AnnotationSet annotationSet) {
        List<Annotation> sorted = new ArrayList<Annotation>((Collection<Annotation>) annotationSet);
        Collections.sort(sorted, new OffsetComparator());
        return sorted;
    }

    /** Stores part of speech, lemma, spelling and standardized spelling of {@code adornedWord} on {@code token}. */
    private void adornToken(Annotation token, AdornedWord adornedWord) {
        String partOfSpeech = adornedWord.getPartsOfSpeech();
        String spelling = adornedWord.getSpelling();
        String standardSpelling = getStandardizedSpelling(spelling, partOfSpeech);
        String lemma = this.wordLexicon.getLemma(spelling, partOfSpeech);
        // "*" marks a lemma the lexicon does not know; a missing (null) lemma
        // is treated the same way and handed to the lemmatizer.
        if (lemma == null || lemma.equals("*")) {
            lemma = getLemma(this.lemmatizer, standardSpelling, partOfSpeech);
        }
        token.getFeatures().put("category", partOfSpeech);
        token.getFeatures().put(MorphAdornerGateWrapperBase.TOKEN_LEMMA_FEATURE_NAME, lemma);
        token.getFeatures().put(MorphAdornerGateWrapperBase.TOKEN_SPELLING_FEATURE_NAME, spelling);
        token.getFeatures().put(MorphAdornerGateWrapperBase.TOKEN_STANDARD_SPELLING_FEATURE_NAME, standardSpelling);
    }

    /**
     * Computes the lemma of a spelling with the given lemmatizer.
     *
     * <p>For a compound tag whose spelling splits into several words, each
     * word is lemmatized with its own tag part and the results are joined
     * with {@code lemmaSeparator}. If the number of tag parts and words
     * differ, or the lemmatizer yields "*", the spelling itself is used.
     * The result is lower-cased unless it contains the separator or the tag
     * is a proper noun tag.
     *
     * @param lemmatizer the lemmatizer to use
     * @param str the (standardized) spelling to lemmatize
     * @param str2 the part of speech tag of the spelling
     * @return the lemma, never the "*" marker
     */
    protected String getLemma(Lemmatizer lemmatizer, String str, String str2) {
        String lemma = str;
        String lemmaWordClass = this.partOfSpeechTags.getLemmaWordClass(str2);
        if (!lemmatizer.cantLemmatize(str) && !lemmaWordClass.equals(PartOfSpeechTags.NONE)) {
            List<String> words = this.spellingTokenizer.extractWords(str);
            if (this.partOfSpeechTags.isCompoundTag(str2) && words.size() != 1) {
                String[] tagParts = this.partOfSpeechTags.splitTag(str2);
                // Only lemmatize part by part when tags and words line up;
                // otherwise keep the spelling rather than an empty lemma.
                if (tagParts.length == words.size()) {
                    StringBuilder joined = new StringBuilder();
                    for (int i = 0; i < words.size(); i++) {
                        if (i > 0) {
                            joined.append(lemmaSeparator);
                        }
                        joined.append(lemmatizer.lemmatize(words.get(i), this.partOfSpeechTags.getLemmaWordClass(tagParts[i])));
                    }
                    lemma = joined.toString();
                }
            } else if (lemmaWordClass.length() == 0) {
                // No word class: try the "compound" rules first, then the
                // class-less lemmatization if that changed nothing.
                lemma = lemmatizer.lemmatize(str, "compound");
                if (lemma.equals(str)) {
                    lemma = lemmatizer.lemmatize(str);
                }
            } else {
                lemma = lemmatizer.lemmatize(str, lemmaWordClass);
            }
        }
        if (lemma.equals("*")) {
            lemma = str;
        }
        if (lemma.indexOf(lemmaSeparator) < 0 && !this.partOfSpeechTags.isProperNounTag(str2)) {
            // Locale-independent casing so results do not vary with the
            // JVM's default locale.
            lemma = lemma.toLowerCase(Locale.ROOT);
        }
        return lemma;
    }

    /**
     * Returns the standardized form of a spelling, or the spelling unchanged
     * when its tag is a proper noun, foreign word or number tag, or when it is
     * a noun with internal capital letters.
     *
     * @param str the spelling to standardize
     * @param str2 the part of speech tag of the spelling
     * @return the standardized spelling, or {@code str} if it is exempt
     */
    protected String getStandardizedSpelling(String str, String str2) {
        if (this.partOfSpeechTags.isProperNounTag(str2)) {
            return str;
        }
        if (this.partOfSpeechTags.isNounTag(str2) && CharUtils.hasInternalCaps(str)) {
            return str;
        }
        if (this.partOfSpeechTags.isForeignWordTag(str2) || this.partOfSpeechTags.isNumberTag(str2)) {
            return str;
        }
        return this.standardizer.standardizeSpelling(str, this.partOfSpeechTags.getMajorWordClass(str2));
    }
}
