package uk.ac.cam.ch.wwmm.oscarMEMM.memm;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.parser.AbstractBottomUpParser;
import org.apache.commons.collections.set.UnmodifiableSet;
import org.apache.xpath.XPath;
import org.xmlcml.cml.element.CMLBond;
import org.xmlcml.euclid.EuclidConstants;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.document.TokenSequence;
import uk.ac.cam.ch.wwmm.oscar.terms.TermSets;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;
import uk.ac.cam.ch.wwmm.oscar.types.NamedEntityType;
import uk.ac.cam.ch.wwmm.oscarMEMM.FeatureSet;
import uk.ac.cam.ch.wwmm.oscarMEMM.memm.data.MEMMModel;
import uk.ac.cam.ch.wwmm.oscarrecogniser.extractedtrainingdata.ExtractedTrainingData;
import uk.ac.cam.ch.wwmm.oscarrecogniser.tokenanalysis.NGram;
import uk.ac.cam.ch.wwmm.oscarrecogniser.tokenanalysis.TokenSuffixClassifier;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/oscarMEMM/memm/FeatureExtractor.class */
/**
 * Computes, for every token of a {@link TokenSequence}, the list of string features
 * derived from the token itself and from its immediate neighbours.
 *
 * <p>Instances are only created internally by the static {@code extractFeatures}
 * entry points; all of the work is done in the constructor and the result is read
 * back through {@link #getFeatureLists()}.
 *
 * <p>NOTE(review): several expressions reference constants from unrelated libraries
 * ({@code AbstractBottomUpParser.COMPLETE}, {@code EuclidConstants.S_COLON},
 * {@code CMLBond.HASH_SYMB}, {@code XPath.MATCH_SCORE_QNAME}). They look like
 * decompiler substitutions for inlined literals; they are kept verbatim so that the
 * emitted feature strings cannot change - confirm against the original source
 * before replacing them.
 */
public final class FeatureExtractor {
    // Replacement strings used by wordShape(String), one per character class.
    private static final String ZERO = "0";
    private static final String ONE = "1";
    private static final String TWO = "2";
    private static final String THREE = "3";
    private static final String FOUR = "4";
    private static final String FIVE = "5";

    private static final String STOPWORD_USER_DEFINED_FEATURE = "$STOP:UDW";
    private static final String STOPWORD_NONCHEMICALNONWORD_FEATURE = "$STOP:NCNW";
    private static final String STOPWORD_NON_CHEMICAL_WORD_FEATURE = "$STOP:NCW";
    private static final String STOPWORD_CLOSED_CLASS_FEATURE = "$STOP:CLOSEDCLASS";
    private static final String STOPWORD_FEATURE = "$STOP:STOPWORD";
    private static final String OXIDATION_STATE_FEATURE = "oxidationState";
    private static final String ENDS_IN_ELEMENT_FEATURE = "endsinem";
    private static final String ELEMENT_FEATURE = "element";
    private static final String IN_NAMEDICT_FEATURE = "inCND";
    private static final String NGRAM_DEC_FEATURE = "ngram-=";
    private static final String NGRAM_INC_FEATURE = "ngram+=";
    private static final String SUFFIX_CT_FEATURE = "ct=";
    private static final String NGRAMSCORE_DEC_FEATURE = "ngscore-=";
    private static final String NGRAMSCORE_INC_FEATURE = "ngscore+=";
    private static final double NGRAM_SCORE_UPPER_BOUND = 15.0d;
    private static final int SUFFIX_HI_SCORE = 100;
    private static final int SUFFIX_LO_SCORE = -100;
    private static final double SUFFIX_SCORE_UPPER_BOUND = 15.0d;
    private static final double SUFFIX_SCORE_LOWER_BOUND = -15.0d;
    private static final String SUFFIX_SCORE_DEC_FEATURE = "sscore-=";
    private static final String SUFFIX_SCORE_INC_FEATURE = "sscore+=";
    private static final String NGRAM_FEATURE = "G=";
    private static final double NGRAM_SCORE_LOWER_BOUND = -15.0d;
    private static final String RE_LINE_END = "$";
    private static final String RE_LINE_START = "^";
    private static final String SUFFIX_FEATURE = "s=";
    private static final String SHAPE_FEATURE = "ws=";
    private static final String SHAPE_COMPLEX_FEATURE = "complex";
    private static final String WITHOUT_TERMINAL_S_FEATURE = "wts=";
    private static final String RNMID_FEATURE = "$RNMID";
    private static final String RNEND_FEATURE = "$RNEND";
    private static final String BIGRAM_PREFIX = "bg:";
    private static final String SUSPECT_PN_BIGRAM_PREFIX = "suspectpn->bg:";
    private static final String PN_BIGRAM_PREFIX = "pn->bg:";
    private static final String PN_CONTEXT_PREFIX = "pn->c:";
    private static final String IN_SUSPECT_PN_FEATURE = "inSuspectPN";
    private static final String IN_PN_FEATURE = "inPN";
    private static final String UNKNOWN_SUFFIX = "unknown";

    /** Longest character n-gram emitted by {@link #makeNGramFeatures(String, FeatureList)}. */
    private static final int MAX_NGRAM_LENGTH = 4;
    /** Word shapes longer than this are collapsed to {@link #SHAPE_COMPLEX_FEATURE}. */
    private static final int MAX_SIMPLE_SHAPE_LENGTH = 3;
    /** Tokens shorter than this always get a bigramable word feature. */
    private static final int SHORT_WORD_LENGTH = 4;

    private final TokenSequence tokSeq;
    private final List<FeatureSet> tokenFeatureSets;
    // The three switches below are never assigned anywhere in this class, so they
    // always hold their initial value.
    private final boolean noPC = false;
    private final boolean noC = false;
    private final boolean newSuffixes = false;
    private final NGram ngram;
    private final ExtractedTrainingData etd;
    private final UnmodifiableSet chemNameDictNames;

    private static final Pattern DIGITS = Pattern.compile("[0-9]+");
    private static final Pattern LETTERS = Pattern.compile("[a-z][a-z]+");
    private static final Pattern SINGLE_LETTER = Pattern.compile("[a-z]");
    private static final Pattern CAPS = Pattern.compile("[A-Z][A-Z]+");
    private static final Pattern SINGLE_CAP = Pattern.compile("[A-Z]");
    private static final Pattern GREEKS = Pattern.compile("[αβγδεζηθικλμνξοπρςστυφχψω]+");
    private static final String WORD_FEATURE = "w=";
    private static final int WORD_FEATURE_LENGTH = WORD_FEATURE.length();
    private static final Pattern suffixPattern = Pattern.compile(".*?((yl|ide|ite|ate|ium|ane|yne|ene|ol|ase|ic|oxy|ino|at(ed|ion|ing)|lys(is|es|ed|ing|tic)|i[sz](ed|ations|ing)|)s?)");
    private static final Pattern wordPattern = Pattern.compile(".*[a-z][a-z].*");
    private static final Pattern oxPattern = Pattern.compile("\\(([oO]|[iI]{1,4}|[iI]{0,3}[xvXV]|[xvXV][iI]{0,4})\\)");
    private static final Pattern pnPattern = Pattern.compile("(Mc|Mac)?[A-Z]\\p{Ll}\\p{Ll}+(s'|'s)?");
    /** One capital followed by lower-case ASCII letters; compiled once instead of per token. */
    private static final Pattern CAPITALISED_WORD = Pattern.compile("[A-Z][a-z]+");

    /**
     * Extracts features using the n-gram model, extracted training data and
     * dictionary names held by the given model.
     *
     * @param tokenSequence the tokens to extract features for
     * @param mEMMModel source of the n-gram model, training data and dictionary names
     * @return one feature list per token, in token order
     */
    public static List<FeatureList> extractFeatures(TokenSequence tokenSequence, MEMMModel mEMMModel) {
        return extractFeatures(tokenSequence, mEMMModel.getNGram(), mEMMModel.getExtractedTrainingData(), mEMMModel.getChemNameDictNames());
    }

    /**
     * Extracts features using a freshly constructed {@link ExtractedTrainingData}.
     *
     * @param tokenSequence the tokens to extract features for
     * @param nGram the n-gram model used to score words
     * @param unmodifiableSet the chemical name dictionary entries
     * @return one feature list per token, in token order
     */
    public static List<FeatureList> extractFeatures(TokenSequence tokenSequence, NGram nGram, UnmodifiableSet unmodifiableSet) {
        return extractFeatures(tokenSequence, nGram, new ExtractedTrainingData(), unmodifiableSet);
    }

    /**
     * Extracts features with every collaborator supplied explicitly.
     *
     * @param tokenSequence the tokens to extract features for
     * @param nGram the n-gram model used to score words
     * @param extractedTrainingData word lists consulted while building features
     * @param unmodifiableSet the chemical name dictionary entries
     * @return one feature list per token, in token order
     */
    static List<FeatureList> extractFeatures(TokenSequence tokenSequence, NGram nGram, ExtractedTrainingData extractedTrainingData, UnmodifiableSet unmodifiableSet) {
        return new FeatureExtractor(tokenSequence, nGram, extractedTrainingData, unmodifiableSet).getFeatureLists();
    }

    /** Collects the merged feature list of every token, in token order. */
    private List<FeatureList> getFeatureLists() {
        List<FeatureList> featureLists = new ArrayList<FeatureList>(this.tokenFeatureSets.size());
        for (FeatureSet featureSet : this.tokenFeatureSets) {
            featureLists.add(featureSet.getFeatures());
        }
        return featureLists;
    }

    private FeatureExtractor(TokenSequence tokenSequence, NGram nGram, ExtractedTrainingData extractedTrainingData, UnmodifiableSet unmodifiableSet) {
        this.tokSeq = tokenSequence;
        this.ngram = nGram;
        this.etd = extractedTrainingData;
        this.chemNameDictNames = unmodifiableSet;
        this.tokenFeatureSets = newFeatureSets(tokenSequence.getSize());
        makeFeatures();
    }

    /**
     * Runs the two passes: per-token features first, then the merge pass, which
     * reads the contextable and bigramable features of neighbouring tokens and
     * therefore needs the first pass to be complete.
     */
    private void makeFeatures() {
        for (int position = 0; position < this.tokSeq.getSize(); position++) {
            makeFeatures(position);
        }
        for (int position = 0; position < this.tokSeq.getSize(); position++) {
            mergeFeatures(position);
        }
    }

    /** Creates a list holding {@code size} empty feature sets. */
    private static List<FeatureSet> newFeatureSets(int size) {
        List<FeatureSet> featureSets = new ArrayList<FeatureSet>(size);
        for (int n = 0; n < size; n++) {
            featureSets.add(new FeatureSet());
        }
        return featureSets;
    }

    /** Returns {@code "w=" + str}. */
    private String makeWordFeature(String str) {
        return new StringBuilder(str.length() + WORD_FEATURE_LENGTH).append(WORD_FEATURE).append(str).toString();
    }

    /**
     * Builds the features that depend only on the token at the given position and
     * stores them in that token's feature set.
     *
     * @param i index of the token in the token sequence
     */
    private void makeFeatures(int i) {
        FeatureSet featureSet = this.tokenFeatureSets.get(i);
        FeatureList features = featureSet.getFeatures();
        FeatureList contextableFeatures = featureSet.getContextableFeatures();
        FeatureList bigramableFeatures = featureSet.getBigramableFeatures();
        Token token = this.tokSeq.getToken(i);
        String surface = token.getSurface();

        contextableFeatures.addFeature(makeWordFeature(surface));
        String normalisedName = StringTools.normaliseName(surface);
        if (!surface.equals(normalisedName)) {
            contextableFeatures.addFeature(makeWordFeature(normalisedName));
        }
        makeWordFeatures(surface, normalisedName, bigramableFeatures, this.etd);
        makeReactionFeatures(surface, bigramableFeatures, contextableFeatures, this.etd);
        contextableFeatures.addFeature(WITHOUT_TERMINAL_S_FEATURE + StringTools.withoutTerminalS(normalisedName));
        makeShapeFeatures(surface, bigramableFeatures, contextableFeatures);
        makeSuffixFeature(surface, contextableFeatures);
        makeNGramFeatures(surface, features);

        // Score-based features are only produced for tokens containing at least
        // two consecutive lower-case ASCII letters.
        if (wordPattern.matcher(surface).matches()) {
            if (this.newSuffixes) {
                handleNewSuffices(surface, normalisedName, bigramableFeatures, contextableFeatures, features, token);
            } else {
                handleNoNewSuffices(surface, normalisedName, bigramableFeatures, contextableFeatures, features, token);
            }
        }

        if (this.chemNameDictNames.contains(surface)) {
            features.addFeature(IN_NAMEDICT_FEATURE);
        }

        TermSets termSets = TermSets.getDefaultInstance();
        if (termSets.getElements().contains(normalisedName)) {
            contextableFeatures.addFeature(ELEMENT_FEATURE);
            bigramableFeatures.addFeature(ELEMENT_FEATURE);
        }
        if (termSets.getEndingInElementNamePattern().matcher(surface).matches()) {
            contextableFeatures.addFeature(ENDS_IN_ELEMENT_FEATURE);
            bigramableFeatures.addFeature(ENDS_IN_ELEMENT_FEATURE);
        }
        if (oxPattern.matcher(surface).matches()) {
            contextableFeatures.addFeature(OXIDATION_STATE_FEATURE);
            bigramableFeatures.addFeature(OXIDATION_STATE_FEATURE);
        }
        if (termSets.getStopWords().contains(normalisedName)) {
            features.addFeature(STOPWORD_FEATURE);
        }
        if (termSets.getClosedClass().contains(normalisedName)) {
            features.addFeature(STOPWORD_CLOSED_CLASS_FEATURE);
        }
        if (this.etd.getNonChemicalWords().contains(normalisedName)) {
            features.addFeature(STOPWORD_NON_CHEMICAL_WORD_FEATURE);
        }
        if (this.etd.getNonChemicalNonWords().contains(normalisedName) && !termSets.getElements().contains(normalisedName)) {
            features.addFeature(STOPWORD_NONCHEMICALNONWORD_FEATURE);
        }
        // A user-dictionary word counts as a stop word only when neither the name
        // dictionary nor the training data lists it as chemical.
        if (termSets.getUsrDictWords().contains(normalisedName)
                && !this.chemNameDictNames.contains(normalisedName)
                && !this.etd.getChemicalWords().contains(normalisedName)) {
            features.addFeature(STOPWORD_USER_DEFINED_FEATURE);
        }
    }

    /**
     * Adds n-gram score features and, when the (possibly overridden) score is
     * positive, the suffix-type feature.
     */
    private void handleNoNewSuffices(String surface, String normalisedName, FeatureList bigramable, FeatureList contextable, FeatureList features, Token token) {
        double ngramScore = this.ngram.testWord(surface);
        NamedEntityType suffixType = TokenSuffixClassifier.classifyBySuffix(token.getSurface());
        double score = clamp(ngramScore, NGRAM_SCORE_LOWER_BOUND, NGRAM_SCORE_UPPER_BOUND);
        addScoreFeatures(features, score, NGRAM_INC_FEATURE, NGRAM_DEC_FEATURE, suffixType);
        score = applyDictionaryOverrides(score, surface, normalisedName);
        addSuffixTypeFeature(score, suffixType, bigramable, contextable);
    }

    /**
     * Adds suffix score features, then n-gram score features and, when the
     * (possibly overridden) suffix score is positive, the suffix-type feature.
     */
    private void handleNewSuffices(String surface, String normalisedName, FeatureList bigramable, FeatureList contextable, FeatureList features, Token token) {
        double suffixScore = this.ngram.testWordSuffix(surface);
        NamedEntityType suffixType = TokenSuffixClassifier.classifyBySuffix(token.getSurface());
        double score = clamp(suffixScore, SUFFIX_SCORE_LOWER_BOUND, SUFFIX_SCORE_UPPER_BOUND);
        addScoreFeatures(features, score, SUFFIX_SCORE_INC_FEATURE, SUFFIX_SCORE_DEC_FEATURE, suffixType);
        score = applyDictionaryOverrides(score, surface, normalisedName);

        double ngramScore = clamp(this.ngram.testWord(surface), NGRAM_SCORE_LOWER_BOUND, NGRAM_SCORE_UPPER_BOUND);
        addScoreFeatures(features, ngramScore, NGRAMSCORE_INC_FEATURE, NGRAMSCORE_DEC_FEATURE, suffixType);

        addSuffixTypeFeature(score, suffixType, bigramable, contextable);
    }

    /** Restricts {@code value} to the range {@code [lower, upper]}. */
    private static double clamp(double value, double lower, double upper) {
        return Math.min(Math.max(value, lower), upper);
    }

    /**
     * Adds the same feature once per unit of score magnitude: for a positive score
     * the feature {@code incPrefix + type name} is added for every integer
     * {@code 0 <= n < score}; for a negative score {@code decPrefix + type name}
     * is added for every integer {@code 0 >= n > score}. Nothing is added for
     * zero or NaN.
     */
    private static void addScoreFeatures(FeatureList target, double score, String incPrefix, String decPrefix, NamedEntityType type) {
        if (score > 0) {
            String feature = (incPrefix + type.getName()).intern();
            for (int n = 0; n < score; n++) {
                target.addFeature(feature);
            }
        } else if (score < 0) {
            String feature = (decPrefix + type.getName()).intern();
            for (int n = 0; n > score; n--) {
                target.addFeature(feature);
            }
        }
    }

    /**
     * Replaces the score by a fixed low value when the word is in the user
     * dictionary and by a fixed high value when it is a known chemical word or
     * dictionary name. Later checks win over earlier ones.
     */
    private double applyDictionaryOverrides(double score, String surface, String normalisedName) {
        double result = score;
        if (TermSets.getDefaultInstance().getUsrDictWords().contains(normalisedName) || TermSets.getDefaultInstance().getUsrDictWords().contains(surface)) {
            result = SUFFIX_LO_SCORE;
        }
        if (this.etd.getChemicalWords().contains(normalisedName)) {
            result = SUFFIX_HI_SCORE;
        }
        if (this.chemNameDictNames.contains(surface)) {
            result = SUFFIX_HI_SCORE;
        }
        return result;
    }

    /** Adds {@code "ct=" + type name} to both lists when the score exceeds the threshold. */
    private static void addSuffixTypeFeature(double score, NamedEntityType type, FeatureList bigramable, FeatureList contextable) {
        if (score > XPath.MATCH_SCORE_QNAME) {
            String feature = SUFFIX_CT_FEATURE + type.getName();
            contextable.addFeature(feature);
            bigramable.addFeature(feature);
        }
    }

    /**
     * Adds character n-gram features for {@code "^" + str + "$"}. For a window
     * starting at offset {@code start}, n-grams of length {@code 4 - start} up to 4
     * are emitted, so shorter n-grams are only taken from the later windows.
     */
    private void makeNGramFeatures(String str, FeatureList featureList) {
        StringBuilder decorated = new StringBuilder(RE_LINE_START).append(str).append(RE_LINE_END);
        int windowCount = decorated.length() - (MAX_NGRAM_LENGTH - 1);
        for (int start = 0; start < windowCount; start++) {
            for (int length = 1; length <= MAX_NGRAM_LENGTH; length++) {
                if (start >= MAX_NGRAM_LENGTH - length) {
                    featureList.addFeature(makeNGramFeature(decorated, start, length));
                }
            }
        }
    }

    /**
     * Builds the interned feature {@code <i2>G=<chars>} where {@code <chars>} are
     * the {@code i2} characters of {@code sb} starting at index {@code i}.
     *
     * @param sb the character source
     * @param i index of the first character to copy
     * @param i2 number of characters to copy; also used as the feature's prefix
     * @return the interned feature string
     */
    static String makeNGramFeature(StringBuilder sb, int i, int i2) {
        StringBuilder feature = new StringBuilder();
        // append(int) yields the same text as the per-digit character constants.
        feature.append(i2).append(NGRAM_FEATURE);
        int end = i + i2;
        for (int pos = i; pos < end; pos++) {
            feature.append(sb.charAt(pos));
        }
        return feature.toString().intern();
    }

    /** Adds {@code "s=" + suffix} for the suffix found by {@link #getSuffix(String)}. */
    private void makeSuffixFeature(String str, FeatureList featureList) {
        featureList.addFeature(SUFFIX_FEATURE + getSuffix(str));
    }

    /**
     * Adds the word-shape feature to both lists unless the shape equals the token
     * itself. Shapes longer than three characters are reported as "complex".
     */
    private void makeShapeFeatures(String str, FeatureList featureList, FeatureList featureList2) {
        String shape = wordShape(str);
        if (shape.length() > MAX_SIMPLE_SHAPE_LENGTH) {
            shape = SHAPE_COMPLEX_FEATURE;
        }
        if (shape.equals(str)) {
            return;
        }
        String feature = SHAPE_FEATURE + shape;
        featureList.addFeature(feature);
        featureList2.addFeature(feature);
    }

    /** Flags tokens found in the training data's rnEnd and rnMid collections. */
    private void makeReactionFeatures(String str, FeatureList featureList, FeatureList featureList2, ExtractedTrainingData extractedTrainingData) {
        if (extractedTrainingData.getRnEnd().contains(str)) {
            featureList.addFeature(RNEND_FEATURE);
            featureList2.addFeature(RNEND_FEATURE);
        }
        if (extractedTrainingData.getRnMid().contains(str)) {
            featureList.addFeature(RNMID_FEATURE);
            featureList2.addFeature(RNMID_FEATURE);
        }
    }

    /**
     * Adds word features for short tokens (fewer than four characters) and for
     * tokens listed as polysemous, rnEnd or rnMid in the training data.
     */
    private void makeWordFeatures(String str, String str2, FeatureList featureList, ExtractedTrainingData extractedTrainingData) {
        boolean qualifies = str.length() < SHORT_WORD_LENGTH
                || extractedTrainingData.getPolysemous().contains(str)
                || extractedTrainingData.getRnEnd().contains(str)
                || extractedTrainingData.getRnMid().contains(str);
        if (!qualifies) {
            return;
        }
        featureList.addFeature(makeWordFeature(str));
        if (!str.equals(str2)) {
            featureList.addFeature(makeWordFeature(str2));
        }
    }

    /**
     * Adds to the token at position {@code i} the features that depend on its
     * neighbours: prefixed copies of contextable features within one token either
     * side, bigrams of bigramable features of adjacent tokens, and proper-noun
     * features.
     */
    private void mergeFeatures(int i) {
        FeatureList features = this.tokenFeatureSets.get(i).getFeatures();
        // Window of at most one token either side, truncated at the sequence ends.
        int before = Math.min(1, i);
        int after = Math.min(1, (this.tokSeq.getSize() - i) - 1);

        if (!this.noC) {
            for (int offset = -before; offset <= after; offset++) {
                Iterator<String> contextable = this.tokenFeatureSets.get(i + offset).getContextableFeatures().iterator();
                while (contextable.hasNext()) {
                    features.addFeature((AbstractBottomUpParser.COMPLETE + offset + EuclidConstants.S_COLON + contextable.next()).intern());
                }
            }
        }

        // Only adjacent offset pairs (left, left + 1) are combined.
        for (int left = -before; left < after; left++) {
            int right = left + 1;
            String prefix = BIGRAM_PREFIX + left + EuclidConstants.S_COLON + right + EuclidConstants.S_COLON;
            FeatureSet leftSet = this.tokenFeatureSets.get(i + left);
            FeatureSet rightSet = this.tokenFeatureSets.get(i + right);
            Iterator<String> leftFeatures = leftSet.getBigramableFeatures().iterator();
            while (leftFeatures.hasNext()) {
                String leftFeature = leftFeatures.next();
                Iterator<String> rightFeatures = rightSet.getBigramableFeatures().iterator();
                while (rightFeatures.hasNext()) {
                    features.addFeature((prefix + leftFeature + CMLBond.HASH_SYMB + rightFeatures.next()).intern());
                }
            }
        }

        addProperNounFeatures(i, features);
    }

    /**
     * If the token at {@code i} matches the proper-noun pattern, links it to the
     * token following the (possibly hyphenated) name and marks the tokens in
     * between as being inside a proper noun.
     */
    private void addProperNounFeatures(int i, FeatureList features) {
        String surface = this.tokSeq.getToken(i).getSurface();
        if (!pnPattern.matcher(surface).matches()) {
            return;
        }

        // A capitalised word whose lower-case form is a user-dictionary word, while
        // the capitalised form is not, is treated as a suspect proper noun.
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        boolean suspect = CAPITALISED_WORD.matcher(surface).matches()
                && TermSets.getDefaultInstance().getUsrDictWords().contains(surface.toLowerCase(Locale.ROOT))
                && !TermSets.getDefaultInstance().getUsrDictWords().contains(surface);
        if (!this.noPC && this.etd.getPnStops().contains(surface)) {
            suspect = true;
        }

        // Skip over "-Name" continuations; a hyphenated chain is never suspect.
        int following = i + 1;
        while (following < this.tokSeq.getSize() - 2
                && StringTools.isHyphen(this.tokSeq.getToken(following).getSurface())
                && pnPattern.matcher(this.tokSeq.getToken(following + 1).getSurface()).matches()) {
            following += 2;
            suspect = false;
        }
        if (following >= this.tokSeq.getSize()) {
            return;
        }

        String bigramPrefix = suspect ? SUSPECT_PN_BIGRAM_PREFIX : PN_BIGRAM_PREFIX;
        Iterator<String> bigramable = this.tokenFeatureSets.get(following).getBigramableFeatures().iterator();
        while (bigramable.hasNext()) {
            features.addFeature((bigramPrefix + bigramable.next()).intern());
        }
        if (!suspect) {
            Iterator<String> contextable = this.tokenFeatureSets.get(following).getContextableFeatures().iterator();
            while (contextable.hasNext()) {
                features.addFeature((PN_CONTEXT_PREFIX + contextable.next()).intern());
            }
        }
        String insideFeature = suspect ? IN_SUSPECT_PN_FEATURE : IN_PN_FEATURE;
        for (int position = i + 1; position <= following; position++) {
            this.tokenFeatureSets.get(position).getFeatures().addFeature(insideFeature);
        }
    }

    /** Returns group 1 of the suffix pattern, or "unknown" when the pattern does not match. */
    private String getSuffix(String str) {
        Matcher matcher = suffixPattern.matcher(str);
        if (matcher.matches()) {
            return matcher.group(1);
        }
        return UNKNOWN_SUFFIX;
    }

    /**
     * Reduces a token to its shape by replacing, in this order, digit runs with
     * "0", runs of two or more lower-case letters with "1", remaining single
     * lower-case letters with "2", runs of two or more capitals with "3",
     * remaining single capitals with "4" and runs of Greek lower-case letters
     * with "5". Other characters are left unchanged.
     */
    private static String wordShape(String str) {
        String shape = DIGITS.matcher(str).replaceAll(ZERO);
        shape = LETTERS.matcher(shape).replaceAll(ONE);
        shape = SINGLE_LETTER.matcher(shape).replaceAll(TWO);
        shape = CAPS.matcher(shape).replaceAll(THREE);
        shape = SINGLE_CAP.matcher(shape).replaceAll(FOUR);
        return GREEKS.matcher(shape).replaceAll(FIVE);
    }
}
