package org.xmlcml.ami2.plugins.word;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableSortedMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import nu.xom.Attribute;
import nu.xom.IllegalCharacterDataException;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.xmlcml.ami2.plugins.AMIArgProcessor;
import org.xmlcml.ami2.tokens.LuceneUtils;
import org.xmlcml.cmine.args.DefaultArgProcessor;
import org.xmlcml.cmine.files.CTree;
import org.xmlcml.cmine.files.ResultElement;
import org.xmlcml.cmine.files.ResultsElement;
import org.xmlcml.cmine.files.ResultsElementList;

/* loaded from: input_file:org/xmlcml/ami2/plugins/word/WordCollectionFactory.class */
/**
 * Builds word collections (word lengths, word frequencies and aggregated /
 * boolean frequencies) for the document currently held by an
 * {@link AMIArgProcessor}.
 *
 * <p>Words are extracted from the current {@link CTree}, optionally filtered
 * and transformed (abbreviations, capitalized words, case folding, stopwords,
 * stemming) and then counted into result elements.
 */
public class WordCollectionFactory {
    private static final Logger LOG = Logger.getLogger(WordCollectionFactory.class);
    private static final String COUNT2 = "count";
    private static final String VALUE = "value";
    private static final String LENGTH = "length";
    private static final String LENGTHS = "lengths";
    private static final String FREQUENCIES_ATT = "frequencies";
    private static final String FREQUENCY_ATT = "frequency";
    private static final String PROPERTY = "property";
    private static final String DOCUMENT_FREQUENCY = "documentFrequency";
    private static final String BOOLEAN_FREQUENCIES = "booleanFrequencies";
    private static final String TFIDF_FREQUENCY = "tfidfFrequency";
    private static final String TFIDF_FREQUENCIES = "tfidfFrequencies";
    private static final int DEFAULT_MIN_COUNT_IN_SET = 4;
    private static final int DEFAULT_MIN_RAW_WORD_LENGTH = 3;
    private static final int DEFAULT_MAX_RAW_WORD_LENGTH = 99999;

    /**
     * Characters removed from every word before it is counted. The character
     * class matches any single digit and also a literal '+'.
     * NOTE(review): "\\d+" (digits only) may have been intended - confirm
     * before changing, since that would alter which words are counted.
     */
    private static final Pattern DIGIT_OR_PLUS = Pattern.compile("[\\d+]");

    private WordSetWrapper stopwords;
    private AMIArgProcessor amiArgProcessor;
    private List<String> abbreviations;
    private List<String> capitalized;
    private int minCountInSet;
    private int minRawWordLength;
    private int maxRawWordLength;
    private String sortControl = COUNT2;
    private Iterable<Multiset.Entry<String>> entriesSortedByCount;
    private Iterable<Multiset.Entry<String>> entriesSortedByValue;
    private WordResultsElement frequenciesElement;
    private WordResultsElement aggregatedFrequenciesElement;
    private WordResultsElement booleanFrequenciesElement;

    /**
     * Creates a factory bound to the given processor and applies the default
     * count and word-length limits.
     *
     * @param aMIArgProcessor processor that supplies the current tree and options
     */
    public WordCollectionFactory(AMIArgProcessor aMIArgProcessor) {
        this.amiArgProcessor = aMIArgProcessor;
        setDefaults();
    }

    /** Resets the minimum count and the raw word length limits to their defaults. */
    private void setDefaults() {
        this.minCountInSet = DEFAULT_MIN_COUNT_IN_SET;
        this.minRawWordLength = DEFAULT_MIN_RAW_WORD_LENGTH;
        this.maxRawWordLength = DEFAULT_MAX_RAW_WORD_LENGTH;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Extracts the words of the current tree and adds one results element to
     * the processor for each chosen method (word lengths, word frequencies).
     *
     * <p>If no words could be extracted a warning is logged and nothing is
     * added; previously this case ran on into a NullPointerException (word
     * lengths) or registered a null results element (word frequencies).
     *
     * @throws ClassCastException if the processor is not a {@code WordArgProcessor}
     */
    public void extractWords() {
        List<String> words = createWordList();
        if (words == null) {
            LOG.warn("no words found to extract");
            return;
        }
        WordArgProcessor wordArgProcessor = (WordArgProcessor) this.amiArgProcessor;
        List<String> chosenMethods = wordArgProcessor.getChosenMethods();
        if (chosenMethods.contains(WordArgProcessor.WORD_LENGTHS)) {
            wordArgProcessor.addResultsElement(createWordLengthsResultsElement(words));
        }
        if (chosenMethods.contains(WordArgProcessor.WORD_FREQUENCIES)) {
            wordArgProcessor.addResultsElement(getWordFrequencies(words));
        }
    }

    /**
     * Extracts the raw words from the current tree, preferring scholarly HTML
     * over PDF text, and applies the configured transformations.
     *
     * @return the transformed words, or {@code null} when there is no current
     *         tree or the tree has neither scholarly HTML nor PDF text
     */
    public List<String> createWordList() {
        CTree currentCTree = this.amiArgProcessor.getCurrentCTree();
        List<String> rawWords = null;
        if (currentCTree != null) {
            if (currentCTree.hasScholarlyHTML()) {
                rawWords = currentCTree.extractWordsFromScholarlyHtml();
            } else if (currentCTree.hasFulltextPDFTXT()) {
                rawWords = currentCTree.extractWordsFromPDFTXT();
            } else {
                LOG.warn("No scholarlyHtml or PDFTXT: " + currentCTree.getDirectory());
            }
        }
        return createTransformedWords(rawWords);
    }

    /**
     * Applies the word transformations when the processor is a
     * {@code WordArgProcessor}; otherwise returns the list unchanged.
     *
     * @param list raw words, may be {@code null}
     * @return transformed words, or {@code null} if {@code list} is {@code null}
     */
    private List<String> createTransformedWords(List<String> list) {
        LOG.trace("REFACTOR createTransformedWords");
        if (list == null) {
            return null;
        }
        if (this.amiArgProcessor instanceof WordArgProcessor) {
            return transformWordStream(list);
        }
        LOG.trace("must develop TokenStream for : " + this.amiArgProcessor);
        return list;
    }

    /**
     * Runs the configured filters in a fixed order: abbreviations, capitalized
     * words, lower-casing, every stopword set, then Porter stemming.
     *
     * <p>NOTE(review): when both ABBREVIATION and CAPITALIZED are chosen the
     * two filters are chained, so only words passing both survive - confirm
     * that an intersection rather than a union is intended.
     */
    private List<String> transformWordStream(List<String> list) {
        WordArgProcessor wordArgProcessor = (WordArgProcessor) this.amiArgProcessor;
        List<String> words = list;
        if (wordArgProcessor.getChosenWordTypes().contains(WordArgProcessor.ABBREVIATION)) {
            words = createAbbreviations(words);
        }
        if (wordArgProcessor.getChosenWordTypes().contains(WordArgProcessor.CAPITALIZED)) {
            words = createCapitalized(words);
        }
        if (wordArgProcessor.getWordCaseList().contains(WordArgProcessor.IGNORE)) {
            words = toLowerCase(words);
        }
        Iterator<WordSetWrapper> stopwordSets = wordArgProcessor.getStopwordSetList().iterator();
        while (stopwordSets.hasNext()) {
            words = applyStopwordFilter(stopwordSets.next(), words);
        }
        if (wordArgProcessor.getStemming()) {
            words = LuceneUtils.applyPorterStemming(words);
        }
        return words;
    }

    /** Stores and returns the words of {@code list} accepted by {@link #isAbbreviation}. */
    private List<String> createAbbreviations(List<String> list) {
        this.abbreviations = new ArrayList<String>();
        for (String word : list) {
            if (isAbbreviation(word)) {
                this.abbreviations.add(word);
            }
        }
        return this.abbreviations;
    }

    /** Stores and returns the words of {@code list} accepted by {@link #isCapitalized}. */
    private List<String> createCapitalized(List<String> list) {
        this.capitalized = new ArrayList<String>();
        for (String word : list) {
            if (isCapitalized(word)) {
                this.capitalized.add(word);
            }
        }
        return this.capitalized;
    }

    /**
     * Tests whether a word looks like an abbreviation: it starts with an
     * upper-case character, the remaining characters are only letters of
     * either case, digits or '-', and among those remaining characters there
     * are more upper-case than lower-case ones.
     *
     * @return {@code false} for {@code null} or empty input
     */
    private boolean isAbbreviation(String str) {
        if (str == null || str.length() == 0 || !Character.isUpperCase(str.charAt(0))) {
            return false;
        }
        int upperCount = 0;
        int lowerCount = 0;
        for (int i = 1; i < str.length(); i++) {
            char c = str.charAt(i);
            if (Character.isUpperCase(c)) {
                upperCount++;
            } else if (Character.isLowerCase(c)) {
                lowerCount++;
            } else if (!Character.isDigit(c) && c != '-') {
                // any other character disqualifies the word
                return false;
            }
        }
        return upperCount > lowerCount;
    }

    /**
     * Tests whether a word is one upper-case character followed only by
     * lower-case characters (a single upper-case character also qualifies).
     *
     * @return {@code false} for {@code null} or empty input, matching
     *         {@link #isAbbreviation} instead of throwing
     */
    private boolean isCapitalized(String str) {
        if (str == null || str.length() == 0 || !Character.isUpperCase(str.charAt(0))) {
            return false;
        }
        for (int i = 1; i < str.length(); i++) {
            if (!Character.isLowerCase(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the trimmed words whose lower-case form is not contained in the
     * given stopword set. Lower-casing uses {@link Locale#ROOT} so the result
     * does not depend on the JVM default locale.
     */
    private List<String> applyStopwordFilter(WordSetWrapper wordSetWrapper, List<String> list) {
        List<String> kept = new ArrayList<String>();
        for (String word : list) {
            String trimmed = word.trim();
            if (!wordSetWrapper.contains(trimmed.toLowerCase(Locale.ROOT))) {
                kept.add(trimmed);
            }
        }
        LOG.trace("stopwords " + wordSetWrapper.size() + "; current words: " + kept.size());
        return kept;
    }

    /** Returns a new list holding the locale-independent lower-case form of each word. */
    private List<String> toLowerCase(List<String> list) {
        List<String> lowered = new ArrayList<String>(list.size());
        for (String word : list) {
            lowered.add(word.toLowerCase(Locale.ROOT));
        }
        return lowered;
    }

    /** Counts how often each word length occurs and wraps the counts in a results element. */
    private WordResultsElement createWordLengthsResultsElement(List<String> list) {
        Multiset<Integer> lengths = HashMultiset.create();
        for (String word : list) {
            lengths.add(Integer.valueOf(word.length()));
        }
        return getWordLengths(lengths);
    }

    /** Creates one child element per distinct length, carrying the length and its count. */
    private WordResultsElement getWordLengths(Multiset<Integer> multiset) {
        WordResultsElement lengthsElement = new WordResultsElement(LENGTHS);
        for (Multiset.Entry<Integer> entry : multiset.entrySet()) {
            WordResultElement lengthElement = new WordResultElement(LENGTH);
            lengthElement.setLength(entry.getElement());
            lengthElement.setCount(Integer.valueOf(entry.getCount()));
            lengthsElement.appendChild(lengthElement);
        }
        return lengthsElement;
    }

    /**
     * Counts the words after removing digit and '+' characters, keeping only
     * words whose stripped length lies within the raw word length limits, and
     * builds the frequencies element from the sorted counts.
     *
     * @param list words to count, may be {@code null}
     * @return the frequencies element, or {@code null} if {@code list} is {@code null}
     */
    private WordResultsElement getWordFrequencies(List<String> list) {
        if (list == null) {
            LOG.warn("No current words ");
            this.frequenciesElement = null;
            return null;
        }
        Multiset<String> wordCounts = HashMultiset.create();
        for (String rawWord : list) {
            String word = DIGIT_OR_PLUS.matcher(rawWord).replaceAll("");
            int length = word.length();
            if (length >= this.minRawWordLength && length <= this.maxRawWordLength) {
                wordCounts.add(word);
            }
        }
        this.entriesSortedByCount = getEntriesSortedByCount(wordCounts);
        this.entriesSortedByValue = getEntriesSortedByValue(wordCounts);
        this.frequenciesElement = createFrequenciesElement(getSortedEntries());
        return this.frequenciesElement;
    }

    /**
     * Builds the frequencies element from the given entries, dropping entries
     * whose count is below the minimum count. When sorting by count, entries
     * with equal counts are additionally ordered by word.
     */
    private WordResultsElement createFrequenciesElement(Iterable<Multiset.Entry<String>> iterable) {
        this.frequenciesElement = new WordResultsElement(FREQUENCIES_ATT);
        List<WordResultElement> elements = new ArrayList<WordResultElement>();
        for (Multiset.Entry<String> entry : iterable) {
            int count = entry.getCount();
            if (count < this.minCountInSet) {
                continue;
            }
            WordResultElement element = new WordResultElement(FREQUENCY_ATT);
            try {
                element.addAttribute(new Attribute("word", String.valueOf(entry.getElement())));
                element.addAttribute(new Attribute(COUNT2, String.valueOf(count)));
                elements.add(element);
            } catch (IllegalCharacterDataException e) {
                // The word cannot be represented as an XML attribute value; skip it
                // but leave a trace instead of dropping it silently.
                LOG.debug("Skipped word with characters illegal in XML: " + e.getMessage());
            }
        }
        if (COUNT2.equals(this.sortControl)) {
            elements = sortByValue(elements);
        }
        for (WordResultElement element : elements) {
            this.frequenciesElement.appendChild(element);
        }
        return this.frequenciesElement;
    }

    /**
     * Splits the list into consecutive runs of equal count and sorts each run
     * by word, keeping the order of the runs themselves.
     */
    private List<WordResultElement> sortByValue(List<WordResultElement> list) {
        List<List<WordResultElement>> runs = new ArrayList<List<WordResultElement>>();
        List<WordResultElement> currentRun = null;
        int currentCount = -1;
        for (WordResultElement element : list) {
            int count = element.getCount().intValue();
            // explicit null check so the first element always opens a run,
            // whatever its count is
            if (currentRun == null || count != currentCount) {
                currentRun = new ArrayList<WordResultElement>();
                runs.add(currentRun);
                currentCount = count;
            }
            currentRun.add(element);
        }
        List<WordResultElement> sorted = new ArrayList<WordResultElement>(list.size());
        for (List<WordResultElement> run : runs) {
            sorted.addAll(sortByValue0(run));
        }
        return sorted;
    }

    /**
     * Returns the elements ordered by the natural ordering of their words.
     * Elements are looked up by word, so words are expected to be distinct
     * within the list (the only caller passes elements built from multiset
     * entries).
     */
    private List<WordResultElement> sortByValue0(List<WordResultElement> list) {
        HashMap<String, WordResultElement> elementByWord = new HashMap<String, WordResultElement>();
        List<String> words = new ArrayList<String>(list.size());
        for (WordResultElement element : list) {
            String word = element.getWord();
            words.add(word);
            elementByWord.put(word, element);
        }
        Collections.sort(words);
        List<WordResultElement> sorted = new ArrayList<WordResultElement>(list.size());
        for (String word : words) {
            sorted.add(elementByWord.get(word));
        }
        return sorted;
    }

    /**
     * Returns the entries of the multiset ordered by the natural ordering of
     * their elements.
     */
    public static Iterable<Multiset.Entry<String>> getEntriesSortedByValue(Multiset<String> multiset) {
        ImmutableSortedMultiset<String> sorted = ImmutableSortedMultiset.copyOf(multiset);
        return sorted.entrySet();
    }

    /** Returns the entries of the multiset ordered by descending count. */
    public static Iterable<Multiset.Entry<String>> getEntriesSortedByCount(Multiset<String> multiset) {
        return Multisets.copyHighestCountFirst(multiset).entrySet();
    }

    /**
     * Selects the pre-computed entries that match the sort control. An
     * unrecognised sort control falls back to count order rather than
     * returning {@code null}, which the caller would fail on.
     */
    private Iterable<Multiset.Entry<String>> getSortedEntries() {
        if (VALUE.equals(this.sortControl)) {
            return this.entriesSortedByValue;
        }
        if (!COUNT2.equals(this.sortControl)) {
            LOG.warn("Unknown sort control: " + this.sortControl + "; sorting by count");
        }
        return this.entriesSortedByCount;
    }

    /** @return the minimum count a word needs to appear in the frequencies element */
    public int getMinCountInSet() {
        return this.minCountInSet;
    }

    /** @param i the minimum count a word needs to appear in the frequencies element */
    public void setMinCountInSet(int i) {
        this.minCountInSet = i;
    }

    /** @return the minimum length (after digit stripping) of a counted word */
    public int getMinRawWordLength() {
        return this.minRawWordLength;
    }

    /** @param i the minimum length (after digit stripping) of a counted word */
    public void setMinRawWordLength(int i) {
        this.minRawWordLength = i;
    }

    /** @return the maximum length (after digit stripping) of a counted word */
    public int getMaxRawWordLength() {
        return this.maxRawWordLength;
    }

    /** @param i the maximum length (after digit stripping) of a counted word */
    public void setMaxRawWordLength(int i) {
        this.maxRawWordLength = i;
    }

    /**
     * @return the stopwords field; nothing in this class assigns it, so it is
     *         {@code null} unless set by other means
     */
    public WordSetWrapper getStopwords() {
        return this.stopwords;
    }

    /** @return the abbreviations found by the last abbreviation filter, or {@code null} if it never ran */
    public List<String> getAbbreviations() {
        return this.abbreviations;
    }

    /** @return the capitalized words found by the last capitalized filter, or {@code null} if it never ran */
    public List<String> getCapitalized() {
        return this.capitalized;
    }

    /**
     * Aggregates the word counts of all "frequencies" results in the list and
     * stores them, highest count first, as the aggregated frequencies element.
     */
    public void createAggregateFrequenciesElement(ResultsElementList resultsElementList) {
        Multiset<String> aggregate = createAggregateSet(resultsElementList);
        createWordResultElementsAndAddToAggregateFrequenciesElement(getEntriesSortedByCount(aggregate));
    }

    /** Sums the word counts of every results element titled "frequencies"; others are skipped. */
    private Multiset<String> createAggregateSet(ResultsElementList resultsElementList) {
        Multiset<String> aggregate = HashMultiset.create();
        Iterator<ResultsElement> results = resultsElementList.iterator();
        while (results.hasNext()) {
            ResultsElement resultsElement = results.next();
            String title = resultsElement.getTitle();
            if (FREQUENCIES_ATT.equals(title)) {
                addResultsToSet(aggregate, resultsElement);
            } else {
                LOG.debug("Skipped non:frequencies result: " + title);
            }
        }
        return aggregate;
    }

    /**
     * Replaces the aggregated frequencies element with a new one holding one
     * child (word and count) per entry, in iteration order.
     */
    private WordResultsElement createWordResultElementsAndAddToAggregateFrequenciesElement(Iterable<Multiset.Entry<String>> iterable) {
        this.aggregatedFrequenciesElement = new WordResultsElement(FREQUENCIES_ATT);
        for (Multiset.Entry<String> entry : iterable) {
            WordResultElement element = new WordResultElement(FREQUENCY_ATT);
            element.setWord(entry.getElement());
            element.setCount(Integer.valueOf(entry.getCount()));
            this.aggregatedFrequenciesElement.appendChild(element);
        }
        return this.aggregatedFrequenciesElement;
    }

    /**
     * Adds each child's word to the multiset with the child's count as the
     * number of occurrences. Every child is cast to {@code WordResultElement}.
     */
    private void addResultsToSet(Multiset<String> multiset, ResultsElement resultsElement) {
        Iterator<ResultElement> children = resultsElement.iterator();
        while (children.hasNext()) {
            WordResultElement child = (WordResultElement) children.next();
            multiset.add(child.getWord(), child.getCount().intValue());
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Aggregates the word counts of the given list, stores the result as the
     * aggregated frequencies element (highest count first) and returns it.
     */
    public WordResultsElement createAggregatedFrequenciesElement(WordResultsElementList wordResultsElementList) {
        Multiset<String> aggregate = createAggregateSet(wordResultsElementList);
        this.aggregatedFrequenciesElement =
                createWordResultElementsAndAddToAggregateFrequenciesElement(getEntriesSortedByCount(aggregate));
        return this.aggregatedFrequenciesElement;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Builds the boolean frequencies element: for every word of the aggregated
     * frequencies, the value of {@code getSingleCountsOfWord} on the list is
     * recorded when it is positive.
     *
     * @param defaultArgProcessor not used by this method
     * @param wordResultsElementList the per-document frequency results
     */
    public WordResultsElement createBooleanFrequencies(DefaultArgProcessor defaultArgProcessor, WordResultsElementList wordResultsElementList) {
        this.aggregatedFrequenciesElement = createAggregatedFrequenciesElement(wordResultsElementList);
        this.booleanFrequenciesElement = new WordResultsElement(BOOLEAN_FREQUENCIES);
        Iterator<ResultElement> aggregated = this.aggregatedFrequenciesElement.iterator();
        while (aggregated.hasNext()) {
            String word = ((WordResultElement) aggregated.next()).getWord();
            int documentCount = wordResultsElementList.getSingleCountsOfWord(word);
            if (documentCount > 0) {
                WordResultElement element = new WordResultElement(DOCUMENT_FREQUENCY);
                element.setWord(word);
                element.setCount(Integer.valueOf(documentCount));
                this.booleanFrequenciesElement.appendChild(element);
            }
        }
        return this.booleanFrequenciesElement;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Placeholder: returns a new element created with "tfidfFrequencies" and
     * no children. Neither argument is used.
     */
    public WordResultsElement createTFIDFFrequencies(DefaultArgProcessor defaultArgProcessor, WordResultsElementList wordResultsElementList) {
        return new WordResultsElement(TFIDF_FREQUENCIES);
    }

    static {
        // Sets this class's logger level to DEBUG programmatically when the class is loaded.
        LOG.setLevel(Level.DEBUG);
    }
}
