package edu.northwestern.at.morphadorner.tools.namedentities;

import edu.northwestern.at.morphadorner.gate.Annie;
import edu.northwestern.at.utils.FileNameUtils;
import edu.northwestern.at.utils.FileUtils;
import edu.northwestern.at.utils.Formatters;
import edu.northwestern.at.utils.ListFactory;
import edu.northwestern.at.utils.PatternReplacer;
import edu.northwestern.at.utils.StringUtils;
import edu.northwestern.at.utils.TextFile;
import edu.northwestern.at.utils.xml.DOMUtils;
import java.io.File;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;

/* loaded from: input_file:edu/northwestern/at/morphadorner/tools/namedentities/AdornWithNamedEntities.class */
/**
 * Command-line tool that adds named-entity markup to the text of XML documents.
 *
 * <p>Usage: the first argument is the output directory; every remaining
 * argument is an input file name (wildcards are expanded through
 * {@code FileNameUtils.expandFileNameWildcards}). Each input document is
 * parsed, the character data below its {@code text}/{@code TEXT} elements is
 * passed through {@link Annie#adornText}, a list of regular-expression
 * "fixups" loaded from a class-path resource is applied to the serialized
 * result, and the document is written to the output directory under its
 * original file name.
 *
 * <p>All state is held in static fields; the class is not meant to be
 * instantiated and is not designed for concurrent use.
 */
public class AdornWithNamedEntities {
    /** The document currently being processed (set by {@link #processOneFile}). */
    protected static Document document;

    /** Number of leading (non file name) command-line parameters. */
    protected static final int INITPARAMS = 1;

    /** Directory that receives the adorned output files (first command-line argument). */
    protected static String outputDirectory;

    /** Text adorner used to insert the named-entity markup. */
    protected static Annie annie;

    /** Regular expression matching the tag names accepted as a TEI root element. */
    protected static final String teiHeaderPattern = "tei|tei\\.2|TEI|TEI\\.2";

    /** Total number of documents to process in this run. */
    protected static int docsToProcess = 0;

    /** One-based index of the document currently being processed. */
    protected static int currentDocNumber = 0;

    /** Class-path location (relative to this class) of the fixups definition file. */
    protected static String fixupsURL = "resources/fixups.txt";

    /** Pattern replacements applied to the serialized document body, in file order. */
    protected static List<PatternReplacer> fixupsList = ListFactory.createNewList();

    /**
     * Program entry point.
     *
     * @param strArr command-line arguments: output directory followed by one
     *               or more input file names or wildcards
     */
    public static void main(String[] strArr) {
        if (!initialize(strArr)) {
            System.exit(1);
        }
        // Capture the start time BEFORE doing the work so the reported
        // duration reflects the actual processing time.
        long startTime = System.currentTimeMillis();
        int filesProcessed = processFiles(strArr);
        // Round up to whole seconds.
        long elapsedSeconds = ((System.currentTimeMillis() - startTime) + 999) / 1000;
        terminate(filesProcessed, elapsedSeconds);
    }

    /**
     * Validates the command line, loads the fixups and creates the adorner.
     *
     * @param strArr command-line arguments
     * @return {@code true} when initialization succeeded; {@code false} when
     *         too few arguments were given, the fixups could not be loaded, or
     *         the {@link Annie} instance could not be created
     */
    protected static boolean initialize(String[] strArr) {
        // Need the output directory plus at least one input file.
        if (strArr.length < 2) {
            System.out.println("Not enough parameters.");
            return false;
        }
        outputDirectory = strArr[0];
        if (!loadFixups()) {
            return false;
        }
        try {
            annie = new Annie();
        } catch (Exception e) {
            // Without an adorner no document can be processed, so report
            // the failure to the caller instead of continuing with annie == null.
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Loads the fixup definitions from the {@link #fixupsURL} resource into
     * {@link #fixupsList}.
     *
     * <p>Blank lines and lines starting with {@code #} are skipped. Every
     * other line is split on tab characters; lines that yield exactly two
     * fields (pattern, replacement) are added, all others are ignored.
     *
     * @return always {@code true}
     */
    protected static boolean loadFixups() {
        TextFile fixupsFile = new TextFile(AdornWithNamedEntities.class.getResourceAsStream(fixupsURL), "utf-8");
        for (String line : fixupsFile.toArray()) {
            String trimmed = line.trim();
            if (trimmed.length() > 0 && trimmed.charAt(0) != '#') {
                String[] fields = trimmed.split("\t");
                if (fields.length == 2) {
                    fixupsList.add(new PatternReplacer(fields[0], fields[1]));
                }
            }
        }
        return true;
    }

    /**
     * Adorns a single document and writes the result to the output directory.
     *
     * <p>Any exception is caught, its stack trace printed, and a failure
     * message written to standard output; processing of further files is not
     * interrupted.
     *
     * @param str name of the input file
     */
    protected static void processOneFile(String str) {
        currentDocNumber++;
        System.out.println("Processing " + str + " (" + currentDocNumber + "/" + docsToProcess + ")");
        try {
            long parseStart = System.currentTimeMillis();
            document = DOMUtils.parse(str);
            long parseSeconds = ((System.currentTimeMillis() - parseStart) + 999) / 1000;
            System.out.println("   Document loaded and parsed in " + Formatters.formatLongWithCommas(parseSeconds) + StringUtils.pluralize(parseSeconds, " second.", " seconds."));

            List<Node> textElements = DOMUtils.findChildren(findTextNodesParent(document), "text|TEXT");
            long adornStart = System.currentTimeMillis();
            for (Node textElement : textElements) {
                traverse(textElement);
            }

            // Only the part from the header's closing tag onwards is post-processed;
            // everything before it is written back unchanged.
            String[] parts = splitDocumentText(DOMUtils.saveToString(document), "</teiHeader>|</temphead>|</TEMPHEAD>|</tempHead>");
            // NOTE(review): presumably the adorner output contains markup that was
            // escaped when stored as text data; these replacements turn the escaped
            // angle brackets back into real ones -- confirm against Annie.adornText.
            parts[1] = parts[1].replaceAll("&lt;", "<");
            parts[1] = parts[1].replaceAll("&gt;", ">");
            parts[1] = applyFixups(parts[1]);
            String adornedText = parts[0] + parts[1];

            long adornSeconds = ((System.currentTimeMillis() - adornStart) + 999) / 1000;
            System.out.println("   Named entities added in " + Formatters.formatLongWithCommas(adornSeconds) + StringUtils.pluralize(adornSeconds, " second.", " seconds."));

            // The output keeps the input's file name but lives in the output directory.
            String outputPath = new File(outputDirectory, FileNameUtils.stripPathName(str)).getCanonicalPath();
            FileUtils.createPathForFile(outputPath);
            FileUtils.writeTextFile(outputPath, false, adornedText, "utf-8");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("   *** Failed");
        }
    }

    /**
     * Expands the file name arguments and processes every resulting file.
     *
     * @param strArr command-line arguments; element 0 (the output directory)
     *               is skipped, the rest are file names or wildcards
     * @return the number of files after wildcard expansion
     */
    protected static int processFiles(String[] strArr) {
        String[] fileArgs = new String[strArr.length - 1];
        System.arraycopy(strArr, 1, fileArgs, 0, fileArgs.length);
        String[] fileNames = FileNameUtils.expandFileNameWildcards(fileArgs);
        docsToProcess = fileNames.length;
        for (String fileName : fileNames) {
            processOneFile(fileName);
        }
        return fileNames.length;
    }

    /**
     * Prints the closing summary line.
     *
     * @param i number of files processed
     * @param j elapsed processing time in seconds
     */
    protected static void terminate(int i, long j) {
        // "file"/"files" follows the file count, "second"/"seconds" the elapsed time.
        System.out.println("Processed " + Formatters.formatIntegerWithCommas(i) + StringUtils.pluralize(i, " file in ", " files in ") + Formatters.formatLongWithCommas(j) + StringUtils.pluralize(j, " second.", " seconds."));
    }

    /**
     * Recursively visits {@code node} and its descendants, replacing the data
     * of every non-empty text node with its adorned form.
     *
     * <p>Children are visited before the node itself. A text node is left
     * untouched when its data is {@code null} or empty, or when
     * {@link #addNamedEntities} returns {@code null}.
     *
     * @param node root of the subtree to process
     */
    protected static void traverse(Node node) {
        NodeList childNodes = node.getChildNodes();
        if (childNodes != null) {
            for (int i = 0; i < childNodes.getLength(); i++) {
                traverse(childNodes.item(i));
            }
        }
        if (node.getNodeType() != Node.TEXT_NODE) {
            return;
        }
        Text text = (Text) node;
        String data = text.getData();
        if (data == null || data.length() <= 0) {
            return;
        }
        String adorned = addNamedEntities(data);
        if (adorned != null) {
            text.setData(adorned);
        }
    }

    /**
     * Adorns a piece of text by delegating to {@link Annie#adornText}.
     *
     * @param str text to adorn
     * @return the value returned by the adorner
     */
    protected static String addNamedEntities(String str) {
        return annie.adornText(str);
    }

    /**
     * Unescapes XML entities in the serialized text and then applies every
     * loaded fixup in order.
     *
     * <p>{@code &amp;name;} is reduced to {@code &name;}, and
     * {@code &apos;}, {@code &lt;}, {@code &gt;} and {@code &quot;} are
     * replaced by their literal characters before the fixups run.
     *
     * @param str serialized document text
     * @return the text after unescaping and fixups
     */
    protected static String applyFixups(String str) {
        String result = str.replaceAll("&amp;(\\w+);", "&$1;").replaceAll("&apos;", "'").replaceAll("&lt;", "<").replaceAll("&gt;", ">").replaceAll("&quot;", "\"");
        for (PatternReplacer fixup : fixupsList) {
            result = fixup.replace(result);
        }
        return result;
    }

    /**
     * Splits {@code str} in two at the first match of the regular expression
     * {@code str2}.
     *
     * @param str  text to split
     * @param str2 regular expression marking the split point
     * @return a two-element array: element 0 is the text before the match and
     *         element 1 is the text from the start of the match (match
     *         included) to the end; when there is no match, element 0 is the
     *         empty string and element 1 is all of {@code str}
     */
    protected static String[] splitDocumentText(String str, String str2) {
        String[] parts = new String[2];
        Matcher matcher = Pattern.compile(str2).matcher(str);
        if (matcher.find()) {
            int splitAt = matcher.start();
            parts[0] = str.substring(0, splitAt);
            parts[1] = str.substring(splitAt);
        } else {
            parts[0] = "";
            parts[1] = str;
        }
        return parts;
    }

    /**
     * Locates the element whose children are searched for
     * {@code text}/{@code TEXT} elements.
     *
     * <p>Order of preference: a {@code group}/{@code GROUP} child of the
     * root's {@code eebo}/{@code EEBO} child, then that
     * {@code eebo}/{@code EEBO} child itself, then the TEI element (the root
     * when its tag name matches {@link #teiHeaderPattern}, otherwise the
     * root's child matching that pattern).
     *
     * @param document2 parsed document
     * @return the selected element, or {@code null} when none of the
     *         candidates exists
     */
    protected static Node findTextNodesParent(Document document2) {
        Element root = document2.getDocumentElement();
        Element teiElement = root.getTagName().matches(teiHeaderPattern) ? root : DOMUtils.findChild(root, teiHeaderPattern);
        Element eeboElement = DOMUtils.findChild(root, "eebo|EEBO");
        if (eeboElement == null) {
            return teiElement;
        }
        Element groupElement = DOMUtils.findChild(eeboElement, "group|GROUP");
        return groupElement != null ? groupElement : eeboElement;
    }

    /** Not instantiable from outside; all functionality is static. */
    protected AdornWithNamedEntities() {
    }
}
