package org.xmlcml.norma.image.ocr;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import nu.xom.Attribute;
import nu.xom.Element;
import nu.xom.Elements;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.xmlcml.euclid.Real2;
import org.xmlcml.euclid.Real2Range;
import org.xmlcml.graphics.svg.SVGElement;
import org.xmlcml.graphics.svg.SVGG;
import org.xmlcml.graphics.svg.SVGRect;
import org.xmlcml.graphics.svg.SVGSVG;
import org.xmlcml.graphics.svg.SVGText;
import org.xmlcml.graphics.svg.SVGUtil;
import org.xmlcml.graphics.svg.text.SVGPhrase;
import org.xmlcml.graphics.svg.text.SVGWord;
import org.xmlcml.graphics.svg.text.SVGWordBlock;
import org.xmlcml.graphics.svg.text.SVGWordLine;
import org.xmlcml.graphics.svg.text.SVGWordPage;
import org.xmlcml.graphics.svg.text.SVGWordPageList;
import org.xmlcml.graphics.svg.text.SVGWordPara;
import org.xmlcml.html.HtmlBody;
import org.xmlcml.html.HtmlDiv;
import org.xmlcml.html.HtmlElement;
import org.xmlcml.html.HtmlEm;
import org.xmlcml.html.HtmlHead;
import org.xmlcml.html.HtmlHtml;
import org.xmlcml.html.HtmlMeta;
import org.xmlcml.html.HtmlP;
import org.xmlcml.html.HtmlSpan;
import org.xmlcml.html.HtmlStrong;
import org.xmlcml.image.ImageUtil;
import org.xmlcml.norma.editor.SubstitutionEditor;
import org.xmlcml.norma.input.InputReader;
import org.xmlcml.xml.XMLUtil;

/* loaded from: input_file:org/xmlcml/norma/image/ocr/HOCRReader.class */
public class HOCRReader extends InputReader {
    public static final Logger LOG = Logger.getLogger(HOCRReader.class);
    // Class token added to SVG words whose text matches labelPattern (see addWord).
    private static final String POTENTIAL_LABEL = "potential_label";
    // Defaults copied into tesseractSleep / tesseractTries by setDefaults().
    private static final int TESSERACT_SLEEP = 200;
    private static final int TESSERACT_TRIES = 10;
    // Appended after every word by matchPattern() and stripped from captured groups.
    private static final String SEPARATOR = "~";
    private static final String HELVETICA = "helvetica";
    // Fill colours: highlighted (low-confidence or oversized) text, word boxes, line boxes.
    private static final String LOW_CONF_COL = "red";
    private static final String UNEDITED_COL = "green";
    private static final String LINE_COL = "yellow";
    private static final double DEFAULT_FONT_SIZE = 10.0d;
    // Class names given to the generated SVG groups.
    private static final String WORD = "word";
    private static final String LINE = "line";
    private static final String ITALIC = "italic";
    private static final String BOLD = "bold";
    // Values of the hOCR "class" attribute expected at each nesting level.
    private static final String OCR_CAREA = "ocr_carea";
    private static final String OCR_LINE = "ocr_line";
    private static final String OCR_PAGE = "ocr_page";
    private static final String OCR_PAR = "ocr_par";
    private static final String OCRX_WORD = "ocrx_word";
    private static final String CLASS = "class";
    // File-name suffixes for the hOCR output and the SVG derived from it.
    public static final String HOCR = ".hocr";
    public static final String HOCR_HTML = ".hocr.html";
    public static final String HOCR_SVG = ".hocr.svg";
    // Lines and words whose bounding box is not larger than this in both x and y are not rendered.
    private static final double MIN_WIDTH = 4.5d;
    private static final double RECT_OPACITY = 0.2d;
    // Stroke width for the box of a highlighted word; assigned in the static initializer.
    private static final Double LOW_CONF_WIDTH;
    private static final double MAX_FONT_SIZE = 30.0d;
    // The parsed hOCR input and the SVG / HTML structures built from it.
    private HtmlElement hocrHtmlElement;
    private SVGSVG svgSvg;
    private HtmlBody newBody;
    private HtmlHtml rawHtml;
    private HtmlHead rawHead;
    private HtmlBody rawBody;
    private String title;
    private List<HtmlMeta> metaList;
    // Set by setDefaults(); not read anywhere in this class.
    private long tesseractSleep;
    private int tesseractTries;
    // Upper limit applied to font sizes derived from bounding-box heights.
    private double maxFontSize;
    // Optional pattern (see labelSubImages) marking words as potential labels.
    private Pattern labelPattern;
    // Border added around an image before OCR; applied only when x or y is > 0 (see addMargins).
    private int imageMarginX = 0;
    private int imageMarginY = 0;
    private int marginColor = -1;
    // Results cached or stored by the corresponding getOrCreate... methods.
    private List<HOCRLabel> potentialLabelList;
    private List<HOCRText> potentialTextList;
    private List<HOCRPhrase> potentialPhraseList;
    private Real2Range wordJoiningBox;
    private List<SVGWordLine> wordLineList;
    private List<SVGPhrase> allPhraseList;
    // Created lazily by ensureSubstitutionManager().
    private SubstitutionEditor substitutionManager;

    /** @return the horizontal margin passed to ImageUtil.addBorders by addMargins */
    public int getImageMarginX() {
        return this.imageMarginX;
    }

    /** @param i horizontal margin; addMargins adds borders only when this or the Y margin is positive */
    public void setImageMarginX(int i) {
        this.imageMarginX = i;
    }

    /** @return the vertical margin passed to ImageUtil.addBorders by addMargins */
    public int getImageMarginY() {
        return this.imageMarginY;
    }

    /** @param i vertical margin; addMargins adds borders only when this or the X margin is positive */
    public void setImageMarginY(int i) {
        this.imageMarginY = i;
    }

    /** @return the colour value handed to ImageUtil.addBorders (initially -1) */
    public int getMarginColor() {
        return this.marginColor;
    }

    /** @param i colour value for the margin; presumably a packed RGB int -- TODO confirm against ImageUtil.addBorders */
    public void setMarginColor(int i) {
        this.marginColor = i;
    }

    /** Creates a reader with no document loaded and default settings. */
    public HOCRReader() {
        setup();
    }

    /** Clears document state, then applies the default settings. */
    private void setup() {
        clearVariables();
        setDefaults();
    }

    /** Initialises the tunable settings from the class-level default constants. */
    private void setDefaults() {
        this.maxFontSize = MAX_FONT_SIZE;
        this.tesseractTries = TESSERACT_TRIES;
        // int constant widens to the long field; same value as the literal 200L
        this.tesseractSleep = TESSERACT_SLEEP;
    }

    /**
     * Discards everything derived from a previously read hOCR document so the
     * reader can be reused for another one.
     *
     * <p>Configuration (margins, maximum font size, label pattern, joining box,
     * substitution manager) is left untouched. All cached result lists are
     * dropped, not just the label list: the {@code getOrCreate...} methods return
     * a non-null cache as-is, so a list left over from an earlier image would
     * otherwise be served for the next one.
     */
    public void clearVariables() {
        this.hocrHtmlElement = null;
        this.svgSvg = null;
        this.newBody = null;
        this.rawHtml = null;
        this.rawHead = null;
        this.rawBody = null;
        this.title = null;
        this.metaList = null;
        this.potentialLabelList = null;
        this.potentialTextList = null;
        this.potentialPhraseList = null;
        this.wordLineList = null;
        this.allPhraseList = null;
    }

    /**
     * Reads hOCR markup from a stream (decoded as UTF-8), parses it with the DTD
     * stripped, and immediately builds the SVG and HTML representations.
     *
     * <p>The stream is read but not closed here; closing is left to the caller.
     *
     * @param inputStream source of the hOCR document
     * @throws IOException if the stream cannot be read
     */
    public void readHOCR(InputStream inputStream) throws IOException {
        String rawXml = IOUtils.toString(inputStream, "UTF-8");
        HtmlElement parsed = HtmlElement.create(XMLUtil.stripDTDAndParse(rawXml));
        readHOCR(parsed);
        applyUniversalSubstitutions();
        processHTMLAndCreateSVG();
    }

    /** Hook called after parsing; currently only delegates to the (empty) ensure step. */
    private void applyUniversalSubstitutions() {
        ensureUniversalSubstitutions();
    }

    /** Placeholder: intentionally empty in this version, so no substitutions are set up here. */
    private void ensureUniversalSubstitutions() {
    }

    /**
     * Installs an already parsed hOCR document as the current input.
     *
     * <p>State derived from any earlier document is cleared first. Without that,
     * {@link #getOrCreateSVG()} would see a non-null SVG from the previous
     * document and return it instead of processing the new element.
     *
     * @param htmlElement root of the hOCR document; only an {@code HtmlHtml} root
     *        is used as the raw HTML, any other element leaves it {@code null}
     */
    public void readHOCR(HtmlElement htmlElement) {
        clearVariables();
        this.hocrHtmlElement = htmlElement;
        this.rawHtml = htmlElement instanceof HtmlHtml ? (HtmlHtml) htmlElement : null;
    }

    /** @return the hOCR root element last passed to readHOCR, or null if none is loaded */
    public HtmlElement getHocrElement() {
        return this.hocrHtmlElement;
    }

    /**
     * Returns the SVG built from the current hOCR document, building it on first
     * use. When a label pattern is set, the potential-label list is populated too.
     *
     * @return the SVG root, or {@code null} when no hOCR document has been read
     */
    public SVGElement getOrCreateSVG() {
        if (this.svgSvg == null && this.hocrHtmlElement != null) {
            processHTMLAndCreateSVG();
        }
        // Only query for labels when there is an SVG to query: with a label pattern
        // set but nothing read yet, the original passed null into the XPath search.
        if (this.labelPattern != null && this.svgSvg != null) {
            getOrCreatePotentialLabelElements(this.svgSvg);
        }
        return this.svgSvg;
    }

    /**
     * Ensures the SVG/HTML pair has been built, then returns the HTML half.
     *
     * @return the rebuilt HTML body, or null if nothing has been built
     */
    public HtmlBody getOrCreateHtmlBody() {
        getOrCreateSVG();
        return this.newBody;
    }

    /** Extracts head and body from the raw HTML, then builds the SVG and the new HTML body. */
    private void processHTMLAndCreateSVG() {
        processHead();
        processBody();
        createSVGAndHTML();
    }

    /**
     * Captures the head of the raw HTML and, when there is one, its title and
     * meta elements. A missing document or head leaves title and meta list as
     * they were.
     */
    private void processHead() {
        if (this.rawHtml == null) {
            this.rawHead = null;
            return;
        }
        this.rawHead = this.rawHtml.getHead();
        if (this.rawHead == null) {
            return;
        }
        this.title = this.rawHead.getTitle();
        this.metaList = this.rawHead.getMetaElements();
    }

    /**
     * Captures the body of the raw HTML.
     *
     * <p>Null-safe in the same way as {@code processHead}: readHOCR(HtmlElement)
     * leaves {@code rawHtml} null for a root that is not an {@code HtmlHtml}, and
     * that case previously failed here with a NullPointerException.
     */
    private void processBody() {
        this.rawBody = this.rawHtml == null ? null : this.rawHtml.getBody();
    }

    /** @return the head captured by processHead, or null if there was none */
    public HtmlHead getHead() {
        return this.rawHead;
    }

    /** @return the meta elements captured from the head; null when no head has been processed */
    public List<HtmlMeta> getMetaElements() {
        return this.metaList;
    }

    /**
     * Looks up the content of the {@code ocr-system} meta element.
     *
     * @return the meta content, or {@code null} when there is no such element or
     *         no meta elements have been captured (nothing read yet, or the
     *         document has no head)
     */
    public String getTesseractVersion() {
        if (this.metaList == null) {
            // metaList stays null until a head with meta elements has been processed
            return null;
        }
        for (HtmlMeta htmlMeta : this.metaList) {
            if ("ocr-system".equals(htmlMeta.getName())) {
                return htmlMeta.getContent();
            }
        }
        return null;
    }

    /**
     * Creates a fresh SVG root and HTML body and fills them with one page per
     * {@code ocr_page} div found directly under the raw body.
     *
     * <p>A missing body produces an empty SVG/HTML pair instead of a
     * NullPointerException, and the {@code tesseractVersion} attribute is only
     * written when a version is known.
     *
     * @throws RuntimeException if a child of the body is not a div of class
     *         {@code ocr_page}
     */
    private void createSVGAndHTML() {
        this.svgSvg = new SVGSVG();
        this.svgSvg.setFontFamily(HELVETICA);
        this.svgSvg.setFontSize(Double.valueOf(DEFAULT_FONT_SIZE));
        this.newBody = new HtmlBody();
        String tesseractVersion = getTesseractVersion();
        if (tesseractVersion != null) {
            // NOTE(review): a null attribute value is not usable in XML; confirm
            // how nu.xom.Attribute treats it before relying on the old behaviour.
            this.svgSvg.addAttribute(new Attribute("tesseractVersion", tesseractVersion));
        }
        if (this.rawBody == null) {
            return;
        }
        Elements childElements = this.rawBody.getChildElements();
        if (childElements.size() == 0) {
            return;
        }
        SVGWordPageList pageList = new SVGWordPageList();
        this.svgSvg.appendChild(pageList);
        for (int i = 0; i < childElements.size(); i++) {
            Element element = childElements.get(i);
            if (!(element instanceof HtmlDiv)) {
                throw new RuntimeException("unknown element " + element.toXML());
            }
            HtmlDiv pageDiv = (HtmlDiv) element;
            if (!OCR_PAGE.equals(pageDiv.getClassAttribute())) {
                throw new RuntimeException("unknown div " + pageDiv.toXML());
            }
            HtmlSVG page = createPageFromTesseract(pageDiv);
            pageList.appendChild(page.svg);
            this.newBody.appendChild(page.html);
        }
    }

    /**
     * Converts one {@code ocr_page} div into a paired HTML div and SVG page,
     * recursing into each {@code ocr_carea} child.
     *
     * @param htmlDiv the hOCR page div
     * @return the HTML/SVG pair for the page
     * @throws RuntimeException if a child is not a div of class {@code ocr_carea}
     */
    private HtmlSVG createPageFromTesseract(HtmlDiv htmlDiv) {
        // A null div fails in copyAttributes below either way, so the page can be
        // created unconditionally.
        SVGWordPage svgPage = new SVGWordPage();
        HtmlDiv htmlPage = new HtmlDiv();
        copyAttributes(htmlDiv, svgPage);
        XMLUtil.copyAttributes(htmlDiv, htmlPage);
        HtmlSVG pagePair = new HtmlSVG(htmlPage, svgPage);
        Elements areas = htmlDiv.getChildElements();
        int areaCount = areas.size();
        for (int index = 0; index < areaCount; index++) {
            Element child = areas.get(index);
            if (!(child instanceof HtmlDiv)) {
                throw new RuntimeException("unknown element " + child.toXML());
            }
            HtmlDiv areaDiv = (HtmlDiv) child;
            String areaClass = areaDiv.getClassAttribute();
            if (!OCR_CAREA.equals(areaClass)) {
                throw new RuntimeException("unknown div " + areaDiv.toXML());
            }
            HtmlSVG blockPair = createBlockFromTesseract(areaDiv);
            svgPage.appendChild(blockPair.svg);
            htmlPage.appendChild(blockPair.html);
        }
        return pagePair;
    }

    /**
     * Converts one {@code ocr_carea} div into a paired HTML div (class
     * {@code block}) and SVG word block, recursing into each {@code ocr_par}
     * paragraph.
     *
     * @param htmlDiv the hOCR content-area div
     * @return the HTML/SVG pair for the block
     * @throws RuntimeException if a child is not a paragraph of class {@code ocr_par}
     */
    private HtmlSVG createBlockFromTesseract(HtmlDiv htmlDiv) {
        HtmlDiv htmlBlock = new HtmlDiv();
        XMLUtil.copyAttributes(htmlDiv, htmlBlock);
        htmlBlock.setClassAttribute("block");
        // A null div fails in copyAttributes below either way, so the block can
        // be created unconditionally.
        SVGWordBlock svgBlock = new SVGWordBlock();
        copyAttributes(htmlDiv, svgBlock);
        HtmlSVG blockPair = new HtmlSVG(htmlBlock, svgBlock);
        Elements paragraphs = htmlDiv.getChildElements();
        int paragraphCount = paragraphs.size();
        for (int index = 0; index < paragraphCount; index++) {
            Element child = paragraphs.get(index);
            if (!(child instanceof HtmlP)) {
                throw new RuntimeException("unknown element " + child.toXML());
            }
            HtmlP paragraph = (HtmlP) child;
            String paragraphClass = paragraph.getClassAttribute();
            if (!OCR_PAR.equals(paragraphClass)) {
                throw new RuntimeException("unknown div " + paragraph.toXML());
            }
            HtmlSVG paraPair = createParFromTesseract(paragraph);
            svgBlock.appendChild(paraPair.svg);
            htmlBlock.appendChild(paraPair.html);
        }
        return blockPair;
    }

    /**
     * Converts one {@code ocr_par} paragraph into a paired HTML paragraph and SVG
     * word paragraph, recursing into each {@code ocr_line} span.
     *
     * @param htmlP the hOCR paragraph
     * @return the HTML/SVG pair for the paragraph
     * @throws RuntimeException if a child is not a span of class {@code ocr_line}
     */
    private HtmlSVG createParFromTesseract(HtmlP htmlP) {
        // A null paragraph fails in copyAttributes below either way, so the SVG
        // paragraph can be created unconditionally.
        SVGWordPara svgPara = new SVGWordPara();
        copyAttributes(htmlP, svgPara);
        HtmlP htmlPara = new HtmlP();
        XMLUtil.copyAttributes(htmlP, htmlPara);
        HtmlSVG paraPair = new HtmlSVG(htmlPara, svgPara);
        Elements lines = htmlP.getChildElements();
        int lineCount = lines.size();
        for (int index = 0; index < lineCount; index++) {
            Element child = lines.get(index);
            if (!(child instanceof HtmlSpan)) {
                throw new RuntimeException("unknown element " + child.getClass() + " / " + child.toXML());
            }
            HtmlSpan lineSpan = (HtmlSpan) child;
            String lineClass = lineSpan.getClassAttribute();
            if (!OCR_LINE.equals(lineClass)) {
                throw new RuntimeException("unknown span " + lineSpan.toXML());
            }
            HtmlSVG linePair = createLineFromTesseract(lineSpan);
            svgPara.appendChild(linePair.svg);
            htmlPara.appendChild(linePair.html);
        }
        return paraPair;
    }

    /**
     * Converts one {@code ocr_line} span into a paired HTML span and SVG word
     * line.
     *
     * <p>The line is only populated when its bounding box (taken from the span's
     * title) is larger than {@code MIN_WIDTH} in both directions; otherwise the
     * empty pair is returned. The font size is the box height, capped at the
     * configured maximum; a capped line marks all of its words as large text.
     * Child spans that are not {@code ocrx_word} are logged and skipped.
     *
     * @param htmlSpan the hOCR line span
     * @return the HTML/SVG pair for the line
     */
    private HtmlSVG createLineFromTesseract(HtmlSpan htmlSpan) {
        // Declared as SVGWordLine (not SVGG) because addWord requires that type.
        SVGWordLine svgLine = new SVGWordLine();
        copyAttributes(htmlSpan, svgLine);
        HtmlSpan htmlLine = new HtmlSpan();
        XMLUtil.copyAttributes(htmlSpan, htmlLine);
        HtmlSVG linePair = new HtmlSVG(htmlLine, svgLine);
        HOCRTitle lineTitle = new HOCRTitle(htmlSpan.getTitle());
        Real2Range boundingBox = lineTitle.getBoundingBox();
        if (!(boundingBox.getXRange().getRange() > MIN_WIDTH && boundingBox.getYRange().getRange() > MIN_WIDTH)) {
            return linePair;
        }
        lineTitle.addAttributes(svgLine);
        SVGRect lineRect = SVGRect.createFromReal2Range(boundingBox);
        lineRect.setFill(LINE_COL);
        lineRect.setOpacity(RECT_OPACITY);
        svgLine.appendChild(lineRect);
        // Measured after the rectangle has been appended, as in the original.
        double fontSize = svgLine.getBoundingBox().getYRange().getRange();
        boolean largeText = false;
        if (fontSize > getMaxFontSize()) {
            LOG.trace("largeText " + fontSize);
            fontSize = getMaxFontSize();
            largeText = true;
        }
        svgLine.setFontSize(Double.valueOf(fontSize));
        svgLine.setClassName(LINE);
        Elements childElements = htmlSpan.getChildElements();
        for (int i = 0; i < childElements.size(); i++) {
            Element element = childElements.get(i);
            if (!(element instanceof HtmlSpan)) {
                continue;
            }
            HtmlSpan wordSpan = (HtmlSpan) element;
            String classAttribute = wordSpan.getClassAttribute();
            if (OCRX_WORD.equals(classAttribute)) {
                addWord(svgLine, htmlLine, largeText, wordSpan);
            } else {
                LOG.debug("omitted attribute: " + classAttribute);
            }
        }
        return linePair;
    }

    /**
     * Builds the HTML/SVG pair for one word span and attaches it to the line.
     *
     * <p>A word consisting only of whitespace gets a visible marker, and a word
     * whose trimmed text matches the label pattern has the
     * {@code potential_label} token added to its class attribute.
     *
     * @param sVGWordLine SVG line receiving the word
     * @param htmlSpan HTML line receiving the word
     * @param z whether the enclosing line was flagged as large text
     * @param htmlSpan2 the hOCR word span to convert
     */
    private void addWord(SVGWordLine sVGWordLine, HtmlSpan htmlSpan, boolean z, HtmlSpan htmlSpan2) {
        HtmlSVG wordPair = createWordFromTesseract(htmlSpan2);
        wordPair.setLargeText(z);
        sVGWordLine.appendChild(wordPair.svg);

        String rawText = htmlSpan2.getValue();
        String trimmedText = rawText.trim();
        boolean whitespaceOnly = trimmedText.length() == 0 && rawText.length() > 0;
        if (whitespaceOnly) {
            addSpaceMarker(sVGWordLine, htmlSpan2, wordPair);
        }

        if (fitsRegex(trimmedText)) {
            String existingClass = wordPair.svg.getAttributeValue(CLASS);
            String mergedClass;
            if (existingClass == null) {
                mergedClass = POTENTIAL_LABEL;
            } else {
                mergedClass = existingClass + " " + POTENTIAL_LABEL;
            }
            wordPair.svg.addAttribute(new Attribute(CLASS, mergedClass));
        }
        htmlSpan.appendChild(wordPair.html);
    }

    /**
     * Adds a literal "SPACE" text marker at the first corner of the word's
     * bounding box, so whitespace-only words are visible in the SVG. Nothing is
     * added when the word has no bounding box.
     *
     * @param sVGWordLine the enclosing line (used for tracing only)
     * @param htmlSpan the hOCR word span (used for tracing only)
     * @param htmlSVG the word pair whose SVG receives the marker
     */
    private void addSpaceMarker(SVGWordLine sVGWordLine, HtmlSpan htmlSpan, HtmlSVG htmlSVG) {
        LOG.trace("SPACE...");
        LOG.trace("span " + htmlSpan.toXML());
        LOG.trace(sVGWordLine.toXML());
        Real2Range wordBox = htmlSVG.svg.getBoundingBox();
        if (wordBox == null) {
            return;
        }
        SVGText marker = new SVGText(wordBox.getCorners()[0], "SPACE");
        marker.setFontSize(Double.valueOf(15.0d));
        htmlSVG.svg.appendChild(marker);
        LOG.trace(htmlSVG.svg.toXML());
    }

    /**
     * Tests whether the whole string matches the configured label pattern.
     *
     * @param str candidate text; may be null
     * @return true only when both the text and the pattern are non-null and the
     *         pattern matches the entire text
     */
    private boolean fitsRegex(String str) {
        if (str == null || this.labelPattern == null) {
            return false;
        }
        if (!this.labelPattern.matcher(str).matches()) {
            return false;
        }
        LOG.trace("matches: " + str);
        return true;
    }

    /** Copies the attributes carried over from hOCR to SVG; currently only {@code id}. */
    private static void copyAttributes(HtmlElement htmlElement, SVGG svgg) {
        copyAttribute("id", htmlElement, svgg);
    }

    /**
     * Copies a single named attribute from the HTML element to the SVG group
     * when the HTML element has it; does nothing otherwise.
     *
     * @param str attribute name
     * @param htmlElement source element
     * @param svgg target group
     */
    private static void copyAttribute(String str, HtmlElement htmlElement, SVGG svgg) {
        String value = htmlElement.getAttributeValue(str);
        if (value == null) {
            return;
        }
        svgg.addAttribute(new Attribute(str, value));
    }

    /**
     * Converts one {@code ocrx_word} span into a paired HTML span and SVG word.
     *
     * <p>The pair is left empty unless the word's bounding box is larger than
     * {@code MIN_WIDTH} in both directions and its text is not blank. Otherwise
     * the word gets a box rectangle and a text element holding the text after
     * substitutions, sized from the box height (capped at the maximum font size).
     * Words that were capped, or whose confidence is below 50, are highlighted:
     * coloured text with opacity scaled by the confidence (100 when unknown) and
     * a wider box stroke.
     *
     * @param htmlSpan the hOCR word span
     * @return the HTML/SVG pair for the word
     * @throws RuntimeException if the span has more than one child element
     */
    private HtmlSVG createWordFromTesseract(HtmlSpan htmlSpan) {
        LOG.trace("createWordFromTesseract");
        SVGWord svgWord = new SVGWord();
        copyAttributes(htmlSpan, svgWord);
        HtmlSpan htmlWord = new HtmlSpan();
        HtmlSVG wordPair = new HtmlSVG(htmlWord, svgWord);
        HOCRTitle wordTitle = new HOCRTitle(htmlSpan.getTitle());
        ensureSubstitutionManager();
        Real2Range wordBox = wordTitle.getBoundingBox();

        boolean wideEnough = wordBox.getXRange().getRange() > MIN_WIDTH;
        if (!wideEnough || !(wordBox.getYRange().getRange() > MIN_WIDTH)) {
            return wordPair;
        }
        String rawText = htmlSpan.getValue();
        if (rawText.trim().length() == 0) {
            return wordPair;
        }

        double fontHeight = wordBox.getYRange().getRange();
        SVGRect wordRect = SVGRect.createFromReal2Range(wordBox);
        wordRect.setFill(UNEDITED_COL);
        wordRect.setOpacity(RECT_OPACITY);
        svgWord.appendChild(wordRect);
        svgWord.setClassName(WORD);

        // At most one style wrapper (strong/em) is supported per word.
        Elements styleChildren = htmlSpan.getChildElements();
        int styleCount = styleChildren.size();
        if (styleCount > 1) {
            throw new RuntimeException("multiple styles in word");
        }

        String editedText = this.substitutionManager.applySubstitutions(svgWord, rawText, wordRect);
        boolean highlight = fontHeight > getMaxFontSize();
        if (highlight) {
            fontHeight = getMaxFontSize();
        }
        SVGText wordText = createTextElement(wordBox, editedText, fontHeight);
        htmlWord.setValue(editedText);
        svgWord.appendChild(wordText);

        Integer confidence = wordTitle.getWConf();
        if (confidence != null && confidence.intValue() < 50) {
            highlight = true;
        }
        if (highlight) {
            int effectiveConfidence = confidence == null ? 100 : confidence.intValue();
            wordText.setOpacity(effectiveConfidence * 0.007d);
            wordText.setFill(LOW_CONF_COL);
            wordRect.setStrokeWidth(LOW_CONF_WIDTH);
        }
        wordTitle.addAttributes(svgWord);
        addStylesFromStrongEm(styleChildren, styleCount, wordText);
        return wordPair;
    }

    /** Lazily creates the substitution editor on first use. */
    private void ensureSubstitutionManager() {
        if (this.substitutionManager == null) {
            this.substitutionManager = new SubstitutionEditor();
        }
    }

    /**
     * Creates an attribute-free SVG text element for a word, positioned and
     * sized from the word's bounding box.
     *
     * <p>The font size is the box height divided by a factor that depends on the
     * glyph shapes reported by {@code SimpleFontMetrics}: 1 + descender fraction
     * when the text has descenders, 1 when it only has ascenders, and
     * 0.7692307692307692 when it has neither. The baseline is placed below the
     * box's first corner by the height reduced by the descender fraction.
     *
     * @param real2Range bounding box of the word
     * @param str text content
     * @param d box height to derive the font size from
     * @return the new text element
     */
    private static SVGText createTextElement(Real2Range real2Range, String str, double d) {
        SimpleFontMetrics metrics = new SimpleFontMetrics(str);
        double descender = metrics.getDescenderFraction();

        double heightFactor;
        if (metrics.hasDescenders()) {
            heightFactor = 1.0d + descender;
        } else if (metrics.hasAscenders()) {
            heightFactor = 1.0d;
        } else {
            // neither ascenders nor descenders
            heightFactor = 0.7692307692307692d;
        }

        SVGText text = new SVGText();
        text.removeAttributes();
        text.setFontSize(Double.valueOf(d / heightFactor));
        Real2 baselineOffset = new Real2(0.0d, d * (1.0d - descender));
        text.setXY(real2Range.getCorners()[0].plus(baselineOffset));
        text.appendChild(str);
        return text;
    }

    /**
     * Applies the style implied by a word's wrapper element: {@code strong}
     * gives "bold", {@code em} gives "italic". Only the first child is looked
     * at, and nothing happens when there are no children.
     *
     * @param elements child elements of the word span
     * @param i number of child elements
     * @param sVGText text element to style
     * @throws RuntimeException if the first child is neither strong nor em
     */
    private static void addStylesFromStrongEm(Elements elements, int i, SVGText sVGText) {
        if (i <= 0) {
            return;
        }
        Element wrapper = elements.get(0);
        if (wrapper instanceof HtmlStrong) {
            // NOTE(review): bold is set through setFontStyle, not a font-weight setter -- confirm intended
            sVGText.setFontStyle(BOLD);
            return;
        }
        if (wrapper instanceof HtmlEm) {
            sVGText.setFontStyle(ITALIC);
            return;
        }
        throw new RuntimeException("unknown element " + wrapper.getClass() + " / " + wrapper.toXML());
    }

    /**
     * Collects the {@code ocr_line} spans of the rebuilt HTML body whose text is
     * not blank.
     *
     * <p>The query result is iterated as plain elements and narrowed explicitly,
     * so the returned list really contains only {@code HtmlSpan} instances.
     *
     * @return the non-blank line spans in document order; empty (never null)
     *         when no document has been read
     */
    public List<HtmlSpan> getNonEmptyLines() {
        List<HtmlSpan> nonEmptyLines = new ArrayList<HtmlSpan>();
        HtmlBody body = getOrCreateHtmlBody();
        if (body == null) {
            // nothing read yet: there is no body to query
            return nonEmptyLines;
        }
        for (Element line : XMLUtil.getQueryElements(body, ".//*[local-name()='span' and @class='ocr_line']")) {
            if (line instanceof HtmlSpan && line.getValue().trim().length() != 0) {
                nonEmptyLines.add((HtmlSpan) line);
            }
        }
        return nonEmptyLines;
    }

    /**
     * Runs OCR on an image and writes the resulting SVG next to it.
     *
     * <p>The image is written to {@code <str2>.<str>} in the given directory
     * after the reader's document state has been cleared.
     *
     * @param file output directory
     * @param str image format name, also used as the file extension
     * @param bufferedImage image to process
     * @param str2 base name for all generated files
     * @throws Exception if writing, conversion or parsing fails
     */
    public void createHTMLandSVG(File file, String str, BufferedImage bufferedImage, String str2) throws Exception {
        clearVariables();
        String imageFileName = str2 + "." + str;
        File imageFile = new File(file, imageFileName);
        createHTMLandSVG(file, str, bufferedImage, str2, imageFile);
    }

    /**
     * Runs OCR on a named image and writes the resulting SVG next to it, using
     * the image's key as the base file name.
     *
     * @param file output directory
     * @param str image format name, also used as the file extension
     * @param namedImage image together with its key
     * @throws Exception if writing, conversion or parsing fails
     */
    public void createHTMLandSVG(File file, String str, NamedImage namedImage) throws Exception {
        String baseName = namedImage.getKey();
        String imageFileName = baseName + "." + str;
        File imageFile = new File(file, imageFileName);
        clearVariables();
        BufferedImage image = namedImage.getImage();
        createHTMLandSVG(file, str, image, baseName, imageFile);
    }

    /**
     * Writes the (optionally bordered) image to disk, converts it to hOCR, reads
     * the hOCR back, derives text, label and phrase elements plus a grid, and
     * writes the SVG as {@code <str2>.hocr.svg}.
     *
     * <p>Both file streams are closed here. The image stream is closed before
     * the converter is invoked, so the converter never reads a file that this
     * process still holds open. Nothing further happens when the converter
     * returns no hOCR file.
     *
     * @param file output directory
     * @param str image format name
     * @param bufferedImage image to process
     * @param str2 base name for generated files
     * @param file2 file the image is written to
     * @throws Exception if writing, conversion or parsing fails
     */
    private void createHTMLandSVG(File file, String str, BufferedImage bufferedImage, String str2, File file2) throws Exception {
        BufferedImage imageWithMargins = addMargins(bufferedImage);
        // ImageIO.write does not close a caller-supplied stream.
        FileOutputStream imageOut = new FileOutputStream(file2);
        try {
            ImageIO.write(imageWithMargins, str, imageOut);
        } finally {
            imageOut.close();
        }
        File hocrFile = new ImageToHOCRConverter().convertImageToHOCR(file2, new File(file, str2 + HOCR));
        if (hocrFile == null) {
            return;
        }
        FileInputStream hocrIn = new FileInputStream(hocrFile);
        try {
            readHOCR(hocrIn);
        } finally {
            IOUtils.closeQuietly(hocrIn);
        }
        SVGElement svg = getOrCreateSVG();
        getOrCreatePotentialTextElements(svg);
        List<HOCRLabel> labels = getOrCreatePotentialLabelElements(svg);
        getOrCreatePotentialPhraseElements(svg);
        new GridExtractor(new Real2(8.0d, 8.0d)).deduceGrid(labels);
        SVGSVG.wrapAndWriteAsSVG(svg, new File(file, str2 + HOCR_SVG));
    }

    /**
     * Surrounds the image with the configured border when at least one margin is
     * positive; otherwise returns the image unchanged.
     *
     * @param bufferedImage image to border
     * @return the bordered image, or the input itself when both margins are &lt;= 0
     */
    private BufferedImage addMargins(BufferedImage bufferedImage) {
        boolean noMargins = this.imageMarginX <= 0 && this.imageMarginY <= 0;
        if (noMargins) {
            return bufferedImage;
        }
        return ImageUtil.addBorders(bufferedImage, this.imageMarginX, this.imageMarginY, this.marginColor);
    }

    /**
     * Returns the labels built from every {@code g} element whose class contains
     * {@code potential_label}, creating and caching them on first call. Each
     * such group also receives a translucent magenta box.
     *
     * <p>Query results are handled as generic SVG elements and narrowed
     * explicitly; anything that is not an {@code SVGG} is logged and skipped
     * instead of being cast. The cache is only published once fully built, so a
     * failure part-way through does not leave a partial list behind.
     *
     * @param sVGElement element to search; note the XPath starts with {@code //}
     * @return the cached list of potential labels
     */
    public List<HOCRLabel> getOrCreatePotentialLabelElements(SVGElement sVGElement) {
        if (this.potentialLabelList == null) {
            List<HOCRLabel> labels = new ArrayList<HOCRLabel>();
            List<SVGElement> candidates = SVGUtil.getQuerySVGElements(sVGElement, "//*[local-name()='g' and contains(@class, 'potential_label')]");
            for (SVGElement candidate : candidates) {
                if (!(candidate instanceof SVGG)) {
                    LOG.error("expected g element, found: " + candidate.toXML());
                    continue;
                }
                addDecorativeBoxToPotentialLabel(candidate);
                labels.add(new HOCRLabel((SVGG) candidate));
            }
            this.potentialLabelList = labels;
        }
        return this.potentialLabelList;
    }

    /**
     * Rebuilds the phrase list from the line groups under the given element.
     *
     * <p>For each {@code g} whose class contains {@code line}, the direct child
     * word groups are scanned in order, skipping words marked as potential
     * labels. The first remaining word starts the line's single phrase; every
     * later word is offered to {@code joinWords} for that same phrase. Unlike
     * the other getOrCreate methods this one always recomputes.
     *
     * @param sVGElement element to search; note the XPath starts with {@code //}
     * @return the newly built phrase list
     */
    public List<HOCRPhrase> getOrCreatePotentialPhraseElements(SVGElement sVGElement) {
        List<SVGElement> lineGroups = SVGUtil.getQuerySVGElements(sVGElement, "//*[local-name()='g' and contains(@class, 'line')]");
        this.potentialPhraseList = new ArrayList<HOCRPhrase>();
        for (SVGElement lineGroup : lineGroups) {
            List<SVGElement> wordGroups = SVGUtil.getQuerySVGElements(lineGroup, "*[local-name()='g' and contains(@class,'word')]");
            List<HOCRPhrase> linePhrases = new ArrayList<HOCRPhrase>();
            HOCRPhrase currentPhrase = null;
            for (SVGElement wordGroup : wordGroups) {
                SVGG word = (SVGG) wordGroup;
                String classValue = word.getAttributeValue(CLASS);
                if (classValue != null && classValue.contains(POTENTIAL_LABEL)) {
                    continue;
                }
                HOCRText wordText = new HOCRText(word);
                if (currentPhrase != null) {
                    // the phrase being extended is also the one compared against
                    joinWords(currentPhrase, currentPhrase, word, wordText);
                    continue;
                }
                currentPhrase = new HOCRPhrase(word);
                LOG.trace("new: " + (wordText.getText() == null ? null : wordText.getText().getText()));
                linePhrases.add(currentPhrase);
            }
            for (HOCRPhrase phrase : linePhrases) {
                LOG.trace(">phrase>" + phrase.toString());
            }
            this.potentialPhraseList.addAll(linePhrases);
        }
        return this.potentialPhraseList;
    }

    /**
     * Appends a word to a phrase when the horizontal gap between the end of the
     * previous phrase and the start of the word is acceptable.
     *
     * <p>Nothing is added when the previous phrase has no box end, when the word
     * starts before that end (logged as an error), or when
     * {@code HOCRText.isWordInPhrase} rejects the gap for the mean font size.
     *
     * @param hOCRPhrase phrase whose box end and font size are compared against
     * @param hOCRPhrase2 phrase that receives the word
     * @param svgg SVG group of the word
     * @param hOCRText text wrapper of the same word
     */
    private void joinWords(HOCRPhrase hOCRPhrase, HOCRPhrase hOCRPhrase2, SVGG svgg, HOCRText hOCRText) {
        Double previousEnd = hOCRPhrase.getBoxEnd();
        LOG.trace(">>" + (hOCRPhrase.getBboxRect() == null ? null : hOCRPhrase.getBboxRect().toXML()));
        if (previousEnd == null) {
            return;
        }
        LOG.trace(">>" + hOCRText.getBboxRect().toXML());
        double wordStart = hOCRText.getBoxStart().doubleValue();
        Double gap = Double.valueOf(wordStart - previousEnd.doubleValue());
        LOG.trace(wordStart + " - " + previousEnd);
        if (gap.doubleValue() < 0.0d) {
            LOG.error("previous overlaps this");
            return;
        }
        boolean belongsToPhrase = HOCRText.isWordInPhrase(gap, HOCRChunk.getMeanSize(hOCRPhrase.getFontSize(), hOCRText.getFontSize()), 0.0d, 2.0d);
        if (belongsToPhrase) {
            hOCRPhrase2.add(svgg);
            LOG.trace("added " + hOCRText.getText().getText() + " :" + hOCRPhrase2.getText().getText() + ":");
        } else {
            LOG.trace("didn't add: " + hOCRText.getText().getText() + " to " + hOCRPhrase2.getText().getText());
        }
    }

    /**
     * Returns text wrappers for every {@code g} element that has a {@code text}
     * child and is not marked as a potential label, creating and caching them on
     * first call.
     *
     * <p>Query results are handled as generic SVG elements and narrowed
     * explicitly; anything that is not an {@code SVGG} is logged and skipped.
     * The cache is only published once fully built.
     *
     * @param sVGElement element to search; note the XPath starts with {@code //}
     * @return the cached list of potential text elements
     */
    public List<HOCRText> getOrCreatePotentialTextElements(SVGElement sVGElement) {
        if (this.potentialTextList == null) {
            List<HOCRText> texts = new ArrayList<HOCRText>();
            List<SVGElement> candidates = SVGUtil.getQuerySVGElements(sVGElement, "//*[local-name()='g' and not(contains(@class, 'potential_label')) and *[local-name()='text']]");
            for (SVGElement candidate : candidates) {
                if (candidate instanceof SVGG) {
                    texts.add(new HOCRText((SVGG) candidate));
                } else {
                    LOG.error("expected g element, found: " + candidate.toXML());
                }
            }
            this.potentialTextList = texts;
        }
        return this.potentialTextList;
    }

    /**
     * Appends a translucent magenta rectangle covering the element's bounding
     * box, when a rectangle can be created for that box.
     *
     * @param sVGElement element to decorate
     */
    private void addDecorativeBoxToPotentialLabel(SVGElement sVGElement) {
        Real2Range labelBox = sVGElement.getBoundingBox();
        SVGRect highlight = SVGRect.createFromReal2Range(labelBox);
        if (highlight == null) {
            return;
        }
        highlight.setFill("magenta");
        highlight.setOpacity(RECT_OPACITY);
        sVGElement.appendChild(highlight);
    }

    /**
     * Returns the direct child {@code span} elements of a line whose normalized
     * text is not empty.
     *
     * <p>The list is built with its declared element type. A matching element
     * that is not an {@code HtmlSpan} now fails here with a ClassCastException
     * instead of being smuggled into the list and failing later in the caller.
     *
     * @param htmlSpan the line span to search
     * @return the non-empty word spans in document order
     */
    public static List<HtmlSpan> getWords(HtmlSpan htmlSpan) {
        List<HtmlSpan> words = new ArrayList<HtmlSpan>();
        for (Element child : XMLUtil.getQueryElements(htmlSpan, "*[local-name()='span' and not(normalize-space(.)='')]")) {
            words.add((HtmlSpan) child);
        }
        return words;
    }

    /**
     * Matches a pattern against the words of a line joined with a separator and
     * returns the captured groups.
     *
     * <p>The words are concatenated with {@code "~"} appended after each one; the
     * pattern must match that whole string. Each capture group is returned with
     * the separators removed; a group that did not participate in the match
     * yields a {@code null} entry.
     *
     * @param htmlSpan the line span whose words are matched
     * @param pattern pattern written against the separator-joined text
     * @return one entry per capture group, or an empty list when there is no match
     */
    public static List<String> matchPattern(HtmlSpan htmlSpan, Pattern pattern) {
        List<String> groups = new ArrayList<String>();
        StringBuilder joined = new StringBuilder();
        for (HtmlSpan word : getWords(htmlSpan)) {
            joined.append(word.getValue()).append(SEPARATOR);
        }
        Matcher matcher = pattern.matcher(joined.toString());
        if (matcher.matches()) {
            for (int i = 1; i <= matcher.groupCount(); i++) {
                String group = matcher.group(i);
                // literal replace: the separator is plain text, not a regex
                groups.add(group == null ? null : group.replace(SEPARATOR, ""));
            }
        }
        return groups;
    }

    /**
     * Joins the non-empty words of a line with single spaces.
     *
     * @param htmlSpan the line span
     * @return the word values separated by spaces, trimmed at both ends
     */
    public static String getSpacedValue(HtmlSpan htmlSpan) {
        StringBuilder spaced = new StringBuilder();
        for (HtmlSpan word : getWords(htmlSpan)) {
            spaced.append(word.getValue()).append(" ");
        }
        return spaced.toString().trim();
    }

    /** @param d upper limit applied to font sizes derived from line and word box heights */
    public void setMaxFontSize(double d) {
        this.maxFontSize = d;
    }

    /** @return the current upper limit for derived font sizes */
    public double getMaxFontSize() {
        return this.maxFontSize;
    }

    /**
     * Sets the pattern used to mark words as potential labels.
     *
     * @param str regular expression; a word is marked when its trimmed text matches it entirely
     */
    public void labelSubImages(String str) {
        this.labelPattern = Pattern.compile(str);
    }

    /** Alias for {@link #setWordJoiningBox(Real2Range)}. */
    public void setJoiningBox(Real2Range real2Range) {
        setWordJoiningBox(real2Range);
    }

    /**
     * Reads an hOCR file and returns the word lines of its single page, with
     * phrases made from the words of each line.
     *
     * <p>The file stream is closed once the document has been read.
     *
     * @param file hOCR file to read
     * @return the word lines of the page
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public List<SVGWordLine> createWordLineList(File file) throws IOException, FileNotFoundException {
        FileInputStream hocrIn = new FileInputStream(file);
        try {
            readHOCR(hocrIn);
        } finally {
            IOUtils.closeQuietly(hocrIn);
        }
        return getOrCreateWordLineList((SVGSVG) getOrCreateSVG());
    }

    /**
     * Reads an hOCR file and returns all phrases of its word lines.
     *
     * @param file hOCR file to read
     * @return the phrases of every line, in line order
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading fails
     */
    public List<SVGPhrase> createPhraseList(File file) throws IOException, FileNotFoundException {
        // createWordLineList stores its result in wordLineList as a side effect
        createWordLineList(file);
        List<SVGWordLine> lines = this.wordLineList;
        return createPhraseList(lines);
    }

    /**
     * Builds the word-line list from the current SVG (creating the SVG first if needed).
     *
     * @return the word lines of the single SVG page
     */
    public List<SVGWordLine> getOrCreateWordLineList() {
        return getOrCreateWordLineList((SVGSVG) getOrCreateSVG());
    }

    /**
     * Takes the line list of the SVG's single page, stores it, and makes phrases
     * from the words of every line. The list is re-read on every call.
     *
     * @param svgsvg SVG root produced by this reader
     * @return the stored word-line list
     */
    private List<SVGWordLine> getOrCreateWordLineList(SVGSVG svgsvg) {
        this.wordLineList = svgsvg.getSingleSVGPage().getSVGLineList();
        for (SVGWordLine line : this.wordLineList) {
            line.makePhrasesFromWords();
        }
        return this.wordLineList;
    }

    /** @return the word-joining box last set, or null; it is stored here but not read elsewhere in this class */
    public Real2Range getWordJoiningBox() {
        return this.wordJoiningBox;
    }

    /** @param real2Range the word-joining box to store */
    public void setWordJoiningBox(Real2Range real2Range) {
        this.wordJoiningBox = real2Range;
    }

    /**
     * Returns all phrases of the current document, computing and caching them on
     * first call.
     *
     * @return the cached phrase list
     */
    public List<SVGPhrase> getOrCreatePhraseList() {
        if (this.allPhraseList != null) {
            return this.allPhraseList;
        }
        getOrCreateWordLineList();
        this.allPhraseList = createPhraseList(this.wordLineList);
        return this.allPhraseList;
    }

    /**
     * Concatenates the phrase lists of the given lines.
     *
     * @param list word lines to collect phrases from
     * @return a new list holding every line's phrases, in line order
     */
    public static List<SVGPhrase> createPhraseList(List<SVGWordLine> list) {
        List<SVGPhrase> phrases = new ArrayList<SVGPhrase>();
        for (SVGWordLine line : list) {
            phrases.addAll(line.getOrCreateSVGPhraseList());
        }
        return phrases;
    }

    // Class initialisation: forces this class's logger to DEBUG and assigns the
    // stroke width used for highlighted word boxes.
    // NOTE(review): setting the log level in code overrides external logging
    // configuration for this logger -- confirm this is intended.
    static {
        LOG.setLevel(Level.DEBUG);
        LOW_CONF_WIDTH = Double.valueOf(3.0d);
    }
}
