package org.xmlcml.html.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import nu.xom.Document;
import nu.xom.Element;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.custommonkey.xmlunit.XMLConstants;
import org.joda.time.DateTime;
import org.w3c.tidy.Node;
import org.w3c.tidy.Tidy;
import org.xmlcml.html.HtmlElement;
import org.xmlcml.html.HtmlFactory;
import org.xmlcml.xml.XMLUtil;

/* loaded from: input_file:org/xmlcml/html/util/HTMLTidy.class */
/**
 * Cleans up HTML text and runs it through an {@link org.w3c.tidy.Tidy} parser.
 *
 * <p>Two styles of use are offered:
 * <ul>
 *   <li>static helpers ({@link #htmlTidy(InputStream)}, {@link #convertStringToXHTML(String)},
 *       {@link #tidyWhitespaceAndForeignNamePrefixesAndLang(String)}, ...) that work without
 *       any per-instance configuration;</li>
 *   <li>a configurable instance whose {@link #tidy(InputStream)} method applies optional text
 *       clean-up passes before and after the Tidy parse.</li>
 * </ul>
 *
 * <p>NOTE(review): every byte/String conversion in this class uses the overloads that take no
 * charset, i.e. the JVM default charset. Confirm that this matches the encoding the Tidy
 * instance is configured to read and write before changing either side.
 */
public class HTMLTidy {
    /** Matches a DOCTYPE declaration up to the first {@code >}. */
    private static final String DOCTYPE_REGEX = "<!DOCTYPE[^>]*>";
    /** Matches one or more consecutive CR/LF characters. */
    private static final String NEWLINE_REGEX = "[\\r\\n]+";
    /** Matches one or more consecutive CR/LF/TAB characters. */
    private static final String WHITESPACE = "[\\r\\n\\t]+";
    /** Matches the opening of an end tag whose name carries a prefix, e.g. {@code </svg:}. */
    private static final String ENDTAG_PREFIX = "(</[A-Za-z_][A-Za-z_0-9]*:)";
    /** Matches the opening of a start tag whose name carries a prefix, e.g. {@code <svg:}. */
    private static final String STARTTAG_PREFIX = "(<[A-Za-z_][A-Za-z_0-9]*:)";
    private static final Logger LOG = Logger.getLogger(HTMLTidy.class);

    /** Parser used by {@link #tidy(InputStream)}; created with the options of {@link #createTidyWithOptions()}. */
    private Tidy tidy = createTidyWithOptions();
    /** Tag replacements applied by the clean-up passes; {@code null} until the first one is added. */
    private List<HTMLTagReplacement> tagReplacementList;
    /** Holds the bytes of the most recent {@link #tidy(InputStream)} result; {@code null} before the first call. */
    private ByteArrayOutputStream baos;
    /** Node returned by the most recent Tidy parse; {@code null} before the first call. */
    private Node node;
    private boolean stripDoctype;
    private boolean removeXMLLang;
    private boolean flattenNewline;
    private boolean removeForeignPrefixes;

    /** Creates an instance configured with {@link #setCommonDefaults()}. */
    public HTMLTidy() {
        setCommonDefaults();
    }

    /**
     * Reads the whole stream, tidies it with a freshly configured Tidy instance and parses the
     * tidied bytes into a XOM document.
     *
     * <p>As a debugging aid the tidied bytes are written to {@code target/htmlIn<millis>.html};
     * if parsing throws, the raw input and the tidied output are additionally written to
     * {@code target/badhtmlIn<millis>.html} and {@code target/badhtml<millis>.html}. These dumps
     * are best-effort: a failure to write them is logged and never replaces the real outcome.
     *
     * @param inputStream the HTML to tidy; must not be {@code null}
     * @return whatever {@code XMLUtil.parseQuietlyToDocument} returns for the tidied bytes
     * @throws IllegalArgumentException (a {@link RuntimeException}) if {@code inputStream} is {@code null}
     * @throws IOException if the input stream cannot be read
     * @throws RuntimeException rethrown unchanged if parsing the tidied bytes fails
     */
    public static Document htmlTidy(InputStream inputStream) throws IOException {
        if (inputStream == null) {
            throw new IllegalArgumentException("Null input for HTMLTidy");
        }
        byte[] rawBytes = IOUtils.toByteArray(inputStream);
        ByteArrayOutputStream tidiedOut = new ByteArrayOutputStream();
        createTidyWithOptions().parse(new ByteArrayInputStream(rawBytes), tidiedOut);
        byte[] tidiedBytes = tidiedOut.toByteArray();
        LOG.debug("made bytes: " + tidiedBytes.length);

        // One stamp per call so that all dump files of the same run share a suffix.
        int stamp = new DateTime().getMillisOfDay();
        dumpQuietly(new File("target/htmlIn" + stamp + ".html"), tidiedBytes);
        try {
            LOG.debug("parsing...");
            Document document = XMLUtil.parseQuietlyToDocument(new ByteArrayInputStream(tidiedBytes));
            LOG.debug("parsed bytes");
            return document;
        } catch (RuntimeException e) {
            dumpQuietly(new File("target/badhtmlIn" + stamp + ".html"), rawBytes);
            dumpQuietly(new File("target/badhtml" + stamp + ".html"), tidiedBytes);
            throw e;
        }
    }

    /**
     * Writes a debugging dump, logging (rather than propagating) any I/O failure so that a
     * missing or read-only directory cannot abort tidying or hide a parse exception.
     */
    private static void dumpQuietly(File file, byte[] bytes) {
        try {
            FileUtils.writeByteArrayToFile(file, bytes);
        } catch (IOException e) {
            LOG.warn("could not write debug file: " + file, e);
        }
    }

    /**
     * Applies, in order: removal of double-quoted {@code xml:lang} attributes, replacement of
     * CR/LF/TAB runs by a single space, removal of DOCTYPE declarations and replacement of
     * tag-name prefixes by {@code _}. Each step that changed the text is reported at trace level.
     *
     * @param str the text to clean; must not be {@code null}
     * @return the cleaned text
     */
    public static String tidyWhitespaceAndForeignNamePrefixesAndLang(String str) {
        String withoutLang = removeLangXMLLang(str);
        if (!withoutLang.equals(str)) {
            LOG.trace("tidied xml:lang");
        }
        String normalized = normalizeWhitespace(withoutLang);
        if (!normalized.equals(withoutLang)) {
            LOG.trace("tidied whitespace");
        }
        String withoutDtd = removeDTD(normalized);
        if (!withoutDtd.equals(normalized)) {
            LOG.trace("stripped DTD");
        }
        String withoutPrefixes = removeForeignNamespacePrefixes(withoutDtd);
        // Compare with the previous step's output, not an earlier one, so the message is only
        // logged when this step itself changed something.
        if (!withoutPrefixes.equals(withoutDtd)) {
            LOG.trace("removed namespacePrefixes");
        }
        return withoutPrefixes;
    }

    /** Removes every {@code xml:lang="..."} attribute (double-quoted values only). */
    private static String removeLangXMLLang(String str) {
        return str.replaceAll("xml\\:lang\\s*=\\s*\\\"([^\\\"]*)\\\"", "");
    }

    /** Rewrites {@code <prefix:name} to {@code <_name} and {@code </prefix:name} to {@code </_name}. */
    private static String removeForeignNamespacePrefixes(String str) {
        String startTagsDone = str.replaceAll(STARTTAG_PREFIX, "<_");
        return startTagsDone.replaceAll(ENDTAG_PREFIX, "</_");
    }

    /** Replaces each run of CR/LF/TAB characters by a single space. */
    private static String normalizeWhitespace(String str) {
        return str.replaceAll(WHITESPACE, " ");
    }

    /**
     * Deletes every run of CR/LF characters.
     *
     * <p>NOTE(review): nothing is substituted for the removed line breaks, so words separated
     * only by a newline end up joined. Confirm this is intended before relying on it for text
     * content.
     */
    private static String flattenNewline(String str) {
        return str.replaceAll(NEWLINE_REGEX, "");
    }

    /** Removes every DOCTYPE declaration matched by {@link #DOCTYPE_REGEX}. */
    private static String removeDTD(String str) {
        return str.replaceAll(DOCTYPE_REGEX, "");
    }

    /**
     * Applies the configured clean-up passes to the buffer in place: tag replacements first,
     * then DOCTYPE stripping, then the string-based passes (xml:lang removal, newline
     * flattening, prefix removal) for whichever flags are set.
     */
    private void preTidy(StringBuilder sb) {
        if (this.tagReplacementList != null) {
            for (HTMLTagReplacement replacement : this.tagReplacementList) {
                replacement.replaceAll(sb);
            }
        }
        if (this.stripDoctype) {
            stripDoctype(sb);
        }
        if (!(this.removeXMLLang || this.flattenNewline || this.removeForeignPrefixes)) {
            return;
        }
        String text = sb.toString();
        if (this.removeXMLLang) {
            text = removeLangXMLLang(text);
        }
        if (this.flattenNewline) {
            text = flattenNewline(text);
        }
        if (this.removeForeignPrefixes) {
            text = removeForeignNamespacePrefixes(text);
        }
        sb.replace(0, sb.length(), text);
    }

    /** @return whether tag-name prefixes are replaced by {@code _} during clean-up */
    public boolean isRemoveForeignPrefixes() {
        return this.removeForeignPrefixes;
    }

    /** @param z {@code true} to replace tag-name prefixes by {@code _} during clean-up */
    public void setRemoveForeignPrefixes(boolean z) {
        this.removeForeignPrefixes = z;
    }

    /** @param z {@code true} to remove the first DOCTYPE declaration during clean-up */
    public void setStripDoctype(boolean z) {
        this.stripDoctype = z;
    }

    /** @param z {@code true} to remove {@code xml:lang} attributes during clean-up */
    public void setRemoveXMLLang(boolean z) {
        this.removeXMLLang = z;
    }

    /** @param z {@code true} to delete CR/LF runs during clean-up */
    public void setFlattenNewline(boolean z) {
        this.flattenNewline = z;
    }

    /**
     * Deletes the first {@code <!DOCTYPE ...>} declaration from the buffer, if there is one.
     * The search is case-sensitive and the declaration is taken to end at the first {@code >}.
     *
     * @param sb the buffer to modify in place
     * @throws RuntimeException if a DOCTYPE start is found but no closing {@code >} follows it
     */
    public static void stripDoctype(StringBuilder sb) {
        int start = sb.indexOf("<!DOCTYPE");
        if (start == -1) {
            return;
        }
        int end = sb.indexOf(">", start);
        if (end == -1) {
            throw new RuntimeException("Bad DOCTYPE at: " + start);
        }
        sb.delete(start, end + 1);
    }

    /**
     * Renames a tag, discarding its attributes: {@code <old ...>} becomes {@code <new>},
     * {@code <old .../>} becomes {@code <new/>} and {@code </old>} becomes {@code </new>}.
     *
     * <p>Only tags whose name is exactly {@code str2} are affected; a longer name that merely
     * starts with it (e.g. {@code body} when replacing {@code b}) is left alone. Self-closing
     * tags are handled before ordinary start tags so that they stay self-closing.
     *
     * <p>{@code str2} is inserted into a regular expression and {@code str3} into a regex
     * replacement string, so regex metacharacters in either are interpreted as such.
     *
     * @param str the markup to process
     * @param str2 name of the tag to replace
     * @param str3 name of the replacement tag
     * @return the markup with the tag renamed
     */
    public static String replaceBadTags(String str, String str2, String str3) {
        // Attributes, when present, must be separated from the name by whitespace; requiring
        // that separator is what prevents prefix matches on longer tag names.
        String startOfTag = "<(?:" + str2 + ")(?:\\s[^>]*)?";
        String selfClosingDone = str.replaceAll(startOfTag + "/>", "<" + str3 + "/>");
        String startTagsDone = selfClosingDone.replaceAll(startOfTag + ">", "<" + str3 + ">");
        return startTagsDone.replaceAll(
                XMLConstants.OPEN_END_NODE + "(?:" + str2 + ")\\s*>",
                XMLConstants.OPEN_END_NODE + str3 + ">");
    }

    /** Creates a Tidy instance with the fixed option set used throughout this class. */
    private static Tidy createTidyWithOptions() {
        Tidy tidy = new Tidy();
        tidy.setDocType(null);
        tidy.setXmlOut(true);
        tidy.setDropEmptyParas(true);
        tidy.setDropFontTags(true);
        tidy.setMakeClean(true);
        tidy.setNumEntities(true);
        tidy.setXHTML(true);
        tidy.setQuiet(true);
        tidy.setQuoteMarks(true);
        tidy.setShowWarnings(false);
        return tidy;
    }

    /**
     * Tidies the given HTML text via {@link #htmlTidy(InputStream)} and returns the root
     * element of the resulting document.
     *
     * @param str the HTML text; converted to bytes with the JVM default charset
     * @return the root element, or {@code null} if no document was produced
     * @throws RuntimeException wrapping any exception raised while tidying or parsing; the
     *         original exception is available as the cause
     */
    public static Element convertStringToXHTML(String str) {
        try {
            Document document = htmlTidy(new ByteArrayInputStream(str.getBytes()));
            return document == null ? null : document.getRootElement();
        } catch (Exception e) {
            // Keep the original exception as the cause so its stack trace is not lost.
            throw new RuntimeException("parse: " + e, e);
        }
    }

    /**
     * Reads the stream as text, applies the configured clean-up passes, parses the result with
     * this instance's Tidy, applies the clean-up passes again to Tidy's output and returns it.
     *
     * <p>Side effects: the parse node is stored for {@link #getNode()} and the bytes of the
     * returned string are stored for {@link #getByteArrayOutputStream()} and
     * {@link #getOutputString()}.
     *
     * @param inputStream the HTML to tidy
     * @return the tidied and post-processed text
     * @throws IOException if reading the input or buffering the output fails
     */
    public String tidy(InputStream inputStream) throws IOException {
        StringBuilder input = new StringBuilder(IOUtils.toString(inputStream));
        preTidy(input);

        this.baos = new ByteArrayOutputStream();
        this.node = this.tidy.parse(IOUtils.toInputStream(input.toString()), this.baos);

        StringBuilder output = new StringBuilder(this.baos.toString());
        postTidy(output);
        String result = output.toString();
        LOG.trace("SB " + result);

        // Replace the raw Tidy output with the post-processed text so the getters agree with
        // the value returned here.
        this.baos = new ByteArrayOutputStream();
        IOUtils.write(result.getBytes(), this.baos);
        return result;
    }

    /**
     * Tidies the stream with {@link #tidy(InputStream)} and hands the text to a new
     * {@code HtmlFactory} for parsing.
     *
     * @param inputStream the HTML to tidy
     * @return the element produced by {@code HtmlFactory.parse}
     * @throws Exception if tidying or parsing fails
     */
    public HtmlElement createHtmlElement(InputStream inputStream) throws Exception {
        String tidied = tidy(inputStream);
        return new HtmlFactory().parse(tidied);
    }

    /** Re-applies the same clean-up passes as {@link #preTidy(StringBuilder)} to Tidy's output. */
    private void postTidy(StringBuilder sb) {
        preTidy(sb);
    }

    /**
     * Registers a tag replacement to be applied by the clean-up passes, creating the list on
     * first use.
     *
     * @param hTMLTagReplacement the replacement to add
     */
    public void addTagReplacement(HTMLTagReplacement hTMLTagReplacement) {
        if (this.tagReplacementList == null) {
            this.tagReplacementList = new ArrayList<HTMLTagReplacement>();
        }
        this.tagReplacementList.add(hTMLTagReplacement);
    }

    /**
     * Registers a replacement built from the tag name alone.
     *
     * @param str the tag name passed to the single-argument {@code HTMLTagReplacement} constructor
     */
    public void deleteTag(String str) {
        addTagReplacement(new HTMLTagReplacement(str));
    }

    /**
     * Registers a replacement built from two tag names.
     *
     * @param str first argument of the two-argument {@code HTMLTagReplacement} constructor
     * @param str2 second argument of the two-argument {@code HTMLTagReplacement} constructor
     */
    public void replacetag(String str, String str2) {
        addTagReplacement(new HTMLTagReplacement(str, str2));
    }

    /**
     * Misspelled alias of {@link #setStripDoctype(boolean)}, retained so existing callers keep
     * compiling; new code should call {@code setStripDoctype}.
     *
     * @param z {@code true} to remove the first DOCTYPE declaration during clean-up
     */
    public void seStripDoctype(boolean z) {
        setStripDoctype(z);
    }

    /** @return the node returned by the most recent Tidy parse, or {@code null} if none has run */
    public Node getNode() {
        return this.node;
    }

    /** @return the buffer holding the most recent {@link #tidy(InputStream)} result, or {@code null} */
    public ByteArrayOutputStream getByteArrayOutputStream() {
        return this.baos;
    }

    /** @return the most recent {@link #tidy(InputStream)} result as text, or {@code null} if none has run */
    public String getOutputString() {
        return this.baos == null ? null : this.baos.toString();
    }

    /** @return the Tidy instance used by {@link #tidy(InputStream)}, for further configuration */
    public Tidy getTidy() {
        return this.tidy;
    }

    /** Enables DOCTYPE stripping and newline flattening and clears the Tidy doctype setting. */
    public void setCommonDefaults() {
        setStripDoctype(true);
        getTidy().setDocType(null);
        setFlattenNewline(true);
    }
}
