package org.xmlcml.html.util;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import java.io.File;
import java.io.FileOutputStream;
import java.net.URL;
import nu.xom.Element;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.xmlcml.html.HtmlElement;
import org.xmlcml.html.HtmlFactory;
import org.xmlcml.xml.XMLUtil;

/* loaded from: input_file:org/xmlcml/html/util/HtmlUnitWrapper.class */
/**
 * Fetches a web page through HtmlUnit and converts its XML serialization into
 * an {@link HtmlElement}.
 *
 * <p>The intermediate results of the most recent call (raw page, serialized
 * XML, parsed XML element and resulting HTML element) are kept in instance
 * fields, so a single instance holds mutable per-call state.</p>
 */
public class HtmlUnitWrapper {
    private static final Logger LOG = Logger.getLogger(HtmlUnitWrapper.class);
    private WebClient webClient;
    private HtmlPage rawHtmlPage;
    private String pageAsXml;
    private Element xmlElement;
    private HtmlElement htmlElement;

    /**
     * Loads the page at {@code url}, serializes it to XML and parses that XML
     * into an {@link HtmlElement}.
     *
     * <p>Side effects: the serialized page is written to
     * {@code target/pageAsXml.xml} and a debug dump of the parsed XML is
     * written to {@code target/htmlUnit.xml} (both relative to the working
     * directory).</p>
     *
     * @param url location of the page to fetch
     * @return the parsed element, or {@code null} if the serialized page could
     *         not be parsed (the failure is logged, not rethrown)
     * @throws Exception if fetching the page, serializing it, or writing
     *         {@code target/pageAsXml.xml} fails
     */
    public HtmlElement readAndCreateElement(URL url) throws Exception {
        this.webClient = new WebClient();
        try {
            this.rawHtmlPage = (HtmlPage) this.webClient.getPage(url.toString());
            this.pageAsXml = this.rawHtmlPage.asXml();
            // Compare lengths before/after only to report whether the cleanup changed anything.
            int length = this.pageAsXml.length();
            this.pageAsXml = HtmlUtil.removeBMCHorror(this.pageAsXml);
            if (length != this.pageAsXml.length()) {
                LOG.debug("Removed BMC Horror");
            }
            FileUtils.write(new File("target/pageAsXml.xml"), this.pageAsXml);
        } finally {
            // Release the client even when fetching or writing fails; previously
            // an exception above skipped this call and leaked the client.
            this.webClient.closeAllWindows();
        }
        this.htmlElement = null;
        try {
            HtmlFactory htmlFactory = new HtmlFactory();
            htmlFactory.setIgnoreNamespaces(true);
            this.xmlElement = XMLUtil.parseXML(this.pageAsXml);
            FileOutputStream debugStream = new FileOutputStream("target/htmlUnit.xml");
            try {
                XMLUtil.debug(this.xmlElement, debugStream, 1);
            } finally {
                // The debug stream was never closed before, leaking a file handle per call.
                debugStream.close();
            }
            this.htmlElement = htmlFactory.parse(this.xmlElement);
        } catch (Exception e) {
            // Best effort: parse failures are logged and reported to the caller as null.
            LOG.error("cannot parse HTML " + this.pageAsXml, e);
        }
        return this.htmlElement;
    }
}
