package com.flamenk.article;

import com.flamenk.AlgorithmConfiguration;
import com.flamenk.TagConstants;
import com.flamenk.article.manipulators.HtmlNodeManipulator;
import com.flamenk.dom.HtmlDocument;
import com.flamenk.dom.HtmlDocumentBuilder;
import com.flamenk.dom.HtmlNode;
import com.flamenk.dom.HtmlNodeArticleRanker;
import com.flamenk.dom.HtmlNodeDisplayMode;
import com.flamenk.dom.HtmlNodeRange;
import com.flamenk.dom.SimpleHtmlNodeArticleRanker;
import com.flamenk.dom.SimpleTextTokenizer;
import com.flamenk.dom.TextTokenizer;
import com.flamenk.histogram.HistogramBuilder;
import com.flamenk.histogram.HtmlNodeTokenizer;
import com.flamenk.histogram.SimpleHtmlNodeTokenizer;
import com.flamenk.histogram.SimpleTokenRanker;
import com.flamenk.histogram.Token;
import com.flamenk.histogram.TokenRanker;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.commons.lang.StringEscapeUtils;

/* loaded from: input_file:com/flamenk/article/ArticleExtractor.class */
/**
 * Downloads an HTML page and extracts its main article: title, body and main image.
 *
 * <p>The extractor is configured through the fluent {@code with...} / {@code add...} methods, each
 * of which returns {@code this}. Configuration fields are {@code volatile} and the manipulator list
 * is a {@link CopyOnWriteArrayList}; no further synchronization is performed by this class.
 */
public class ArticleExtractor {
    private static final String SRC_ATTR = "src";
    private static final String ALT_ATTR = "alt";
    private static final String TITLE_ATTR = "title";
    /** Default connection timeout (presumably milliseconds — confirm against HtmlDocumentBuilder). */
    private static final int TIMEOUT = 30000;
    /** Tokens shorter than this many characters are ignored when ranking titles and images. */
    private static final int MIN_TERM_LENGTH = 5;
    private static final HtmlDocumentBuilder HTML_DOCUMENT_BUILDER = HtmlDocumentBuilder.getInstance();
    private volatile HtmlNodeArticleRanker mHtmlNodeRanker = SimpleHtmlNodeArticleRanker.getInstance();
    private volatile HtmlNodeTokenizer mNodeTokenizer = SimpleHtmlNodeTokenizer.getInstance();
    private volatile TextTokenizer mTextTokenizer = SimpleTextTokenizer.getInstance();
    private volatile TokenRanker mTokenRanker = SimpleTokenRanker.getInstance();
    private volatile AlgorithmConfiguration mAlgoConfig = AlgorithmConfiguration.getDefault();
    private volatile int mConnectionTimeout = TIMEOUT;
    private final HistogramBuilder mHistogramBuilder = HistogramBuilder.getInstance();
    private final HistogramSegmentExtractor mHistogramSegExtractor = HistogramSegmentExtractor.getInstance();
    private final List<HtmlNodeManipulator> mManipulators = new CopyOnWriteArrayList<HtmlNodeManipulator>();

    /**
     * Registers a manipulator that is applied to every displayed node inside the article range.
     *
     * <p>The misspelled name is kept for backward compatibility; it behaves exactly like
     * {@link #addNodeManipulator(HtmlNodeManipulator)}.
     *
     * @param htmlNodeManipulator the manipulator to add; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code htmlNodeManipulator} is {@code null}
     */
    public ArticleExtractor addNodeManupulator(HtmlNodeManipulator htmlNodeManipulator) {
        return addNodeManipulator(htmlNodeManipulator);
    }

    /**
     * Registers a manipulator that is applied to every displayed node inside the article range.
     *
     * @param htmlNodeManipulator the manipulator to add; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code htmlNodeManipulator} is {@code null}
     */
    public ArticleExtractor addNodeManipulator(HtmlNodeManipulator htmlNodeManipulator) {
        Preconditions.checkNotNull(htmlNodeManipulator);
        this.mManipulators.add(htmlNodeManipulator);
        return this;
    }

    /**
     * Replaces the node ranker handed to the document builder.
     *
     * @param htmlNodeArticleRanker the ranker to use; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code htmlNodeArticleRanker} is {@code null}
     */
    public ArticleExtractor withNodeArticleRanker(HtmlNodeArticleRanker htmlNodeArticleRanker) {
        Preconditions.checkNotNull(htmlNodeArticleRanker);
        this.mHtmlNodeRanker = htmlNodeArticleRanker;
        return this;
    }

    /**
     * Replaces the tokenizer used to turn the best-ranked node into histogram tokens.
     *
     * @param htmlNodeTokenizer the tokenizer to use; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code htmlNodeTokenizer} is {@code null}
     */
    public ArticleExtractor withNodeTokenizer(HtmlNodeTokenizer htmlNodeTokenizer) {
        Preconditions.checkNotNull(htmlNodeTokenizer);
        this.mNodeTokenizer = htmlNodeTokenizer;
        return this;
    }

    /**
     * Replaces the text tokenizer used by the document builder and by title/image ranking.
     *
     * @param textTokenizer the tokenizer to use; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code textTokenizer} is {@code null}
     */
    public ArticleExtractor withTextTokenizer(TextTokenizer textTokenizer) {
        Preconditions.checkNotNull(textTokenizer);
        this.mTextTokenizer = textTokenizer;
        return this;
    }

    /**
     * Replaces the token ranker handed to the histogram builder.
     *
     * @param tokenRanker the ranker to use; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code tokenRanker} is {@code null}
     */
    public ArticleExtractor withTokenRanker(TokenRanker tokenRanker) {
        Preconditions.checkNotNull(tokenRanker);
        this.mTokenRanker = tokenRanker;
        return this;
    }

    /**
     * Sets the connection timeout handed to the document builder.
     *
     * @param i the timeout; must not be negative
     * @return this extractor
     * @throws IllegalArgumentException if {@code i} is negative
     */
    public ArticleExtractor withConnectionTimeout(int i) {
        Preconditions.checkArgument(i >= 0);
        this.mConnectionTimeout = i;
        return this;
    }

    /**
     * Replaces the algorithm configuration used for histogram building and segment extraction.
     *
     * @param algorithmConfiguration the configuration to use; must not be {@code null}
     * @return this extractor
     * @throws NullPointerException if {@code algorithmConfiguration} is {@code null}
     */
    public ArticleExtractor withAlgorithmConfiguration(AlgorithmConfiguration algorithmConfiguration) {
        Preconditions.checkNotNull(algorithmConfiguration);
        this.mAlgoConfig = algorithmConfiguration;
        return this;
    }

    /**
     * Walks the document depth-first, assigns a display mode to every node that is not already
     * hidden, and runs the registered manipulators on nodes inside the article range.
     *
     * @param document the document to update in place
     * @param segmentRanges node ranges of the histogram segments
     * @param articleRange range spanning from the first segment's start to the last segment's end
     * @param bestRange range of the highest-ranked segment, passed through to the manipulators
     * @param article the article being built, passed through to the manipulators
     */
    private void manipulateDocument(HtmlDocument document, List<HtmlNodeRange> segmentRanges, HtmlNodeRange articleRange, HtmlNodeRange bestRange, Article article) {
        LinkedList<HtmlNode> pending = new LinkedList<HtmlNode>();
        pending.push(document.getRootNode());
        while (!pending.isEmpty()) {
            HtmlNode node = pending.pop();
            // Hidden nodes are skipped together with their whole subtree.
            if (node.getDisplayMode() == HtmlNodeDisplayMode.NOT_DISPLAY) {
                continue;
            }
            node.setDisplayMode(isNodeInRanges(segmentRanges, node)
                    ? HtmlNodeDisplayMode.DISPLAY
                    : HtmlNodeDisplayMode.DISPLAY_ONLY_CHILDREN);
            if (articleRange.isNodeInRange(node)) {
                for (HtmlNodeManipulator manipulator : this.mManipulators) {
                    manipulator.manipulate(node, article, bestRange);
                }
            }
            // Push children last-to-first so they are popped in their original order.
            Iterator<HtmlNode> children = node.getChildren().descendingIterator();
            while (children.hasNext()) {
                pending.push(children.next());
            }
        }
    }

    /**
     * Tells whether a node lies in at least one of the given ranges. Ranges whose start and end are
     * the same node object are ignored.
     *
     * @param ranges the ranges to test; each is expected to have both a start and an end node
     * @param node the node to look for
     * @return {@code true} if some range with distinct start and end nodes contains {@code node}
     */
    private boolean isNodeInRanges(List<HtmlNodeRange> ranges, HtmlNode node) {
        for (HtmlNodeRange range : ranges) {
            boolean spansSeveralNodes = range.getStartNode().get() != range.getEndNode().get();
            if (spansSeveralNodes && range.isNodeInRange(node)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the document for a URL using the currently configured ranker, tokenizer and timeout.
     *
     * @param url the page to load
     * @return the wrapper produced by the document builder
     * @throws IOException if the document builder fails
     */
    private HtmlDocumentBuilder.HtmlDocumentWrapper buildDocument(URL url) throws IOException {
        return HTML_DOCUMENT_BUILDER.build(this.mHtmlNodeRanker, this.mTextTokenizer, url, this.mConnectionTimeout);
    }

    /**
     * Loads the page at {@code url} and extracts its article.
     *
     * @param url the page to load; must not be {@code null}
     * @return the extracted article, or {@link Optional#absent()} when the document has no
     *     max-rank node, no histogram segments are produced, or no segment carries a node range
     * @throws IOException if building the document fails
     * @throws NullPointerException if {@code url} is {@code null}
     */
    public Optional<Article> extractArticle(URL url) throws IOException {
        Preconditions.checkNotNull(url);
        HtmlDocumentBuilder.HtmlDocumentWrapper wrapper = buildDocument(url);
        long startProcessingTime = wrapper.getStartProcessingTime();
        HtmlDocument document = wrapper.getHtmlDocument();

        Optional<HtmlNode> topNode = document.getNodeWithMaxRank();
        if (!topNode.isPresent()) {
            return Optional.absent();
        }
        List<HistogramSegment> segments = buildHistogramSegments(topNode.get());
        if (segments.isEmpty()) {
            return Optional.absent();
        }

        // Collect every segment's node range and remember the highest-ranked segment
        // (the first one wins on ties).
        List<HtmlNodeRange> segmentRanges = new ArrayList<HtmlNodeRange>(segments.size());
        HistogramSegment bestSegment = null;
        for (HistogramSegment segment : segments) {
            Optional<HtmlNodeRange> range = segment.getNodeRange();
            if (!range.isPresent()) {
                continue;
            }
            segmentRanges.add(range.get());
            if (bestSegment == null || segment.getRank() > bestSegment.getRank()) {
                bestSegment = segment;
            }
        }
        if (bestSegment == null) {
            return Optional.absent();
        }

        HtmlNodeRange bestRange = bestSegment.getNodeRange().get();
        // NOTE(review): assumes every segment range has both a start and an end node — confirm.
        HtmlNode firstStart = segmentRanges.get(0).getStartNode().get();
        HtmlNode lastEnd = segmentRanges.get(segmentRanges.size() - 1).getEndNode().get();
        HtmlNodeRange articleRange = new HtmlNodeRange()
                .withIncludedStartNode(firstStart)
                .withIncludedEndNode(lastEnd);

        Article article = new Article(url);
        findTitleAndImageAndSet(articleRange, bestRange, document, article);
        manipulateDocument(document, segmentRanges, articleRange, bestRange, article);
        article.setBody(document.getRootNode().toDisplayString());
        article.setProcessingTime(System.currentTimeMillis() - startProcessingTime);
        return Optional.of(article);
    }

    /**
     * Tokenizes a node and extracts histogram segments from the resulting token histogram.
     *
     * @param node the node to tokenize
     * @return the extracted segments, or an empty list when the node yields no tokens
     */
    private List<HistogramSegment> buildHistogramSegments(HtmlNode node) {
        List<Token> tokens = this.mNodeTokenizer.tokenize(node);
        if (tokens.isEmpty()) {
            return Collections.<HistogramSegment>emptyList();
        }
        return this.mHistogramSegExtractor.extract(this.mHistogramBuilder.build(tokens, this.mTokenRanker, this.mAlgoConfig), this.mAlgoConfig);
    }

    /**
     * Sets the article title first and then the main image. When the title came from a heading
     * outside the best range, the image search runs from that heading to the end of the article
     * range instead of over the best range.
     *
     * @param articleRange range spanning all segments
     * @param bestRange range of the highest-ranked segment
     * @param document the document being processed
     * @param article the article to update
     */
    private void findTitleAndImageAndSet(HtmlNodeRange articleRange, HtmlNodeRange bestRange, HtmlDocument document, Article article) {
        HtmlNode titleNode = findTitleAndSet(bestRange, document, article);
        HtmlNodeRange imageRange = bestRange;
        if (titleNode != null && !bestRange.isNodeInRange(titleNode)) {
            imageRange = new HtmlNodeRange(titleNode, articleRange.getEndNode().get());
        }
        findImageAndSet(imageRange, document, article);
    }

    /**
     * Counts the lower-cased, non-blank tokens of a text.
     *
     * @param text the text to tokenize
     * @return a multiset of the lower-cased tokens
     */
    private Multiset<String> tokenize(String text) {
        Multiset<String> terms = HashMultiset.create();
        for (String token : this.mTextTokenizer.tokenize(text)) {
            if (!token.trim().isEmpty()) {
                terms.add(token.toLowerCase());
            }
        }
        return terms;
    }

    /**
     * Chooses the article title from the H1 headings and the document's head title, stores it on
     * the article, and hides the chosen heading so it is not rendered again in the body.
     *
     * <p>Candidates are every H1 between the document root and the start of {@code bestRange},
     * plus the first H1 inside {@code bestRange}. Each is ranked with {@link #rankFor}; the head
     * title is used when no heading ranks above zero or when it outranks the best heading.
     *
     * @param bestRange range of the highest-ranked segment
     * @param document the document being processed
     * @param article the article whose title is set
     * @return the heading node used as title, or {@code null} when the title came from the head
     *     title or no title was found
     */
    private HtmlNode findTitleAndSet(HtmlNodeRange bestRange, HtmlDocument document, Article article) {
        Optional<String> rawTitle = document.getTitle();
        String headTitle = rawTitle.isPresent() ? cleanHeadTitle(rawTitle.get()) : null;

        Optional<HtmlNode> topNodeHolder = document.getNodeWithMaxRank();
        if (!topNodeHolder.isPresent()) {
            // Without a ranked node there is nothing to rank against; fall back to the head title.
            if (headTitle != null) {
                article.setTitle(headTitle);
            }
            return null;
        }
        HtmlNode topNode = topNodeHolder.get();

        HtmlNodeRange beforeBestRange = new HtmlNodeRange().withIncludedStartNode(document.getRootNode());
        Optional<HtmlNode> bestStart = bestRange.getStartNode();
        if (bestStart.isPresent()) {
            beforeBestRange.withExcludedEndNode(bestStart.get());
        }
        // Work on a copy so the list returned by the document is never modified.
        List<HtmlNode> headings = new ArrayList<HtmlNode>(document.getAllNodesByName(TagConstants.H1, beforeBestRange));
        Optional<HtmlNode> firstHeadingInRange = document.getFirstNodeByName(TagConstants.H1, bestRange);
        if (firstHeadingInRange.isPresent()) {
            headings.add(firstHeadingInRange.get());
        }

        HtmlNode bestHeading = null;
        String chosenTitle = null;
        double bestRank = 0.0d;
        for (HtmlNode heading : headings) {
            String headingText = heading.toText();
            double rank = rankFor(headingText, topNode);
            if (rank > bestRank) {
                bestRank = rank;
                bestHeading = heading;
                chosenTitle = headingText.trim();
            }
        }

        if (headTitle != null) {
            if (bestHeading == null) {
                chosenTitle = headTitle;
            } else {
                // NOTE(review): the head title is cleaned a second time here, which strips one more
                // "|"/"-" separated segment. Kept as in the original; confirm this is intended.
                String recleanedHeadTitle = cleanHeadTitle(headTitle);
                if (rankFor(recleanedHeadTitle, topNode) > bestRank) {
                    chosenTitle = recleanedHeadTitle;
                    bestHeading = null;
                }
            }
        }

        if (chosenTitle != null) {
            article.setTitle(chosenTitle);
        }
        if (bestHeading != null) {
            bestHeading.setDisplayMode(HtmlNodeDisplayMode.NOT_DISPLAY);
        }
        return bestHeading;
    }

    /**
     * Drops the trailing segment of a head title: everything from the last {@code '|'} onwards,
     * or, when there is no {@code '|'}, everything from the last {@code '-'} onwards.
     *
     * @param title the raw head title
     * @return the trimmed prefix, or {@code title} unchanged when it contains neither separator
     */
    private static String cleanHeadTitle(String title) {
        int cut = title.lastIndexOf('|');
        if (cut < 0) {
            cut = title.lastIndexOf('-');
        }
        return cut < 0 ? title : title.substring(0, cut).trim();
    }

    /**
     * Picks the image in {@code range} whose alt/title text best matches the article title and the
     * best-ranked node, and stores its URL on the article. Does nothing when the article has no
     * title or the document has no max-rank node.
     *
     * @param range the node range to search for {@code img} nodes
     * @param document the document being processed
     * @param article the article whose main image URL is set
     */
    private void findImageAndSet(HtmlNodeRange range, HtmlDocument document, Article article) {
        Optional<String> title = article.getTitle();
        if (!title.isPresent()) {
            return;
        }
        Multiset<String> titleTerms = tokenize(title.get());
        List<HtmlNode> images = document.getAllNodesByName(TagConstants.IMG, range);
        Optional<HtmlNode> topNode = document.getNodeWithMaxRank();
        if (!topNode.isPresent()) {
            return;
        }

        String bestSrc = null;
        double bestRank = 0.0d;
        for (HtmlNode image : images) {
            if (!image.hasAttribute(SRC_ATTR)) {
                continue;
            }
            String description = describeImage(image);
            if (description.isEmpty()) {
                continue;
            }
            double rank = rankImageDescription(description, titleTerms, topNode.get());
            if (rank > bestRank) {
                bestRank = rank;
                bestSrc = image.getAttributeValue(SRC_ATTR);
            }
        }
        if (bestSrc != null) {
            article.setMainImageUrl(resolveImageUrl(article, bestSrc));
        }
    }

    /** Joins an image's {@code alt} and {@code title} attribute values into one description. */
    private static String describeImage(HtmlNode image) {
        String description = image.hasAttribute(ALT_ATTR) ? image.getAttributeValue(ALT_ATTR) : "";
        if (image.hasAttribute(TITLE_ATTR)) {
            description = description + " " + image.getAttributeValue(TITLE_ATTR);
        }
        return description;
    }

    /**
     * Ranks an image description by how often its long-enough tokens occur in the article title
     * and in the best-ranked node, each normalized by the product of the token counts involved.
     */
    private double rankImageDescription(String description, Multiset<String> titleTerms, HtmlNode topNode) {
        String[] tokens = this.mTextTokenizer.tokenize(description.toLowerCase());
        double bodyHits = 0.0d;
        double titleHits = 0.0d;
        for (String token : tokens) {
            if (token.length() >= MIN_TERM_LENGTH) {
                bodyHits += topNode.getCountForTerm(token);
                titleHits += titleTerms.count(token);
            }
        }
        // Multiply in double so large token counts cannot overflow int arithmetic.
        double titleNormalizer = (double) tokens.length * titleTerms.size();
        double bodyNormalizer = (double) tokens.length * topNode.getNumTextTokens();
        double titleScore = titleNormalizer > 0.0d ? titleHits / titleNormalizer : 0.0d;
        double bodyScore = bodyNormalizer > 0.0d ? bodyHits / bodyNormalizer : 0.0d;
        return titleScore + bodyScore;
    }

    /** Resolves an image {@code src} against the article URL unless it already starts with "http://". */
    private static String resolveImageUrl(Article article, String src) {
        if (src.startsWith("http://")) {
            return src;
        }
        try {
            return new URL(article.getUrl(), src).toString();
        } catch (MalformedURLException ignored) {
            // Best effort: a src that cannot be resolved is stored exactly as found in the page.
            return src;
        }
    }

    /**
     * Ranks a candidate title against a node: the summed occurrence counts of its long-enough
     * tokens in the node, plus the natural logarithm of the candidate's token count.
     *
     * @param text the candidate title; HTML entities are unescaped before tokenizing
     * @param node the node whose term counts are consulted
     * @return the rank, or {@code 0.0} when the text yields no tokens
     */
    private double rankFor(String text, HtmlNode node) {
        String[] tokens = this.mTextTokenizer.tokenize(StringEscapeUtils.unescapeHtml(text));
        if (tokens.length == 0) {
            return 0.0d;
        }
        double hits = 0.0d;
        for (String token : tokens) {
            if (token.length() >= MIN_TERM_LENGTH) {
                hits += node.getCountForTerm(token);
            }
        }
        return hits + Math.log(tokens.length);
    }
}
