package com.khala.extractor;

import java.util.Iterator;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/* loaded from: input_file:com/khala/extractor/BasicParser.class */
public class BasicParser implements Parser {
    @Override // com.khala.extractor.Parser
    public Document denoiseForDoc(Document document) {
        document.getElementsByTag("script").remove();
        document.getElementsByTag("style").remove();
        document.getElementsByTag("select").remove();
        document.getElementsByTag("link").remove();
        document.getElementsByTag("input").remove();
        document.getElementsByTag("object").remove();
        document.getElementsByTag("textarea").remove();
        document.getElementsByTag("a").attr("href", "javascript:void(0)").remove();
        document.getElementsByAttributeValue("display", "none").remove();
        document.getElementsByAttributeValueContaining("style", "display:none").remove();
        document.getElementsByAttributeValueContaining("style", "overflow: hidden").remove();
        return document;
    }

    @Override // com.khala.extractor.Parser
    public Element excavateContent(Document document) {
        Element body = document.body();
        doScoreToElement(body);
        String stringBuffer = checkPath(getMaxScoreChild(body), new StringBuffer(), document).toString();
        if (stringBuffer.contains(">p>")) {
            stringBuffer = stringBuffer.split(">p>")[0];
        }
        if (stringBuffer.endsWith(">")) {
            stringBuffer = stringBuffer.substring(0, stringBuffer.length() - 1);
        }
        if (stringBuffer.endsWith(">p")) {
            stringBuffer = stringBuffer.substring(0, stringBuffer.length() - 2);
        }
        return body.select(stringBuffer).first();
    }

    @Override // com.khala.extractor.Parser
    public void denioseForContentElement(Element element) {
    }

    @Override // com.khala.extractor.Parser
    public void downloadImg(Element element) {
        if (element.getElementsByTag("img") == null) {
        }
    }

    @Override // com.khala.extractor.Parser
    public String removeNeedlessChars(String str) {
        for (String str2 : new String[]{"<!--.[^-]*(?=-->)-->", "(?is)<!--.*?-->", "摘自\\w{2,5}网", "【[\\u4e00-\\u9fa5]{2,6}网】", "\\(([^\\(]*)?\\d{5,6}([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?简称([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?微博([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?基金吧([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?股吧([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?代码([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?记者([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?编辑([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?作者([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?点击([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?访问([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?www\\.([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?http://([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?来源([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?标题([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?微信([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?收盘价([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?客户端([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?交易所([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?行情([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?评论([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?声明([^\\(|\\)]*)?\\)", "\\(([^\\(]*)?版权([^\\(|\\)]*)?\\)"}) {
            str = str.replaceAll(str2, "");
        }
        return str;
    }

    @Override // com.khala.extractor.Parser
    public String removeTails(String str) {
        for (String str2 : new String[]{"【免责声明", "【版权声明", "【重点推荐", "【延伸阅读", "【推荐阅读", "【相关阅读", "免责声明", "版权声明", "【更多详情", "【相关专题", "上一篇：", "下一篇："}) {
            int indexOf = str.indexOf(str2);
            if (indexOf > 0) {
                str = str.substring(0, indexOf);
            }
        }
        return str;
    }

    @Override // com.khala.extractor.Parser
    public Element format(String str) {
        return Jsoup.parse(str).body();
    }

    @Override // com.khala.extractor.Parser
    public String getContent(Document document) {
        return getContentEle(document).toString().replace("<body>", "").replace("</body>", "");
    }

    public String getTitle(Document document) {
        return document.getElementsByTag("h1").size() != 0 ? ((Element) document.getElementsByTag("h1").get(0)).text() : document.getElementsByTag("title").text();
    }

    @Override // com.khala.extractor.Parser
    public Element getContentEle(Document document) {
        denoiseForDoc(document);
        Element excavateContent = excavateContent(document);
        if (excavateContent == null) {
            return null;
        }
        denioseForContentElement(excavateContent);
        downloadImg(excavateContent);
        return format(removeTails(removeNeedlessChars(excavateContent.toString())));
    }

    @Override // com.khala.extractor.Parser
    public String getContentPath() {
        return null;
    }

    @Override // com.khala.extractor.Parser
    public String getContentText(Document document) {
        return getContentEle(document).text();
    }

    private int doScoreToElement(Element element) {
        Elements children = element.children();
        if (children.size() == 0) {
            return Rating.doRate(element);
        }
        int doOwnTextRate = Rating.doOwnTextRate(element);
        Iterator it = children.iterator();
        while (it.hasNext()) {
            doOwnTextRate += doScoreToElement((Element) it.next());
        }
        element.attr("score", String.valueOf(doOwnTextRate));
        return doOwnTextRate;
    }

    private StringBuffer checkPath(Element element, StringBuffer stringBuffer, Document document) {
        if (element != null && element.parent() != null) {
            if (element.parent() != null) {
                Element parent = element.parent();
                String tagName = parent.tagName();
                if (parent.hasAttr("id")) {
                    stringBuffer.insert(0, tagName + "#" + parent.attr("id") + ">");
                } else if (parent.hasAttr("class")) {
                    String replace = parent.attr("class").trim().replace(" ", ".");
                    if ("p".equals(tagName)) {
                        replace = "";
                    }
                    if ("".equals(replace)) {
                        stringBuffer.insert(0, tagName + ">");
                    } else {
                        stringBuffer.insert(0, tagName + "." + replace + ">");
                    }
                    if (!"p".equals(tagName) && document.getElementsByClass(replace).size() <= 1) {
                        return stringBuffer;
                    }
                    stringBuffer = checkPath(element.parent(), stringBuffer, document);
                } else {
                    stringBuffer.insert(0, tagName + ">");
                    if (!"body".equals(tagName)) {
                        stringBuffer = checkPath(element.parent(), stringBuffer, document);
                    }
                }
            }
            return stringBuffer;
        }
        return stringBuffer;
    }

    public Element getMaxScoreChild(Element element) {
        if (element.childNodeSize() == 0) {
            return element;
        }
        Elements children = element.children();
        if (children == null || children.size() == 0) {
            return element;
        }
        Element first = children.first();
        int i = 0;
        Iterator it = children.iterator();
        while (it.hasNext()) {
            Element element2 = (Element) it.next();
            String attr = element2.attr("score");
            if (attr != null && Integer.valueOf(attr).intValue() > i) {
                first = element2;
                i = Integer.valueOf(attr).intValue();
            }
        }
        return getMaxScoreChild(first);
    }
}
