package org.opensextant.xtext.converters;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;
import org.opensextant.util.TextUtils;
import org.opensextant.xtext.ConvertedDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/opensextant/xtext/converters/TikaHTMLConverter.class */
public class TikaHTMLConverter extends ConverterAdapter {
    public static final int MAX_HTML_FILE_SIZE = 524288;
    HtmlParser parser;
    private boolean scrubHTMLArticle;
    private int maxHTMLDocumentSize;
    private static Logger log = LoggerFactory.getLogger(TikaHTMLConverter.class);

    public static boolean isUsefulMeta(String str) {
        String lowerCase = str.toLowerCase();
        if (lowerCase.contains("date") || lowerCase.contains("time")) {
            return true;
        }
        if (lowerCase.startsWith("twitter:") || lowerCase.startsWith("fb:")) {
            return false;
        }
        if (lowerCase.contains("description") || lowerCase.contains("subject") || lowerCase.contains("keywords") || lowerCase.startsWith("article:")) {
            return true;
        }
        log.debug("HTTP meta tag found meta={}", str);
        return false;
    }

    public TikaHTMLConverter(boolean z) throws IOException {
        this.parser = new HtmlParser();
        this.scrubHTMLArticle = false;
        this.maxHTMLDocumentSize = MAX_HTML_FILE_SIZE;
        this.scrubHTMLArticle = z;
    }

    public TikaHTMLConverter(boolean z, int i) throws IOException {
        this(z);
        this.maxHTMLDocumentSize = i;
    }

    @Override // org.opensextant.xtext.converters.ConverterAdapter
    protected ConvertedDocument conversionImplementation(InputStream inputStream, File file) throws IOException {
        Metadata metadata = new Metadata();
        HashMap hashMap = new HashMap();
        BoilerpipeContentHandler bodyContentHandler = new BodyContentHandler(this.maxHTMLDocumentSize);
        BoilerpipeContentHandler boilerpipeContentHandler = this.scrubHTMLArticle ? new BoilerpipeContentHandler(bodyContentHandler) : null;
        try {
            try {
                this.parser.parse(inputStream, this.scrubHTMLArticle ? boilerpipeContentHandler : bodyContentHandler, metadata, new ParseContext());
                if (file != null) {
                    parseHTMLMetadata(file, hashMap);
                }
                ConvertedDocument convertedDocument = new ConvertedDocument(file);
                convertedDocument.is_converted = true;
                convertedDocument.addTitle(metadata.get(TikaCoreProperties.TITLE));
                convertedDocument.setText(TextUtils.reduce_line_breaks(this.scrubHTMLArticle ? boilerpipeContentHandler.getTextDocument().getText(true, false) : bodyContentHandler.toString()));
                if (TextUtils.isASCII(convertedDocument.buffer.getBytes())) {
                    convertedDocument.setEncoding("ASCII");
                } else {
                    convertedDocument.setEncoding(metadata.get("Content-Encoding"));
                }
                convertedDocument.addProperty("filtered", this.scrubHTMLArticle);
                convertedDocument.addProperty("converter", TikaHTMLConverter.class.getName());
                if (!hashMap.isEmpty()) {
                    for (String str : hashMap.keySet()) {
                        convertedDocument.addUserProperty(str, (String) hashMap.get(str));
                    }
                }
                return convertedDocument;
            } catch (Exception e) {
                throw new IOException("Unable to parse content", e);
            }
        } finally {
            inputStream.close();
        }
    }

    private void parseHTMLMetadata(File file, Map<String, String> map) throws IOException {
        String attributeValue;
        for (StartTag startTag : new Source(file).getAllStartTags("meta")) {
            String attributeValue2 = startTag.getAttributeValue("name");
            String attributeValue3 = startTag.getAttributeValue("property");
            if (attributeValue3 == null && attributeValue2 == null) {
                log.debug("Unmatched metadata in HTML {}", startTag.toString());
            } else {
                String str = attributeValue3 != null ? attributeValue3 : attributeValue2;
                if (isUsefulMeta(str) && (attributeValue = startTag.getAttributeValue("content")) != null) {
                    map.put(str, attributeValue);
                }
            }
        }
    }
}
