package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.utils.DateUtils;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.ConvertedDocument;
import org.opensextant.xtext.ExclusionFilter;
import org.opensextant.xtext.collectors.CollectionListener;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/opensextant/xtext/collectors/web/DefaultWebCrawl.class */
/**
 * A web crawl that starts at the configured site, archives each fetched page or file
 * to disk, optionally converts it, and recursively follows links found on web pages.
 *
 * <p>Links are screened in this order: built-in exclusions ({@link #filterOutFile(String)}
 * and page anchors), the current-site / current-directory restrictions, links already
 * found in this session, and finally the user-supplied prefix filters
 * ({@link #userFilteredOut(String)}).
 */
public class DefaultWebCrawl extends WebClient implements ExclusionFilter, Collector, CrawlFilter {
    /** Linked web pages are only followed while the crawl depth is at or below this value. */
    private static final int MAX_CRAWL_DEPTH = 5;

    /** Optional listener notified about collected items; may be {@code null}. */
    protected CollectionListener listener;
    private final Logger log;
    private boolean allowCurrentSiteOnly;
    private boolean allowCurrentDirOnly;
    /** Absolute URLs that answered with HTTP status &gt;= 400 during this session. */
    private final Set<String> errorPages;
    /** When non-empty, a path must start with one of these prefixes to be collected. */
    private final List<String> prefixFilters;
    /** A path starting with any of these prefixes is never collected. */
    private final List<String> prefixIgnore;

    /**
     * Creates a crawler; both arguments are handed unchanged to the {@code WebClient}
     * constructor. By default the crawl is limited to the current site but not to the
     * current directory.
     *
     * @param str  first {@code WebClient} constructor argument
     * @param str2 second {@code WebClient} constructor argument
     * @throws MalformedURLException as declared by the superclass constructor
     * @throws ConfigException as declared by the superclass constructor
     */
    public DefaultWebCrawl(String str, String str2) throws MalformedURLException, ConfigException {
        super(str, str2);
        this.listener = null;
        this.log = LoggerFactory.getLogger(getClass());
        this.allowCurrentSiteOnly = true;
        this.allowCurrentDirOnly = false;
        this.errorPages = new HashSet<>();
        this.prefixFilters = new ArrayList<>();
        this.prefixIgnore = new ArrayList<>();
    }

    /**
     * Sets the listener that is asked whether an item already exists and is told about
     * collected items.
     *
     * @param collectionListener the listener, or {@code null} for none
     */
    public void setListener(CollectionListener collectionListener) {
        this.listener = collectionListener;
    }

    /**
     * Adds a required path prefix. Blank or {@code null} values are ignored.
     *
     * @param str prefix a link path must start with in order to be collected
     */
    public void addPrefixFilter(String str) {
        if (StringUtils.isNotBlank(str)) {
            this.prefixFilters.add(str);
        }
    }

    /**
     * Adds several required path prefixes. Each entry is validated like
     * {@link #addPrefixFilter(String)}: an empty prefix would match every path and a
     * {@code null} entry would fail later in {@code startsWith}.
     *
     * @param collection prefixes to add; {@code null} is a no-op
     */
    public void addPrefixFilters(Collection<String> collection) {
        if (collection != null) {
            for (String prefix : collection) {
                addPrefixFilter(prefix);
            }
        }
    }

    /**
     * Adds a path prefix to ignore. Blank or {@code null} values are ignored.
     *
     * @param str prefix identifying link paths that must not be collected
     */
    public void addIgnoreFilter(String str) {
        if (StringUtils.isNotBlank(str)) {
            this.prefixIgnore.add(str);
        }
    }

    /**
     * Adds several path prefixes to ignore. Each entry is validated like
     * {@link #addIgnoreFilter(String)}.
     *
     * @param collection prefixes to add; {@code null} is a no-op
     */
    public void addIgnoreFilters(Collection<String> collection) {
        if (collection != null) {
            for (String prefix : collection) {
                addIgnoreFilter(prefix);
            }
        }
    }

    /**
     * Built-in exclusion of links that are never collected: {@code mailto:} links,
     * {@code .atom}/{@code .rss} feeds, {@code .flv}/{@code .mp4} video and anything
     * containing {@code xmlrpc}. Matching is case-insensitive.
     *
     * @param str link or path to test; {@code null} is treated as excluded
     * @return {@code true} if the link must be skipped
     */
    @Override
    public boolean filterOutFile(String str) {
        if (str == null) {
            return true;
        }
        // Locale.ROOT keeps the ASCII comparisons stable regardless of the default locale.
        String lowered = str.toLowerCase(Locale.ROOT);
        return lowered.startsWith("mailto:")
                || lowered.endsWith(".atom")
                || lowered.endsWith(".rss")
                || lowered.endsWith(".flv")
                || lowered.endsWith(".mp4")
                || lowered.contains("xmlrpc");
    }

    /**
     * Crawls starting from the configured site. A {@link NoSuchAlgorithmException} is
     * logged and not propagated.
     *
     * @throws IOException if the starting page cannot be fetched or archived
     */
    @Override
    public void collect() throws IOException {
        try {
            collectItems(null, getSite());
        } catch (NoSuchAlgorithmException e) {
            this.log.error("Hashing error", e);
        }
    }

    /**
     * Decides whether a link is skipped before any other check: either it matches
     * {@link #filterOutFile(String)} or it is a page anchor.
     *
     * @param hyperLink link to test
     * @return {@code true} if the link must be skipped
     */
    public boolean filterOut(HyperLink hyperLink) {
        if (filterOutFile(hyperLink.getAbsoluteURL())) {
            return true;
        }
        if (hyperLink.isPageAnchor()) {
            this.log.debug("Filter out anchor link {}", hyperLink);
            return true;
        }
        return false;
    }

    /**
     * Fetches one page, archives its text if no archive file exists yet, and then
     * collects the items linked from it. Pages recorded as error pages in this session
     * are not visited again.
     *
     * @param str link to fetch; when {@code null}, {@code url} is used instead
     * @param url fallback starting URL, only consulted when {@code str} is {@code null}
     * @throws IOException if the page cannot be fetched, has no body, or cannot be archived
     * @throws NoSuchAlgorithmException as declared by the helpers this method calls
     */
    public void collectItems(String str, URL url) throws IOException, NoSuchAlgorithmException {
        // Only touch the fallback URL when no explicit link was given.
        String link = (str != null) ? str : url.toString();
        HyperLink hyperLink = new HyperLink(link, new URL(link), getSite());
        if (this.errorPages.contains(hyperLink.getAbsoluteURL())) {
            this.log.debug("Do not visit error pages tracked in this session; link: {}", link);
            return;
        }

        HttpResponse page = getPage(prepURL(link));
        if (page.getEntity() == null) {
            throw new IOException("No content returned for " + link);
        }
        String content = WebClient.readTextStream(page.getEntity().getContent());

        String archivePath = hyperLink.getNormalPath();
        if (hyperLink.isDynamic() && !archivePath.endsWith("html")) {
            archivePath = String.format("%s.html", archivePath);
        }
        File archiveFile = createArchiveFile(archivePath, hyperLink.isFolder());
        if (!archiveFile.exists()) {
            FileUtility.writeFile(content, archiveFile.getAbsolutePath());
        }

        this.log.info("Starting in on {} from {} @ depth={}", link, this.site, this.depth);
        pause();

        // Resolve the arguments first so the increment below is always paired with the
        // decrement performed by collectItemsOnPage.
        URL pageUrl = hyperLink.getURL();
        URL siteUrl = getSite();
        this.depth++;
        collectItemsOnPage(content, pageUrl, siteUrl);
    }

    /**
     * Collects the items linked from a locally stored page, resolving links against the
     * configured site.
     *
     * @param file local file whose content is parsed for links
     * @throws IOException if the file cannot be read
     */
    public void collect(File file) throws IOException {
        String content = FileUtility.readFile(file, ConvertedDocument.OUTPUT_ENCODING);
        URL siteUrl = getSite();
        // collectItemsOnPage always decrements the depth on exit; increment here, as
        // collectItems does, so the counter does not drift below its starting value.
        this.depth++;
        collectItemsOnPage(content, siteUrl, siteUrl);
    }

    /**
     * Applies the user-supplied prefix lists to a link path.
     *
     * @param str link path to test
     * @return {@code true} if required prefixes are configured and none matches, or if
     *         any ignore prefix matches; {@code false} otherwise
     */
    protected boolean userFilteredOut(String str) {
        if (!this.prefixFilters.isEmpty() && !startsWithAny(str, this.prefixFilters)) {
            return true;
        }
        return startsWithAny(str, this.prefixIgnore);
    }

    /** Returns {@code true} if {@code value} starts with at least one of {@code prefixes}. */
    private static boolean startsWithAny(String value, List<String> prefixes) {
        for (String prefix : prefixes) {
            if (value.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Asks the listener whether the link was already collected. A listener failure is
     * logged and also causes the link to be skipped.
     */
    private boolean skippedByListener(HyperLink hyperLink) {
        if (this.listener == null) {
            return false;
        }
        try {
            return this.listener.exists(hyperLink.getId());
        } catch (Exception e) {
            this.log.error("Collection Listener error", e);
            return true;
        }
    }

    /**
     * Parses a page for links and, for each acceptable link not seen before, downloads
     * it to the archive, converts it and recurses into web pages while the depth limit
     * allows. A failure on one link is logged and does not stop the remaining links.
     * The crawl depth is decremented when this method exits, normally or not.
     *
     * @param str  page content to parse
     * @param url  URL of the page the content came from
     * @param url2 site URL; passed to the parser and to the recursive crawl
     */
    protected void collectItemsOnPage(String str, URL url, URL url2) {
        try {
            for (HyperLink hyperLink : parseContentPage(str, url, url2)) {
                if (filterOut(hyperLink)) {
                    continue;
                }
                if (isAllowCurrentSiteOnly() && !hyperLink.isCurrentSite() && !hyperLink.isCurrentHost()) {
                    this.log.debug("Not on current site: {}", hyperLink);
                    continue;
                }
                if (isAllowCurrentDirOnly() && !hyperLink.isCurrentPage()) {
                    this.log.debug("Not on current directory: {}", hyperLink);
                    continue;
                }

                String key = hyperLink.getNormalPath();
                if (key == null) {
                    key = hyperLink.getAbsoluteURL();
                }
                if (this.found.containsKey(key)) {
                    continue;
                }
                if (userFilteredOut(key)) {
                    this.log.debug("Filtered Out by User: {}", key);
                    continue;
                }
                this.found.put(key, hyperLink);

                if (this.saved.contains(hyperLink.getId()) || !(hyperLink.isFile() || hyperLink.isWebPage())) {
                    continue;
                }
                pause();
                this.log.info("Pulling page {}", hyperLink);
                if (skippedByListener(hyperLink)) {
                    continue;
                }

                try {
                    HttpResponse page = getPage(hyperLink.getURL());
                    if (page.getStatusLine().getStatusCode() >= 400) {
                        // Remember the failure so collectItems does not revisit this URL.
                        this.errorPages.add(hyperLink.getAbsoluteURL());
                        this.log.error("Failing on this request, HTTP status>=400, LINK={}", hyperLink.getURL());
                        continue;
                    }
                    Header contentType = page.getEntity().getContentType();
                    if (contentType != null) {
                        hyperLink.setMIMEType(contentType.getValue());
                    }
                    String archivePath = hyperLink.getNormalPath();
                    if (hyperLink.isDynamic() && !archivePath.endsWith(".html")) {
                        archivePath = archivePath + ".html";
                    }
                    File archiveFile = createArchiveFile(archivePath, false);
                    FileUtility.makeDirectory(archiveFile.getParentFile().getAbsoluteFile());
                    hyperLink.setFilepath(archiveFile);
                    this.saved.add(hyperLink.getId());
                    WebClient.downloadFile(page.getEntity(), archiveFile.getAbsolutePath());
                    convertContent(archiveFile, hyperLink);
                    if (hyperLink.isWebPage() && this.depth <= MAX_CRAWL_DEPTH) {
                        collectItems(hyperLink.getAbsoluteURL(), url2);
                    }
                } catch (Exception e) {
                    // Broad on purpose: one bad link must not abort the rest of the page.
                    this.log.error("Item for URL {} was not saved due to a net or IO issue.", hyperLink.getAbsoluteURL(), e);
                }
            }
        } finally {
            this.depth--;
        }
    }

    /**
     * Post-processes a downloaded file. Without a converter the listener (if any) is
     * simply told about the raw file. With a converter, an existing file is converted,
     * tagged with the link's id and source URL, its text buffer is saved, and the
     * listener (if any) is notified with the converted document.
     *
     * @param file      downloaded archive file
     * @param hyperLink link the file was downloaded from
     * @throws IOException if either argument is {@code null}, or on conversion/save IO failure
     * @throws ConfigException as declared by the conversion calls
     * @throws NoSuchAlgorithmException as declared by the conversion calls
     */
    protected void convertContent(File file, HyperLink hyperLink) throws IOException, ConfigException, NoSuchAlgorithmException {
        if (file == null || hyperLink == null) {
            throw new IOException("Bad data - null values for file and link...");
        }
        if (this.converter == null) {
            // Nothing to convert with; never fall through to converter.convert() below.
            if (this.listener != null) {
                this.log.debug("Link {} was saved to {}", hyperLink.getAbsoluteURL(), file.getAbsolutePath());
                this.listener.collected(file);
            }
            return;
        }
        if (!file.exists()) {
            return;
        }
        ConvertedDocument converted = this.converter.convert(file);
        if (converted == null) {
            this.log.error("Document was not converted, FILE={}", file);
            return;
        }
        if (converted.textpath == null) {
            this.log.error("Expecting the content to be non-null for {}", converted.getFilepath());
            return;
        }
        converted.setId(hyperLink.getId());
        converted.addSourceURL(hyperLink.getAbsoluteURL(), hyperLink.getReferrer());
        converted.saveBuffer(new File(converted.textpath));
        if (this.listener != null) {
            this.listener.collected(converted, file.getAbsolutePath());
        }
    }

    /** @return {@code true} if only links on the current page's directory are followed */
    @Override
    public boolean isAllowCurrentDirOnly() {
        return this.allowCurrentDirOnly;
    }

    /** @param z {@code true} to follow only links for which {@code isCurrentPage()} holds */
    @Override
    public void setAllowCurrentDirOnly(boolean z) {
        this.allowCurrentDirOnly = z;
    }

    /** @return {@code true} if links outside the current site and host are skipped */
    @Override
    public boolean isAllowCurrentSiteOnly() {
        return this.allowCurrentSiteOnly;
    }

    /** @param z {@code true} to skip links that are on neither the current site nor host */
    @Override
    public void setAllowCurrentSiteOnly(boolean z) {
        this.allowCurrentSiteOnly = z;
    }
}
