package cc.wikitools.lucene;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wikiclean.WikiClean;
import org.wikiclean.WikiCleanBuilder;
import org.wikiclean.WikipediaBz2DumpInputStream;

/* loaded from: input_file:cc/wikitools/lucene/IndexWikipediaDump.class */
public class IndexWikipediaDump {
    private static final Logger LOG = Logger.getLogger(IndexWikipediaDump.class);
    public static final Analyzer ANALYZER = new StandardAnalyzer(Version.LUCENE_43);
    static final FieldType TEXT_OPTIONS = new FieldType();
    private static final int DEFAULT_NUM_THREADS = 4;
    private static final String INPUT_OPTION = "input";
    private static final String INDEX_OPTION = "index";
    private static final String MAX_OPTION = "maxdocs";
    private static final String OPTIMIZE_OPTION = "optimize";
    private static final String THREADS_OPTION = "threads";

    /* loaded from: input_file:cc/wikitools/lucene/IndexWikipediaDump$AddDocumentRunnable.class */
    private static class AddDocumentRunnable implements Runnable {
        private final IndexWriter writer;
        private final WikiClean cleaner;
        private final String page;

        AddDocumentRunnable(IndexWriter indexWriter, WikiClean wikiClean, String str) {
            this.writer = indexWriter;
            this.cleaner = wikiClean;
            this.page = str;
        }

        @Override // java.lang.Runnable
        public void run() {
            Document document = new Document();
            document.add(new IntField(IndexField.ID.name, Integer.parseInt(this.cleaner.getId(this.page)), Field.Store.YES));
            document.add(new Field(IndexField.TEXT.name, this.cleaner.clean(this.page), IndexWikipediaDump.TEXT_OPTIONS));
            document.add(new Field(IndexField.TITLE.name, this.cleaner.getTitle(this.page), IndexWikipediaDump.TEXT_OPTIONS));
            try {
                this.writer.addDocument(document);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /* loaded from: input_file:cc/wikitools/lucene/IndexWikipediaDump$IndexField.class */
    public enum IndexField {
        ID("id"),
        TITLE("title"),
        TEXT("text");

        public final String name;

        IndexField(String str) {
            this.name = str;
        }
    }

    public static void main(String[] strArr) throws Exception {
        Options options = new Options();
        OptionBuilder.withArgName("path");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("bz2 Wikipedia XML dump file");
        options.addOption(OptionBuilder.create(INPUT_OPTION));
        OptionBuilder.withArgName("dir");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("index location");
        options.addOption(OptionBuilder.create(INDEX_OPTION));
        OptionBuilder.withArgName("num");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("maximum number of documents to index");
        options.addOption(OptionBuilder.create(MAX_OPTION));
        OptionBuilder.withArgName("num");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("number of indexing threads");
        options.addOption(OptionBuilder.create(THREADS_OPTION));
        options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
        CommandLine commandLine = null;
        try {
            commandLine = new GnuParser().parse(options, strArr);
        } catch (ParseException e) {
            System.err.println("Error parsing command line: " + e.getMessage());
            System.exit(-1);
        }
        if (!commandLine.hasOption(INPUT_OPTION) || !commandLine.hasOption(INDEX_OPTION)) {
            new HelpFormatter().printHelp(IndexWikipediaDump.class.getCanonicalName(), options);
            System.exit(-1);
        }
        String optionValue = commandLine.getOptionValue(INDEX_OPTION);
        int parseInt = commandLine.hasOption(MAX_OPTION) ? Integer.parseInt(commandLine.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE;
        int parseInt2 = commandLine.hasOption(THREADS_OPTION) ? Integer.parseInt(commandLine.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS;
        long currentTimeMillis = System.currentTimeMillis();
        String optionValue2 = commandLine.getOptionValue(INPUT_OPTION);
        PrintStream printStream = new PrintStream((OutputStream) System.out, true, "UTF-8");
        WikiClean build = new WikiCleanBuilder().withTitle(true).build();
        FSDirectory open = FSDirectory.open(new File(optionValue));
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, ANALYZER);
        indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(open, indexWriterConfig);
        LOG.info("Creating index at " + optionValue);
        LOG.info("Indexing with " + parseInt2 + " threads");
        try {
            try {
                WikipediaBz2DumpInputStream wikipediaBz2DumpInputStream = new WikipediaBz2DumpInputStream(optionValue2);
                ExecutorService newFixedThreadPool = Executors.newFixedThreadPool(parseInt2);
                int i = 0;
                while (true) {
                    String readNext = wikipediaBz2DumpInputStream.readNext();
                    if (readNext == null) {
                        break;
                    }
                    String title = build.getTitle(readNext);
                    if (!title.startsWith("Wikipedia:") && !title.startsWith("Portal:") && !title.startsWith("File:") && !readNext.contains("#REDIRECT") && !readNext.contains("#redirect") && !readNext.contains("#Redirect")) {
                        newFixedThreadPool.execute(new AddDocumentRunnable(indexWriter, build, readNext));
                        i++;
                        if (i % 10000 == 0) {
                            LOG.info(i + " articles added");
                        }
                        if (i >= parseInt) {
                            break;
                        }
                    }
                }
                newFixedThreadPool.shutdown();
                do {
                } while (!newFixedThreadPool.isTerminated());
                LOG.info("Total of " + i + " articles indexed.");
                if (commandLine.hasOption(OPTIMIZE_OPTION)) {
                    LOG.info("Merging segments...");
                    indexWriter.forceMerge(1);
                    LOG.info("Done!");
                }
                LOG.info("Total elapsed time: " + (System.currentTimeMillis() - currentTimeMillis) + "ms");
                indexWriter.close();
                open.close();
                printStream.close();
            } catch (Exception e2) {
                e2.printStackTrace();
                indexWriter.close();
                open.close();
                printStream.close();
            }
        } catch (Throwable th) {
            indexWriter.close();
            open.close();
            printStream.close();
            throw th;
        }
    }

    static {
        TEXT_OPTIONS.setIndexed(true);
        TEXT_OPTIONS.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        TEXT_OPTIONS.setStored(true);
        TEXT_OPTIONS.setTokenized(true);
    }
}
