package com.entopix.maui.main;

import com.entopix.maui.filters.MauiFilter;
import com.entopix.maui.stemmers.PorterStemmer;
import com.entopix.maui.stemmers.Stemmer;
import com.entopix.maui.stopwords.Stopwords;
import com.entopix.maui.stopwords.StopwordsEnglish;
import com.entopix.maui.util.DataLoader;
import com.entopix.maui.util.Evaluator;
import com.entopix.maui.util.MauiDocument;
import com.entopix.maui.util.MauiTopics;
import com.entopix.maui.util.Topic;
import com.entopix.maui.vocab.Vocabulary;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Vector;
import org.apache.xerces.impl.xs.SchemaSymbols;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import weka.classifiers.lazy.kstar.KStarConstants;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;

/* loaded from: input_file:WEB-INF/lib/maui-1.4.6.jar:com/entopix/maui/main/MauiTopicExtractor.class */
/**
 * Extracts topics (keyphrases) from a directory of documents using a previously
 * trained {@link MauiFilter} model, writes them next to each document in a
 * {@code .maui} file and hands the result to {@link Evaluator}.
 *
 * <p>Typical use, as done by {@link #main(String[])}: {@link #setOptions(String[])},
 * {@link #loadModel()}, {@link #loadDocuments()}, {@link #extractTopics(List)},
 * {@link #printTopics(List)}.
 */
public class MauiTopicExtractor implements OptionHandler {
    private static final Logger log = LoggerFactory.getLogger(MauiTopicExtractor.class);

    /** Package prepended to a simple class name given with {@code -t}. */
    private static final String STEMMER_PACKAGE = "com.entopix.maui.stemmers.";

    /** Package prepended to a simple class name given with {@code -s}. */
    private static final String STOPWORDS_PACKAGE = "com.entopix.maui.stopwords.";

    /** Historic fixed length of the array returned by {@link #getOptions()}; shorter results are padded with "". */
    private static final int MIN_OPTIONS_LENGTH = 22;

    /** Extension of input documents that is swapped for {@link #TOPICS_SUFFIX}. */
    private static final String DOCUMENT_SUFFIX = ".txt";

    /** Extension of the per-document topic files written by {@link #printTopics(List)}. */
    private static final String TOPICS_SUFFIX = ".maui";

    /** Id assigned to every extracted topic. */
    private static final String TOPIC_ID = "1";

    /** Directory containing the documents to process ({@code -l}). */
    public String inputDirectoryName = null;

    /** Path of the serialized model ({@code -m}). */
    public String modelName = null;

    /** Name of the controlled vocabulary, or "none" for free indexing ({@code -v}). */
    public String vocabularyName = "none";

    /** Format of the vocabulary, "skos" or "text" ({@code -f}). */
    public String vocabularyFormat = null;

    /** Language code of the documents ({@code -i}). */
    public String documentLanguage = "en";

    /** Encoding used when writing topic files; "default" means the platform charset ({@code -e}). */
    public String documentEncoding = "default";

    /** Value forwarded to {@code Vocabulary.setSerialize} ({@code -z}). */
    public boolean serialize = false;

    /** Only topics whose probability is strictly greater than this value are kept ({@code -c}). */
    public double cutOffTopicProbability = 0.0;

    private MauiFilter mauiFilter = null;

    /** Maximum number of topics kept per document ({@code -n}). */
    int topicsPerDocument = 10;

    /** Stemmer handed to the model and the vocabulary ({@code -t}). */
    public Stemmer stemmer = new PorterStemmer();

    /** Stopword list handed to the vocabulary ({@code -s}). */
    public Stopwords stopwords = new StopwordsEnglish();

    private Vocabulary vocabulary = null;

    /** When set, the topic probability is written after each topic title ({@code -a}). */
    boolean additionalInfo = false;

    /** When set, the model's global dictionary is discarded in {@link #loadModel()} ({@code -b}). */
    boolean buildGlobalDictionary = false;

    /**
     * Parses the command-line options.
     *
     * <p>{@code -l} (directory) and {@code -m} (model) are mandatory. {@code -f} is
     * mandatory and must be "skos" or "text" whenever {@code -v} names a vocabulary
     * other than "none". {@code -t} and {@code -s} accept either a simple class name,
     * resolved inside the Maui stemmer/stopwords package, or a fully-qualified class
     * name such as the one produced by {@link #getOptions()}.
     *
     * @param strArr the options; recognised entries are consumed by the Weka helpers
     * @throws Exception if a mandatory option is missing, a value is invalid, a class
     *         cannot be instantiated, or unrecognised options remain
     */
    @Override // weka.core.OptionHandler
    public void setOptions(String[] strArr) throws Exception {
        String directory = Utils.getOption('l', strArr);
        if (directory.length() <= 0) {
            this.inputDirectoryName = null;
            throw new Exception("Name of directory required argument.");
        }
        this.inputDirectoryName = directory;

        String model = Utils.getOption('m', strArr);
        if (model.length() <= 0) {
            this.modelName = null;
            throw new Exception("Path to the model file is a required argument.");
        }
        this.modelName = model;

        String vocabularyOption = Utils.getOption('v', strArr);
        if (vocabularyOption.length() > 0) {
            this.vocabularyName = vocabularyOption;
        }

        String format = Utils.getOption('f', strArr);
        if (vocabularyOption.length() > 0 && !vocabularyOption.equals("none")) {
            if (format.length() <= 0) {
                throw new Exception("If a controlled vocabulary is used, format of vocabulary required argument (skos or text).");
            }
            if (!format.equals("skos") && !format.equals("text")) {
                throw new Exception("Unsupported format of vocabulary. It should be either \"skos\" or \"text\".");
            }
            this.vocabularyFormat = format;
        }

        String encoding = Utils.getOption('e', strArr);
        if (encoding.length() > 0) {
            this.documentEncoding = encoding;
        }

        String language = Utils.getOption('i', strArr);
        if (language.length() > 0) {
            this.documentLanguage = language;
        }

        String topicCount = Utils.getOption('n', strArr);
        if (topicCount.length() > 0) {
            this.topicsPerDocument = Integer.parseInt(topicCount);
        }

        String stopwordsClass = Utils.getOption('s', strArr);
        if (stopwordsClass.length() > 0) {
            this.stopwords = instantiate(Stopwords.class, STOPWORDS_PACKAGE, stopwordsClass);
        }

        String stemmerClass = Utils.getOption('t', strArr);
        if (stemmerClass.length() > 0) {
            this.stemmer = instantiate(Stemmer.class, STEMMER_PACKAGE, stemmerClass);
        }

        this.serialize = Utils.getFlag('z', strArr);
        this.buildGlobalDictionary = Utils.getFlag('b', strArr);
        this.additionalInfo = Utils.getFlag('a', strArr);

        String cutOff = Utils.getOption('c', strArr);
        if (cutOff.length() > 0) {
            this.cutOffTopicProbability = Double.parseDouble(cutOff);
        }

        Utils.checkForRemainingOptions(strArr);
    }

    /**
     * Loads and instantiates a class through its no-argument constructor.
     *
     * @param type the type the class must be assignable to
     * @param defaultPackage package prefix (with trailing dot) used when {@code className} contains no dot
     * @param className simple or fully-qualified class name
     * @return a new instance of the class
     * @throws Exception if the class cannot be found, is not a {@code type}, or cannot be instantiated
     */
    private static <T> T instantiate(Class<T> type, String defaultPackage, String className) throws Exception {
        // A dotted name is already fully qualified (getOptions() emits those); prefixing it would never resolve.
        String qualifiedName = className.indexOf('.') >= 0 ? className : defaultPackage + className;
        return Class.forName(qualifiedName).asSubclass(type).getDeclaredConstructor().newInstance();
    }

    /**
     * Returns the current settings as an option array.
     *
     * <p>Every value option is always emitted (a {@code null} field is rendered as the
     * text "null"); the flags {@code -z}, {@code -b} and {@code -a} are emitted only
     * when set. The array is padded with empty strings up to 22 entries and is 23
     * entries long when all three flags are set.
     *
     * @return the option array, never containing {@code null} entries
     */
    @Override // weka.core.OptionHandler
    public String[] getOptions() {
        List<String> options = new ArrayList<>(MIN_OPTIONS_LENGTH + 1);
        addOption(options, "-l", String.valueOf(this.inputDirectoryName));
        addOption(options, "-m", String.valueOf(this.modelName));
        addOption(options, "-v", String.valueOf(this.vocabularyName));
        addOption(options, "-f", String.valueOf(this.vocabularyFormat));
        addOption(options, "-e", String.valueOf(this.documentEncoding));
        addOption(options, "-i", String.valueOf(this.documentLanguage));
        addOption(options, "-n", String.valueOf(this.topicsPerDocument));
        addOption(options, "-c", String.valueOf(this.cutOffTopicProbability));
        addOption(options, "-t", this.stemmer.getClass().getName());
        addOption(options, "-s", this.stopwords.getClass().getName());
        if (this.serialize) {
            options.add("-z");
        }
        if (this.buildGlobalDictionary) {
            options.add("-b");
        }
        if (this.additionalInfo) {
            options.add("-a");
        }
        // Keep the historic minimum length so callers that index into the array keep working.
        while (options.size() < MIN_OPTIONS_LENGTH) {
            options.add("");
        }
        return options.toArray(new String[0]);
    }

    /** Appends an option switch followed by its value. */
    private static void addOption(List<String> options, String flag, String value) {
        options.add(flag);
        options.add(value);
    }

    /**
     * Supplies an already loaded vocabulary, which makes {@link #loadVocabulary()} a no-op.
     *
     * @param vocabulary the vocabulary to use
     */
    public void setVocabulary(Vocabulary vocabulary) {
        this.vocabulary = vocabulary;
    }

    /**
     * Sets the probability a topic has to exceed to be kept.
     *
     * @param d the new cut-off probability
     */
    public void setTopicProbability(double d) {
        this.cutOffTopicProbability = d;
    }

    /**
     * Describes the options understood by {@link #setOptions(String[])}.
     *
     * @return an enumeration of the available options
     */
    @Override // weka.core.OptionHandler
    public Enumeration<Option> listOptions() {
        Vector<Option> options = new Vector<>(13);
        options.addElement(new Option("\tSpecifies name of directory.", "l", 1, "-l <directory name>"));
        options.addElement(new Option("\tSpecifies name of model.", "m", 1, "-m <model name>"));
        options.addElement(new Option("\tSpecifies vocabulary name.", "v", 1, "-v <vocabulary name>"));
        options.addElement(new Option("\tSpecifies vocabulary format.", "f", 1, "-f <vocabulary format>"));
        options.addElement(new Option("\tSpecifies encoding.", "e", 1, "-e <encoding>"));
        options.addElement(new Option("\tSpecifies document language (en (default), es, de, fr).", "i", 1, "-i <document language>"));
        options.addElement(new Option("\tSpecifies number of phrases to be output (default: 10).", "n", 1, "-n"));
        options.addElement(new Option("\tSpecifies cut off probability for each topic (default: 0.0).", "c", 1, "-c"));
        options.addElement(new Option("\tSet the stemmer to use (default: PorterStemmer).", "t", 1, "-t <name of stemmer class>"));
        options.addElement(new Option("\tSet the stopwords class to use (default: StopwordsEnglish).", "s", 1, "-s <name of stopwords class>"));
        // The parser reads the serialization flag as -z; -s is the stopwords class.
        options.addElement(new Option("\tTurns serialization on.", "z", 0, "-z"));
        options.addElement(new Option("\tBuilds global dictionaries for computing TFIDF from the test collection.", "b", 0, "-b"));
        options.addElement(new Option("\tAlso write stemmed phrase and score into \".key\" file.", "a", 0, "-a"));
        return options.elements();
    }

    /**
     * Creates and initializes the vocabulary from the current settings unless one is
     * already present.
     *
     * <p>Failures are logged and not propagated. The field is assigned before
     * initialization, so after a failure it holds the partially configured vocabulary.
     */
    public void loadVocabulary() {
        if (this.vocabulary != null) {
            return;
        }
        try {
            log.info("--- Loading the vocabulary...");
            this.vocabulary = new Vocabulary();
            this.vocabulary.setStemmer(this.stemmer);
            // The "lcsh" vocabulary is loaded without a stopword list.
            if (!this.vocabularyName.equals("lcsh")) {
                this.vocabulary.setStopwords(this.stopwords);
            }
            this.vocabulary.setLanguage(this.documentLanguage);
            this.vocabulary.setSerialize(this.serialize);
            this.vocabulary.initializeVocabulary(this.vocabularyName, this.vocabularyFormat);
        } catch (Exception e) {
            log.error("Failed to load thesaurus!", e);
        }
    }

    /**
     * Loads the test documents from {@link #inputDirectoryName}.
     *
     * @return the loaded documents
     * @throws MauiFilter.MauiFilterException if the loader fails
     */
    public List<MauiDocument> loadDocuments() throws MauiFilter.MauiFilterException {
        return DataLoader.loadTestDocuments(this.inputDirectoryName);
    }

    /**
     * Runs the model over every document and collects the extracted topics.
     *
     * <p>For each document at most {@link #topicsPerDocument} topics with a probability
     * strictly greater than {@link #cutOffTopicProbability} are kept, in the order the
     * filter emits them. A topic is marked correct when the last attribute of its
     * output instance equals 1.
     *
     * @param list the documents to process
     * @return one {@link MauiTopics} per document, in input order
     * @throws MauiFilter.MauiFilterException if the filter fails
     * @throws IllegalStateException if no model has been set via {@link #loadModel()} or {@link #setModel(MauiFilter)}
     */
    public List<MauiTopics> extractTopics(List<MauiDocument> list) throws MauiFilter.MauiFilterException {
        if (this.mauiFilter == null) {
            throw new IllegalStateException("No model available: call loadModel() or setModel() first.");
        }
        List<MauiTopics> allTopics = new ArrayList<>();

        FastVector attributes = new FastVector(3);
        attributes.addElement(new Attribute("filename", (FastVector) null));
        attributes.addElement(new Attribute("doc", (FastVector) null));
        attributes.addElement(new Attribute("keyphrases", (FastVector) null));
        Instances data = new Instances("keyphrase_training_data", attributes, 0);

        log.info("-- Extracting keyphrases... ");
        for (MauiDocument document : list) {
            double[] values = new double[3];
            values[0] = data.attribute(0).addStringValue(document.getFileName());
            values[1] = document.getTextContent().length() > 0
                    ? data.attribute(1).addStringValue(document.getTextContent())
                    : Instance.missingValue();
            values[2] = document.getTopicsString().length() > 0
                    ? data.attribute(2).addStringValue(document.getTopicsString())
                    : Instance.missingValue();
            data.add(new Instance(1.0d, values));
            this.mauiFilter.input(data.instance(0));
            // Start the next document from an empty header without the accumulated string values.
            data = data.stringFreeStructure();

            log.info("-- Processing document: " + document.getFileName());
            MauiTopics documentTopics = new MauiTopics(document.getFilePath());
            documentTopics.setPossibleCorrect(document.getTopicsString().split("\n").length);

            log.info("-- Keyphrases and feature values:");
            int kept = 0;
            Instance candidate;
            // Read until the filter has nothing left. Candidates beyond the limit are still
            // consumed, presumably so they do not carry over to the next document.
            while ((candidate = this.mauiFilter.output()) != null) {
                double probability = candidate.value(this.mauiFilter.getProbabilityIndex());
                if (kept >= this.topicsPerDocument || probability <= this.cutOffTopicProbability) {
                    continue;
                }
                String title = candidate.stringValue(this.mauiFilter.getOutputFormIndex());
                Topic topic = new Topic(title, TOPIC_ID, probability);
                topic.setCorrectness(((int) candidate.value(candidate.numAttributes() - 1)) == 1);
                documentTopics.addTopic(topic);
                log.info("Topic {} {} {} > {}", title, TOPIC_ID, probability, topic.isCorrect());
                kept++;
            }
            allTopics.add(documentTopics);
        }
        this.mauiFilter.batchFinished();
        return allTopics;
    }

    /**
     * Loads the model named by {@link #modelName} and configures it with the current
     * settings. Unless the vocabulary name is "none", the vocabulary is loaded as well
     * and handed to the model.
     */
    public void loadModel() {
        this.mauiFilter = DataLoader.loadModel(this.modelName);
        if (this.buildGlobalDictionary) {
            log.info("-- The global dictionaries will be built from this test collection..");
            this.mauiFilter.globalDictionary = null;
        }
        this.mauiFilter.setVocabularyName(this.vocabularyName);
        this.mauiFilter.setVocabularyFormat(this.vocabularyFormat);
        this.mauiFilter.setDocumentLanguage(this.documentLanguage);
        this.mauiFilter.setStemmer(this.stemmer);
        if (!this.vocabularyName.equals("none")) {
            loadVocabulary();
            this.mauiFilter.setVocabulary(this.vocabulary);
        }
    }

    /**
     * Writes the topics of every document to a {@code .maui} file next to the document,
     * one topic title per line, followed by a tab and the probability when additional
     * info is enabled.
     *
     * <p>I/O failures are logged per document and do not stop the remaining documents.
     *
     * @param list the topics to write
     */
    public void printTopics(List<MauiTopics> list) {
        boolean platformCharset = this.documentEncoding == null || this.documentEncoding.equals("default");
        for (MauiTopics documentTopics : list) {
            String outputPath = topicsFilePath(documentTopics.getFilePath());
            try (FileOutputStream stream = new FileOutputStream(outputPath);
                    PrintWriter writer = platformCharset
                            ? new PrintWriter(stream)
                            : new PrintWriter(new OutputStreamWriter(stream, this.documentEncoding))) {
                for (Topic topic : documentTopics.getTopics()) {
                    writer.print(topic.getTitle());
                    if (this.additionalInfo) {
                        writer.print("\t");
                        writer.print(topic.getProbability());
                    }
                    writer.println();
                }
                // PrintWriter swallows write errors; surface them instead of failing silently.
                if (writer.checkError()) {
                    log.error("Error while writing topics to " + outputPath);
                }
            } catch (IOException e) {
                log.error("Failed to write topics to " + outputPath, e);
            }
        }
    }

    /**
     * Derives the topic file path for a document: a trailing ".txt" is replaced by
     * ".maui"; any other path gets ".maui" appended so the document itself is never
     * overwritten.
     */
    private static String topicsFilePath(String documentPath) {
        String base = documentPath.endsWith(DOCUMENT_SUFFIX)
                ? documentPath.substring(0, documentPath.length() - DOCUMENT_SUFFIX.length())
                : documentPath;
        return base + TOPICS_SUFFIX;
    }

    /**
     * Supplies an already loaded model instead of calling {@link #loadModel()}.
     *
     * @param mauiFilter the model to use
     */
    public void setModel(MauiFilter mauiFilter) {
        this.mauiFilter = mauiFilter;
    }

    /**
     * Command-line entry point: parses the options, loads the model, extracts and
     * writes the topics and evaluates them. On any failure the error and the list of
     * available options are logged.
     *
     * @param strArr the command-line options, see {@link #listOptions()}
     */
    public static void main(String[] strArr) {
        MauiTopicExtractor extractor = new MauiTopicExtractor();
        try {
            extractor.setOptions(strArr);
            log.info("Extracting keyphrases with options: ");
            StringBuilder optionLine = new StringBuilder();
            for (String option : extractor.getOptions()) {
                optionLine.append(option).append(' ');
            }
            log.info(optionLine.toString());
            log.info("-- Loading the model... ");
            extractor.loadModel();
            List<MauiTopics> topics = extractor.extractTopics(extractor.loadDocuments());
            extractor.printTopics(topics);
            Evaluator.evaluateTopics(topics);
        } catch (Exception e) {
            log.error("Error running MauiTopicExtractor..", e);
            log.error("\nOptions:\n");
            Enumeration<Option> available = extractor.listOptions();
            while (available.hasMoreElements()) {
                Option option = available.nextElement();
                log.error(option.synopsis());
                log.error(option.description());
            }
        }
    }
}
