package org.topbraid.mauiserver.tagger;

import com.fasterxml.jackson.databind.JsonNode;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.topbraid.mauiserver.framework.JsonLinesParser;
import weka.gui.beans.xml.XMLBeans;

/* loaded from: input_file:WEB-INF/classes/org/topbraid/mauiserver/tagger/TrainingDataParser.class */
/**
 * Turns the records produced by a {@link JsonLinesParser} into
 * {@link TrainingDocument}s, skipping (and counting) records that cannot be
 * used for training.
 *
 * <p>A record is accepted when it is a JSON object with a non-empty
 * {@code content} value and a {@code topics} array containing at least one
 * non-empty string. An optional non-empty id field overrides the default
 * document id {@code doc-<lineNumber>}.
 */
public class TrainingDataParser {
    private static final Logger log = LoggerFactory.getLogger(TrainingDataParser.class);

    // NOTE(review): borrowing the field name from a Weka GUI constant looks like a
    // decompiler artifact; presumably the value is "id" -- confirm before replacing
    // it with a literal.
    private static final String ID_FIELD = XMLBeans.VAL_ID;
    private static final String CONTENT_FIELD = "content";
    private static final String TOPICS_FIELD = "topics";

    private final JsonLinesParser in;

    /** Number of records rejected by this class (bad lines rejected by the parser are counted separately). */
    private int skippedDocumentCount = 0;

    /**
     * @param jsonLinesParser source of JSON records; consumed by {@link #getCorpus()}
     */
    public TrainingDataParser(JsonLinesParser jsonLinesParser) {
        this.in = jsonLinesParser;
    }

    /**
     * @return the number of records rejected by this class so far plus the
     *         number of bad lines reported by the underlying parser
     */
    public int getSkippedDocumentCount() {
        return this.skippedDocumentCount + this.in.getSkippedBadLinesCount();
    }

    /**
     * Reads all remaining records from the underlying parser.
     *
     * @return the accepted training documents, in input order; never {@code null}
     */
    public List<TrainingDocument> getCorpus() {
        List<TrainingDocument> corpus = new ArrayList<TrainingDocument>();
        while (this.in.hasNext()) {
            // Fetch the record first: the line number is read only after next() has been called.
            JsonNode record = this.in.next();
            TrainingDocument document = toTrainingDocument(record, this.in.getLineNumber());
            if (document != null) {
                corpus.add(document);
            }
        }
        return corpus;
    }

    /**
     * Validates one record and converts it to a training document.
     *
     * @param record     the parsed JSON value
     * @param lineNumber used to build the fallback id {@code doc-<lineNumber>}
     * @return the document, or {@code null} if the record was skipped (the skip
     *         is logged and counted)
     */
    private TrainingDocument toTrainingDocument(JsonNode record, int lineNumber) {
        String id = "doc-" + lineNumber;

        // Reject everything that is not an object (arrays, but also bare scalars),
        // so the logged reason matches what was actually wrong with the record.
        if (!record.isObject()) {
            logSkipDocument(id, "Not a JSON object");
            return null;
        }

        // An explicit JSON null must not become the literal id "null".
        JsonNode idNode = record.get(ID_FIELD);
        if (idNode != null && !idNode.isNull() && !"".equals(idNode.asText())) {
            id = idNode.asText();
        }

        // asText() yields "" for objects/arrays, so those are rejected here as well;
        // numbers and booleans are coerced to their text form and accepted.
        // An explicit JSON null is treated as missing rather than as the text "null".
        JsonNode contentNode = record.get(CONTENT_FIELD);
        if (contentNode == null || contentNode.isNull() || "".equals(contentNode.asText())) {
            logSkipDocument(id, "Field 'content' missing, empty, or not a string");
            return null;
        }
        String content = contentNode.asText();

        JsonNode topicsNode = record.get(TOPICS_FIELD);
        if (topicsNode == null || !topicsNode.isArray() || topicsNode.size() == 0) {
            logSkipDocument(id, "Field 'topics' missing, empty, or not an array");
            return null;
        }

        List<String> topics = new ArrayList<String>();
        int invalidTopicCount = 0;
        for (JsonNode topicNode : topicsNode) {
            if (topicNode.isTextual() && !"".equals(topicNode.asText())) {
                topics.add(topicNode.asText());
            } else {
                invalidTopicCount++;
            }
        }
        if (topics.isEmpty()) {
            logSkipDocument(id, "All " + invalidTopicCount + " topics were invalid (non-string)");
            return null;
        }

        // Splitting the whole document just to count words is only worth it
        // when the debug message will actually be emitted.
        if (log.isDebugEnabled()) {
            logAddDocument(id, content.split("\\s+").length, topics.size(), invalidTopicCount);
        }
        return new TrainingDocument(id, content, topics);
    }

    /** Logs a warning for a rejected record and increments the skip counter. */
    private void logSkipDocument(String id, String reason) {
        log.warn("Skipping document {}: {}", id, reason);
        this.skippedDocumentCount++;
    }

    /** Logs, at debug level, a summary of an accepted document. */
    private void logAddDocument(String id, int wordCount, int topicCount, int invalidTopicCount) {
        String skippedNote = invalidTopicCount > 0
                ? ", " + invalidTopicCount + " non-string topics skipped"
                : "";
        log.debug("Adding training document " + id + ": " + wordCount + " words, "
                + topicCount + " topics" + skippedNote);
    }
}
