package eus.ixa.ixa.pipe.ml.tok;

import eus.ixa.ixa.pipe.ml.utils.IOUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Properties;
import java.util.regex.Pattern;

/* loaded from: input_file:eus/ixa/ixa/pipe/ml/tok/RuleBasedSegmenter.class */
/**
 * Rule-based sentence segmenter.
 *
 * <p>The input text is expected to use {@link #PARAGRAPH} as the paragraph
 * mark (see {@link #readText(BufferedReader)}, which produces that format).
 * Segmentation applies a fixed sequence of regular-expression rewrites that
 * insert {@code "\n"} at sentence boundaries, splits on {@code "\n"}, and
 * finally hands the candidate sentences to
 * {@link NonPeriodBreaker#segmenterExceptions} for post-processing.
 */
public class RuleBasedSegmenter implements SentenceSegmenter {

    /** Marker that replaces two consecutive line breaks in the input. */
    public static final String PARAGRAPH = "¶¶";
    /** Marker appended after every line read by {@link #readText(BufferedReader)}. */
    public static final String LINE_BREAK = "<JAR>";

    /** Matches a single {@link #LINE_BREAK} marker. */
    public static Pattern lineBreak = Pattern.compile(LINE_BREAK);
    /** Matches two consecutive {@link #LINE_BREAK} markers. */
    public static Pattern doubleLineBreak = Pattern.compile("(<JAR><JAR>)");
    /** Matches a {@link #PARAGRAPH} marker. */
    public static Pattern paragraph = Pattern.compile("(¶¶)");

    /** Character class of punctuation that may open a sentence. */
    public static String INITIAL_PUNCT = "[#'\"\\¿\\¡«<\u0091\u0093‛“‟‘‹]";
    /** Character class of punctuation that may close a sentence. */
    public static String FINAL_PUNCT = "['\"\\)\\]\\%»=\u0092\u0094”›’]";

    // ---- Rules that look at paragraph marks ---------------------------------

    /** End punctuation, paragraph mark(s), then the start of a link (http, www, ftp). */
    public static Pattern endPunctLinkPara = Pattern.compile("([?!\\.])[\\ ]*(¶¶)+[\\ ]*(http|www|ftp)");
    /** End punctuation, paragraph mark(s), then optional initial punctuation and an uppercase letter. */
    public static Pattern conventionalPara = Pattern.compile("([?!\\.])[\\ ]*(¶¶)+[\\ ]*(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation followed by closing punctuation, paragraph mark(s), then an uppercase start. */
    public static Pattern endInsideQuotesPara = Pattern.compile("([?!\\.](¶)*" + FINAL_PUNCT + "+)(¶¶)+(" + INITIAL_PUNCT + "*(¶¶)*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Two or more dots, paragraph mark(s), then an uppercase start. */
    public static Pattern multiDotsParaStarters = Pattern.compile("(\\.[\\.]+)(¶¶)+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Paragraph mark(s) followed by a lowercase letter or one of the listed punctuation characters. */
    public static Pattern spuriousParagraph = Pattern.compile("(¶¶)+\\s*([\\p{Lower}\\!#\\$%&\\(\\)\\*\\+,-\\/:;=>\\?@\\[\\\\\\]\\^\\{\\|\\}~])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Alphanumeric character, paragraph mark(s), then a lowercase letter or digit. */
    public static Pattern alphaNumParaLowerNum = Pattern.compile("(\\p{Alnum})\\s*(¶¶)+\\s*([\\p{Lower}\\p{Digit}])", Pattern.UNICODE_CHARACTER_CLASS);

    // ---- Rules that look at plain spaces ------------------------------------

    /** '?' or '!', space(s), then optional initial punctuation and an uppercase letter. */
    public static Pattern noPeriodSpaceEnd = Pattern.compile("([?!])[\\ ]+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** Two or more dots, space(s), then an uppercase start. */
    public static Pattern multiDotsSpaceStarters = Pattern.compile("(\\.[\\.]+)[\\ ]+(" + INITIAL_PUNCT + "*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation followed by closing punctuation, space(s), then an uppercase start. */
    public static Pattern endInsideQuotesSpace = Pattern.compile("([?!\\.][\\ ]*" + FINAL_PUNCT + "+)[\\ ]+(" + INITIAL_PUNCT + "*[\\ ]*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, space(s), at least one initial punctuation character, then an uppercase letter. */
    public static Pattern punctSpaceUpper = Pattern.compile("([?!\\.])[\\ ]+(" + INITIAL_PUNCT + "+[\\ ]*[\\p{Lu}])", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, space(s), hyphen(s), optional '(' and an uppercase letter. */
    public static Pattern punctSpaceMultiPunct = Pattern.compile("([?!\\.])[\\ ]+([\\-]+[\\ ]*[\\(]*\\p{Lu})", Pattern.UNICODE_CHARACTER_CLASS);
    /** End punctuation, optional spaces, then the start of a link (http, www, ftp). */
    public static Pattern endPunctLinkSpace = Pattern.compile("([?!\\.])[\\ ]*(http|www|ftp)");

    /** When true, {@link #segmentSentence()} echoes the input text to stderr. */
    private static final boolean DEBUG = false;

    private final NonPeriodBreaker nonBreaker;
    private final String text;
    private final boolean isHardParagraph;

    /**
     * Creates a segmenter for the given text.
     *
     * @param str the text to segment, with paragraphs marked by {@link #PARAGRAPH}
     * @param properties configuration; the {@code hardParagraph} property set to
     *     {@code "yes"} (case-insensitive) makes every paragraph mark a sentence
     *     break. A missing property is treated as "not hard paragraph". The same
     *     properties are passed to {@link NonPeriodBreaker}.
     */
    public RuleBasedSegmenter(String str, Properties properties) {
        // Constant-first comparison: getProperty returns null when the key is
        // absent, which previously caused a NullPointerException here.
        this.isHardParagraph = "yes".equalsIgnoreCase(properties.getProperty("hardParagraph"));
        this.nonBreaker = new NonPeriodBreaker(properties);
        this.text = str;
    }

    /**
     * Segments the text given at construction time.
     *
     * @return the result of {@link NonPeriodBreaker#segmenterExceptions} applied
     *     to the candidate sentences
     */
    @Override
    public String[] segmentSentence() {
        if (DEBUG) {
            System.err.println("-> Build:" + this.text);
        }
        return segment(this.text);
    }

    /**
     * Runs the full rule cascade over {@code str}: whitespace normalisation,
     * paragraph rules, space rules, split on newline, non-breaker exceptions.
     */
    private String[] segment(String str) {
        String normalized = RuleBasedTokenizer.doubleSpaces.matcher(str.trim()).replaceAll(IOUtils.SPACE_DELIMITER);
        String paragraphsDone = this.isHardParagraph
                ? replace(paragraph, normalized, "\n$1")
                : applyParagraphRules(normalized);
        String marked = applySpaceRules(paragraphsDone);
        return this.nonBreaker.segmenterExceptions(marked.split("\n"));
    }

    /**
     * Applies the paragraph-mark rules. The order is significant: the first four
     * rules insert a newline before paragraph marks that look like real sentence
     * boundaries; the last two replace remaining paragraph marks that match
     * their patterns with a single space.
     */
    private static String applyParagraphRules(String input) {
        String out = replace(endPunctLinkPara, input, "$1\n$2$3");
        out = replace(conventionalPara, out, "$1\n$2$3");
        out = replace(endInsideQuotesPara, out, "$1\n$3$4");
        out = replace(multiDotsParaStarters, out, "$1\n$2$3");
        out = replace(alphaNumParaLowerNum, out, "$1 $3");
        out = replace(spuriousParagraph, out, " $2");
        return out;
    }

    /**
     * Applies the space-based rules in order; each one replaces the separating
     * spaces between its two captured groups with a newline.
     */
    private static String applySpaceRules(String input) {
        String out = replace(noPeriodSpaceEnd, input, "$1\n$2");
        out = replace(multiDotsSpaceStarters, out, "$1\n$2");
        out = replace(endInsideQuotesSpace, out, "$1\n$2");
        out = replace(punctSpaceUpper, out, "$1\n$2");
        out = replace(endPunctLinkSpace, out, "$1\n$2");
        out = replace(punctSpaceMultiPunct, out, "$1\n$2");
        return out;
    }

    /** Replaces every match of {@code pattern} in {@code input} with {@code replacement}. */
    private static String replace(Pattern pattern, String input, String replacement) {
        return pattern.matcher(input).replaceAll(replacement);
    }

    /**
     * Reads all lines from the reader and converts them to the format expected
     * by this segmenter: two consecutive line breaks become {@link #PARAGRAPH},
     * every remaining line break becomes {@code IOUtils.SPACE_DELIMITER}.
     *
     * <p>Reading is best-effort: if an {@link IOException} occurs, the stack
     * trace is printed, reading stops, and the text read so far is converted
     * and returned. The reader is not closed by this method.
     *
     * @param bufferedReader the source of the text
     * @return the text with line breaks converted as described above
     */
    public static String readText(BufferedReader bufferedReader) {
        StringBuilder sb = new StringBuilder();
        try {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                sb.append(line).append(LINE_BREAK);
            }
        } catch (IOException e) {
            // The loop lives inside the try so a persistent I/O failure ends the
            // read instead of being retried forever.
            e.printStackTrace();
        }
        return buildText(sb.toString());
    }

    /** Converts {@link #LINE_BREAK} markers into paragraph marks and space delimiters. */
    private static String buildText(String str) {
        String withParagraphs = doubleLineBreak.matcher(str).replaceAll(PARAGRAPH);
        return lineBreak.matcher(withParagraphs).replaceAll(IOUtils.SPACE_DELIMITER);
    }
}
