/*
 * Decompiled with CFR 0.152.
 */
package cc.unitmesh.rag.splitter;

import cc.unitmesh.nlp.embedding.EncodingTokenizer;
import cc.unitmesh.nlp.embedding.OpenAiEncoding;
import cc.unitmesh.rag.splitter.TextSplitter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import kotlin.Metadata;
import kotlin.jvm.internal.DefaultConstructorMarker;
import kotlin.jvm.internal.Intrinsics;
import kotlin.jvm.internal.SourceDebugExtension;
import kotlin.text.StringsKt;
import org.jetbrains.annotations.NotNull;

@Metadata(mv={1, 9, 0}, k=1, xi=48, d1={"\u0000&\n\u0002\u0018\u0002\n\u0002\u0018\u0002\n\u0000\n\u0002\u0018\u0002\n\u0000\n\u0002\u0010\b\n\u0002\b\t\n\u0002\u0010\u000e\n\u0000\n\u0002\u0010 \n\u0002\b\u0005\u0018\u00002\u00020\u0001B7\u0012\b\b\u0002\u0010\u0002\u001a\u00020\u0003\u0012\b\b\u0002\u0010\u0004\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0006\u001a\u00020\u0005\u0012\b\b\u0002\u0010\u0007\u001a\u00020\u0005\u0012\b\b\u0002\u0010\b\u001a\u00020\u0005\u00a2\u0006\u0002\u0010\tJ\u0016\u0010\u000e\u001a\u00020\u000f2\f\u0010\u0010\u001a\b\u0012\u0004\u0012\u00020\u00050\u0011H\u0002J\u0016\u0010\u0012\u001a\b\u0012\u0004\u0012\u00020\u00050\u00112\u0006\u0010\u0013\u001a\u00020\u000fH\u0002J \u0010\u0014\u001a\b\u0012\u0004\u0012\u00020\u000f0\u00112\b\u0010\u0013\u001a\u0004\u0018\u00010\u000f2\u0006\u0010\u0004\u001a\u00020\u0005H\u0002J\u0016\u0010\u0015\u001a\b\u0012\u0004\u0012\u00020\u000f0\u00112\u0006\u0010\u0013\u001a\u00020\u000fH\u0016R\u001a\u0010\u0004\u001a\u00020\u0005X\u0094\u000e\u00a2\u0006\u000e\n\u0000\u001a\u0004\b\n\u0010\u000b\"\u0004\b\f\u0010\rR\u000e\u0010\u0002\u001a\u00020\u0003X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\b\u001a\u00020\u0005X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\u0007\u001a\u00020\u0005X\u0082\u0004\u00a2\u0006\u0002\n\u0000R\u000e\u0010\u0006\u001a\u00020\u0005X\u0082\u0004\u00a2\u0006\u0002\n\u0000\u00a8\u0006\u0016"}, d2={"Lcc/unitmesh/rag/splitter/TokenTextSplitter;", "Lcc/unitmesh/rag/splitter/TextSplitter;", "encoding", "Lcc/unitmesh/nlp/embedding/EncodingTokenizer;", "chunkSize", "", "minChunkSizeChars", "minChunkLengthToEmbed", "maxNumChunks", "(Lcc/unitmesh/nlp/embedding/EncodingTokenizer;IIII)V", "getChunkSize", "()I", "setChunkSize", "(I)V", "decodeTokens", "", "tokens", "", "getEncodedTokens", "text", "split", "splitText", "cocoa-core"})
@SourceDebugExtension(value={"SMAP\nTokenTextSplitter.kt\nKotlin\n*S Kotlin\n*F\n+ 1 TokenTextSplitter.kt\ncc/unitmesh/rag/splitter/TokenTextSplitter\n+ 2 Strings.kt\nkotlin/text/StringsKt__StringsKt\n*L\n1#1,101:1\n107#2:102\n79#2,22:103\n107#2:125\n79#2,22:126\n107#2:148\n79#2,29:149\n107#2:178\n79#2,22:179\n*S KotlinDebug\n*F\n+ 1 TokenTextSplitter.kt\ncc/unitmesh/rag/splitter/TokenTextSplitter\n*L\n42#1:102\n42#1:103,22\n56#1:125\n56#1:126,22\n72#1:148\n72#1:149,29\n84#1:178\n84#1:179,22\n*E\n"})
// Splits text into chunks whose size is measured in tokens of an EncodingTokenizer.
//
// The text is encoded once; the token list is then consumed window by window.
// Each window is decoded back to text, optionally cut at its last sentence
// break ('.', '?', '!' or '\n'), trimmed and emitted as a chunk if it is long
// enough. Tokens left over once maxNumChunks windows have been processed are
// emitted as one final chunk.
public final class TokenTextSplitter extends TextSplitter {
    /** Window size, in tokens, used when the caller does not supply one. */
    private static final int DEFAULT_CHUNK_SIZE = 800;
    /** Default character index a sentence break must exceed before a window is cut there. */
    private static final int DEFAULT_MIN_CHUNK_SIZE_CHARS = 350;
    /** Default length a trimmed chunk must exceed to be emitted. */
    private static final int DEFAULT_MIN_CHUNK_LENGTH_TO_EMBED = 5;
    /** Default cap on the number of non-blank windows processed by the main loop. */
    private static final int DEFAULT_MAX_NUM_CHUNKS = 10000;

    @NotNull
    private final EncodingTokenizer encoding;
    private int chunkSize;
    private final int minChunkSizeChars;
    private final int minChunkLengthToEmbed;
    private final int maxNumChunks;

    /**
     * Creates a splitter with every setting at its default value: a new
     * {@link OpenAiEncoding}, 800-token windows, a 350-character minimum cut
     * position, a 5-character minimum chunk length and at most 10000 windows.
     */
    public TokenTextSplitter() {
        this(null, 0, 0, 0, 0, 31, null);
    }

    /**
     * Creates a splitter with explicit settings.
     *
     * @param encoding              tokenizer used to encode the text and decode token windows; must not be null
     * @param chunkSize             window size in tokens; must be positive for non-blank input to be split
     * @param minChunkSizeChars     a window is cut at its last sentence break only when that break's
     *                              character index is greater than this value
     * @param minChunkLengthToEmbed a chunk is emitted only when its trimmed length is greater than this value
     * @param maxNumChunks          maximum number of non-blank windows processed before the remaining
     *                              tokens are emitted as a single final chunk
     */
    public TokenTextSplitter(@NotNull EncodingTokenizer encoding, int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks) {
        Intrinsics.checkNotNullParameter((Object)encoding, (String)"encoding");
        this.encoding = encoding;
        this.chunkSize = chunkSize;
        this.minChunkSizeChars = minChunkSizeChars;
        this.minChunkLengthToEmbed = minChunkLengthToEmbed;
        this.maxNumChunks = maxNumChunks;
    }

    /**
     * Compiler-generated bridge for Kotlin default arguments. Bit {@code i} of
     * {@code n5} (1, 2, 4, 8, 16) set means "argument {@code i} was omitted, use
     * its default".
     *
     * <p>The defaults are selected inside the {@code this(...)} argument list so
     * that the delegation is the first statement of the constructor, as Java
     * requires. A new {@link OpenAiEncoding} is created only when bit 1 is set.
     *
     * @param encodingTokenizer        tokenizer, ignored when bit 1 of {@code n5} is set
     * @param n                        chunk size, ignored when bit 2 is set
     * @param n2                       minimum cut position in characters, ignored when bit 4 is set
     * @param n3                       minimum chunk length, ignored when bit 8 is set
     * @param n4                       maximum number of chunks, ignored when bit 16 is set
     * @param n5                       bit mask of omitted arguments
     * @param defaultConstructorMarker unused; only disambiguates this overload
     */
    public /* synthetic */ TokenTextSplitter(EncodingTokenizer encodingTokenizer, int n, int n2, int n3, int n4, int n5, DefaultConstructorMarker defaultConstructorMarker) {
        this(
            (n5 & 1) != 0 ? new OpenAiEncoding() : encodingTokenizer,
            (n5 & 2) != 0 ? DEFAULT_CHUNK_SIZE : n,
            (n5 & 4) != 0 ? DEFAULT_MIN_CHUNK_SIZE_CHARS : n2,
            (n5 & 8) != 0 ? DEFAULT_MIN_CHUNK_LENGTH_TO_EMBED : n3,
            (n5 & 0x10) != 0 ? DEFAULT_MAX_NUM_CHUNKS : n4
        );
    }

    /** Returns the window size, in tokens, that {@link #splitText(String)} uses. */
    @Override
    protected int getChunkSize() {
        return this.chunkSize;
    }

    /** Sets the window size, in tokens, that {@link #splitText(String)} uses. */
    @Override
    protected void setChunkSize(int n) {
        this.chunkSize = n;
    }

    /**
     * Splits {@code text} into chunks using the current chunk size.
     *
     * @param text the text to split; must not be null
     * @return the chunks in document order; empty when {@code text} is blank
     * @throws IllegalArgumentException if {@code text} is not blank and the chunk size is not positive
     */
    @Override
    @NotNull
    public List<String> splitText(@NotNull String text) {
        Intrinsics.checkNotNullParameter((Object)text, (String)"text");
        return this.split(text, this.getChunkSize());
    }

    /**
     * Core splitting routine.
     *
     * <p>"Blank" and "trimmed" here follow {@link String#trim()}: characters
     * with a code of U+0020 or below are stripped from both ends.
     *
     * @param text      text to split; null or blank yields an empty list
     * @param chunkSize window size in tokens
     * @return a new mutable list of chunks in document order
     * @throws IllegalArgumentException if {@code text} is not blank and {@code chunkSize <= 0}
     */
    private List<String> split(String text, int chunkSize) {
        if (text == null || text.trim().isEmpty()) {
            return new ArrayList<>();
        }
        // A zero-size window never consumes a token, so the loop below would
        // spin forever; reject it up front (negative sizes are rejected too).
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be positive, but was " + chunkSize);
        }

        List<Integer> tokens = this.getEncodedTokens(text);
        int totalTokens = tokens.size();
        List<String> chunks = new ArrayList<>();
        int offset = 0; // index of the first token not yet consumed
        int numChunks = 0;

        while (offset < totalTokens && numChunks < this.maxNumChunks) {
            int windowEnd = offset + Math.min(chunkSize, totalTokens - offset);
            String chunkText = this.decodeTokens(tokens.subList(offset, windowEnd));

            // Blank windows are skipped whole and do not count towards maxNumChunks.
            if (chunkText.trim().isEmpty()) {
                offset = windowEnd;
                continue;
            }

            // Prefer ending the chunk on a sentence break, provided the break is
            // far enough into the window.
            int lastPunctuation = TokenTextSplitter.lastSentenceBreak(chunkText);
            if (lastPunctuation != -1 && lastPunctuation > this.minChunkSizeChars) {
                chunkText = chunkText.substring(0, lastPunctuation + 1);
            }

            String chunkTextToAppend = this.getKeepSeparator()
                ? chunkText.trim()
                : chunkText.replace("\n", " ").trim();
            if (chunkTextToAppend.length() > this.minChunkLengthToEmbed) {
                chunks.add(chunkTextToAppend);
            }

            // Advance by the token length of the (possibly shortened) chunk text.
            // Re-encoding need not reproduce the original token count, so clamp it:
            // at least one token so the loop always progresses, and never past the
            // end of the token list.
            int consumed = this.getEncodedTokens(chunkText).size();
            offset += Math.max(1, Math.min(consumed, totalTokens - offset));
            ++numChunks;
        }

        // Whatever is left after maxNumChunks windows becomes one final chunk.
        // Newlines are always replaced here, regardless of getKeepSeparator().
        if (offset < totalTokens) {
            String remainingText = this.decodeTokens(tokens.subList(offset, totalTokens)).replace("\n", " ").trim();
            if (remainingText.length() > this.minChunkLengthToEmbed) {
                chunks.add(remainingText);
            }
        }
        return chunks;
    }

    /** Returns the index of the last '.', '?', '!' or '\n' in {@code text}, or -1 if none occurs. */
    private static int lastSentenceBreak(String text) {
        int lastTerminator = Math.max(text.lastIndexOf('.'), Math.max(text.lastIndexOf('?'), text.lastIndexOf('!')));
        return Math.max(lastTerminator, text.lastIndexOf('\n'));
    }

    /** Encodes {@code text} into tokens with the configured tokenizer. */
    private List<Integer> getEncodedTokens(String text) {
        return this.encoding.encode(text);
    }

    /** Decodes {@code tokens} back into text with the configured tokenizer. */
    private String decodeTokens(List<Integer> tokens) {
        return this.encoding.decode(tokens);
    }
}

