Skip to content

Commit 9a34a89

Browse files
committed
Splitter enhancement
1 parent 5dfdd8d commit 9a34a89

File tree

1 file changed

+72
-9
lines changed

1 file changed

+72
-9
lines changed

spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TokenTextSplitter.java

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,37 +29,49 @@
2929
/**
3030
* @author Raphael Yu
3131
* @author Christian Tzolov
32+
* @author Ricken Bazolo
3233
*/
3334
public class TokenTextSplitter extends TextSplitter {
3435

3536
private final EncodingRegistry registry = Encodings.newLazyEncodingRegistry();
3637

3738
private final Encoding encoding = registry.getEncoding(EncodingType.CL100K_BASE);
3839

40+
private final static int DEFAULT_CHUNK_SIZE = 800;
41+
42+
private final static int MIN_CHUNK_SIZE_CHARS = 350;
43+
44+
private final static int MIN_CHUNK_LENGTH_TO_EMBED = 5;
45+
46+
private final static int MAX_NUM_CHUNKS = 10000;
47+
48+
private final static boolean KEEP_SEPARATOR = true;
49+
3950
// The target size of each text chunk in tokens
40-
private int defaultChunkSize = 800;
51+
private final int chunkSize;
4152

4253
// The minimum size of each text chunk in characters
43-
private int minChunkSizeChars = 350;
54+
private final int minChunkSizeChars;
4455

4556
// Discard chunks shorter than this
46-
private int minChunkLengthToEmbed = 5;
57+
private final int minChunkLengthToEmbed;
4758

4859
// The maximum number of chunks to generate from a text
49-
private int maxNumChunks = 10000;
60+
private final int maxNumChunks;
5061

51-
private boolean keepSeparator = true;
62+
private final boolean keepSeparator;
5263

5364
public TokenTextSplitter() {
65+
this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, KEEP_SEPARATOR);
5466
}
5567

5668
public TokenTextSplitter(boolean keepSeparator) {
57-
this.keepSeparator = keepSeparator;
69+
this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, keepSeparator);
5870
}
5971

60-
public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks,
72+
public TokenTextSplitter(int chunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks,
6173
boolean keepSeparator) {
62-
this.defaultChunkSize = defaultChunkSize;
74+
this.chunkSize = chunkSize;
6375
this.minChunkSizeChars = minChunkSizeChars;
6476
this.minChunkLengthToEmbed = minChunkLengthToEmbed;
6577
this.maxNumChunks = maxNumChunks;
@@ -68,7 +80,7 @@ public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChu
6880

6981
@Override
7082
protected List<String> splitText(String text) {
71-
return doSplit(text, this.defaultChunkSize);
83+
return doSplit(text, this.chunkSize);
7284
}
7385

7486
protected List<String> doSplit(String text, int chunkSize) {
@@ -133,4 +145,55 @@ private String decodeTokens(List<Integer> tokens) {
133145
return this.encoding.decode(tokensIntArray);
134146
}
135147

148+
/**
 * Static factory for the fluent {@link Builder}.
 * @return a newly created {@link Builder} instance
 */
public static Builder builder() {
149+
return new Builder();
150+
}
151+
152+
public static class Builder {
153+
154+
private int chunkSize;
155+
156+
private int minChunkSizeChars;
157+
158+
private int minChunkLengthToEmbed;
159+
160+
private int maxNumChunks;
161+
162+
private boolean keepSeparator;
163+
164+
public Builder() {
165+
}
166+
167+
public Builder chunkSize(int chunkSize) {
168+
this.chunkSize = chunkSize;
169+
return this;
170+
}
171+
172+
public Builder minChunkSizeChars(int minChunkSizeChars) {
173+
this.minChunkSizeChars = minChunkSizeChars;
174+
return this;
175+
}
176+
177+
public Builder minChunkLengthToEmbed(int minChunkLengthToEmbed) {
178+
this.minChunkLengthToEmbed = minChunkLengthToEmbed;
179+
return this;
180+
}
181+
182+
public Builder maxNumChunks(int maxNumChunks) {
183+
this.maxNumChunks = maxNumChunks;
184+
return this;
185+
}
186+
187+
public Builder keepSeparator(boolean keepSeparator) {
188+
this.keepSeparator = keepSeparator;
189+
return this;
190+
}
191+
192+
public TokenTextSplitter build() {
193+
return new TokenTextSplitter(this.chunkSize, this.minChunkSizeChars, this.minChunkLengthToEmbed,
194+
this.maxNumChunks, this.keepSeparator);
195+
}
196+
197+
}
198+
136199
}

0 commit comments

Comments
 (0)