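/**
 * A {@link TextSplitter} that splits text into chunks whose target size is measured
 * in tokens of the CL100K_BASE encoding.
 *
 * @author Raphael Yu
 * @author Christian Tzolov
 * @author Ricken Bazolo
 */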
3334public class TokenTextSplitter extends TextSplitter {
3435
3536 private final EncodingRegistry registry = Encodings .newLazyEncodingRegistry ();
3637
3738 private final Encoding encoding = registry .getEncoding (EncodingType .CL100K_BASE );
3839
40+ private final static int DEFAULT_CHUNK_SIZE = 800 ;
41+
42+ private final static int MIN_CHUNK_SIZE_CHARS = 350 ;
43+
44+ private final static int MIN_CHUNK_LENGTH_TO_EMBED = 5 ;
45+
46+ private final static int MAX_NUM_CHUNKS = 10000 ;
47+
48+ private final static boolean KEEP_SEPARATOR = true ;
49+
3950 // The target size of each text chunk in tokens
40- private int defaultChunkSize = 800 ;
51+ private final int chunkSize ;
4152
4253 // The minimum size of each text chunk in characters
43- private int minChunkSizeChars = 350 ;
54+ private final int minChunkSizeChars ;
4455
4556 // Discard chunks shorter than this
46- private int minChunkLengthToEmbed = 5 ;
57+ private final int minChunkLengthToEmbed ;
4758
4859 // The maximum number of chunks to generate from a text
49- private int maxNumChunks = 10000 ;
60+ private final int maxNumChunks ;
5061
51- private boolean keepSeparator = true ;
62+ private final boolean keepSeparator ;
5263
/**
 * Creates a splitter configured entirely with the default settings
 * ({@code DEFAULT_CHUNK_SIZE}, {@code MIN_CHUNK_SIZE_CHARS},
 * {@code MIN_CHUNK_LENGTH_TO_EMBED}, {@code MAX_NUM_CHUNKS} and
 * {@code KEEP_SEPARATOR}).
 */
public TokenTextSplitter() {
	this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, KEEP_SEPARATOR);
}
5567
/**
 * Creates a splitter that uses the default sizing limits together with the given
 * separator setting.
 * @param keepSeparator value stored as this splitter's separator-handling flag
 */
public TokenTextSplitter(boolean keepSeparator) {
	// Delegate to the full constructor: the fields are final, so they cannot be
	// assigned here directly and then again by another constructor.
	this(DEFAULT_CHUNK_SIZE, MIN_CHUNK_SIZE_CHARS, MIN_CHUNK_LENGTH_TO_EMBED, MAX_NUM_CHUNKS, keepSeparator);
}
5971
60- public TokenTextSplitter (int defaultChunkSize , int minChunkSizeChars , int minChunkLengthToEmbed , int maxNumChunks ,
72+ public TokenTextSplitter (int chunkSize , int minChunkSizeChars , int minChunkLengthToEmbed , int maxNumChunks ,
6173 boolean keepSeparator ) {
62- this .defaultChunkSize = defaultChunkSize ;
74+ this .chunkSize = chunkSize ;
6375 this .minChunkSizeChars = minChunkSizeChars ;
6476 this .minChunkLengthToEmbed = minChunkLengthToEmbed ;
6577 this .maxNumChunks = maxNumChunks ;
@@ -68,7 +80,7 @@ public TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChu
6880
6981 @ Override
7082 protected List <String > splitText (String text ) {
71- return doSplit (text , this .defaultChunkSize );
83+ return doSplit (text , this .chunkSize );
7284 }
7385
7486 protected List <String > doSplit (String text , int chunkSize ) {
@@ -133,4 +145,55 @@ private String decodeTokens(List<Integer> tokens) {
133145 return this .encoding .decode (tokensIntArray );
134146 }
135147
148+ public static Builder builder () {
149+ return new Builder ();
150+ }
151+
152+ public static class Builder {
153+
154+ private int chunkSize ;
155+
156+ private int minChunkSizeChars ;
157+
158+ private int minChunkLengthToEmbed ;
159+
160+ private int maxNumChunks ;
161+
162+ private boolean keepSeparator ;
163+
164+ public Builder () {
165+ }
166+
167+ public Builder chunkSize (int chunkSize ) {
168+ this .chunkSize = chunkSize ;
169+ return this ;
170+ }
171+
172+ public Builder minChunkSizeChars (int minChunkSizeChars ) {
173+ this .minChunkSizeChars = minChunkSizeChars ;
174+ return this ;
175+ }
176+
177+ public Builder minChunkLengthToEmbed (int minChunkLengthToEmbed ) {
178+ this .minChunkLengthToEmbed = minChunkLengthToEmbed ;
179+ return this ;
180+ }
181+
182+ public Builder maxNumChunks (int maxNumChunks ) {
183+ this .maxNumChunks = maxNumChunks ;
184+ return this ;
185+ }
186+
187+ public Builder keepSeparator (boolean keepSeparator ) {
188+ this .keepSeparator = keepSeparator ;
189+ return this ;
190+ }
191+
192+ public TokenTextSplitter build () {
193+ return new TokenTextSplitter (this .chunkSize , this .minChunkSizeChars , this .minChunkLengthToEmbed ,
194+ this .maxNumChunks , this .keepSeparator );
195+ }
196+
197+ }
198+
136199}
0 commit comments