@@ -8,6 +8,12 @@ namespace ManagedCode.GraphRag.Tests.Chunking;
88public sealed class TokenTextChunkerTests
99{
    // Chunker instance shared by every test in this class.
    private readonly TokenTextChunker _chunker = new();

    // Baseline configuration for tests that do not exercise specific
    // size/overlap values; Size/Overlap are measured in tokens of the
    // default encoding.
    private readonly ChunkingConfig _defaultConfig = new()
    {
        Size = 40,
        Overlap = 10,
        EncodingModel = TokenizerDefaults.DefaultEncoding
    };
1117
1218 [ Fact ]
1319 public void Chunk_RespectsTokenBudget ( )
@@ -63,4 +69,142 @@ public void Chunk_CombinesDocumentIdentifiersAcrossSlices()
6369 Assert . Contains ( chunks , chunk => chunk . DocumentIds . Contains ( "doc-1" ) ) ;
6470 Assert . Contains ( chunks , chunk => chunk . DocumentIds . Contains ( "doc-2" ) ) ;
6571 }
72+
73+ [ Fact ]
74+ public void Chunk_OverlapProducesSharedTokensBetweenAdjacentChunks ( )
75+ {
76+ var tokenizer = TokenizerRegistry . GetTokenizer ( TokenizerDefaults . DefaultEncoding ) ;
77+ const string text = "The quick brown fox jumps over the lazy dog and continues running through the forest until it reaches the river where it stops to drink some water." ;
78+ var slices = new [ ] { new ChunkSlice ( "doc-1" , text ) } ;
79+
80+ var config = new ChunkingConfig
81+ {
82+ Size = 20 ,
83+ Overlap = 5 ,
84+ EncodingModel = TokenizerDefaults . DefaultEncoding
85+ } ;
86+
87+ var chunks = _chunker . Chunk ( slices , config ) ;
88+
89+ Assert . True ( chunks . Count >= 2 , "Need at least 2 chunks to verify overlap" ) ;
90+
91+ for ( var i = 0 ; i < chunks . Count - 1 ; i ++ )
92+ {
93+ var currentChunkTokens = tokenizer . EncodeToIds ( chunks [ i ] . Text ) ;
94+ var nextChunkTokens = tokenizer . EncodeToIds ( chunks [ i + 1 ] . Text ) ;
95+
96+ var lastTokensOfCurrent = currentChunkTokens . TakeLast ( config . Overlap ) . ToArray ( ) ;
97+ var firstTokensOfNext = nextChunkTokens . Take ( config . Overlap ) . ToArray ( ) ;
98+
99+ Assert . Equal ( lastTokensOfCurrent , firstTokensOfNext ) ;
100+ }
101+ }
102+
103+ [ Fact ]
104+ public void Chunk_EmptySlicesReturnsEmptyResult ( )
105+ {
106+ var slices = Array . Empty < ChunkSlice > ( ) ;
107+
108+ var chunks = _chunker . Chunk ( slices , _defaultConfig ) ;
109+
110+ Assert . Empty ( chunks ) ;
111+ }
112+
113+ [ Fact ]
114+ public void Chunk_SlicesWithEmptyTextReturnsEmptyResult ( )
115+ {
116+ var slices = new [ ] { new ChunkSlice ( "doc-1" , string . Empty ) } ;
117+
118+ var chunks = _chunker . Chunk ( slices , _defaultConfig ) ;
119+
120+ Assert . Empty ( chunks ) ;
121+ }
122+
123+ [ Fact ]
124+ public void Chunk_NullSlicesThrowsArgumentNullException ( )
125+ {
126+ Assert . Throws < ArgumentNullException > ( ( ) => _chunker . Chunk ( null ! , _defaultConfig ) ) ;
127+ }
128+
129+ [ Fact ]
130+ public void Chunk_NullConfigThrowsArgumentNullException ( )
131+ {
132+ var slices = new [ ] { new ChunkSlice ( "doc-1" , "Some text" ) } ;
133+
134+ Assert . Throws < ArgumentNullException > ( ( ) => _chunker . Chunk ( slices , null ! ) ) ;
135+ }
136+
137+ [ Fact ]
138+ public void Chunk_ZeroOverlapProducesNonOverlappingChunks ( )
139+ {
140+ var tokenizer = TokenizerRegistry . GetTokenizer ( TokenizerDefaults . DefaultEncoding ) ;
141+ const string text = "The quick brown fox jumps over the lazy dog and continues running through the forest until it reaches the river." ;
142+ var slices = new [ ] { new ChunkSlice ( "doc-1" , text ) } ;
143+
144+ var config = new ChunkingConfig
145+ {
146+ Size = 15 ,
147+ Overlap = 0 ,
148+ EncodingModel = TokenizerDefaults . DefaultEncoding
149+ } ;
150+
151+ var chunks = _chunker . Chunk ( slices , config ) ;
152+ Assert . True ( chunks . Count >= 2 , "Need at least 2 chunks to verify zero overlap" ) ;
153+
154+ var allChunkTokens = chunks
155+ . SelectMany ( c => tokenizer . EncodeToIds ( c . Text ) )
156+ . ToList ( ) ;
157+
158+ var originalTokens = tokenizer . EncodeToIds ( text ) ;
159+
160+ Assert . Equal ( originalTokens . Count , allChunkTokens . Count ) ;
161+ }
162+
163+ [ Fact ]
164+ public void Chunk_InputSmallerThanChunkSizeReturnsSingleChunk ( )
165+ {
166+ const string shortText = "Hello world" ;
167+ var slices = new [ ] { new ChunkSlice ( "doc-1" , shortText ) } ;
168+
169+ var config = new ChunkingConfig
170+ {
171+ Size = 100 ,
172+ Overlap = 10 ,
173+ EncodingModel = TokenizerDefaults . DefaultEncoding
174+ } ;
175+
176+ var chunks = _chunker . Chunk ( slices , config ) ;
177+
178+ Assert . Single ( chunks ) ;
179+ Assert . Equal ( shortText , chunks [ 0 ] . Text ) ;
180+ }
181+
182+ [ Fact ]
183+ public void Chunk_ExactBoundaryProducesExpectedChunkCount ( )
184+ {
185+ var tokenizer = TokenizerRegistry . GetTokenizer ( TokenizerDefaults . DefaultEncoding ) ;
186+
187+ const int chunkSize = 10 ;
188+ const int overlap = 2 ;
189+ const int step = chunkSize - overlap ;
190+
191+ var targetTokenCount = step * 3 + overlap ;
192+ var words = Enumerable . Range ( 0 , targetTokenCount * 2 ) . Select ( i => "word" ) . ToArray ( ) ;
193+ var text = string . Join ( " " , words ) ;
194+
195+ var actualTokens = tokenizer . EncodeToIds ( text ) ;
196+ var slices = new [ ] { new ChunkSlice ( "doc-1" , text ) } ;
197+
198+ var config = new ChunkingConfig
199+ {
200+ Size = chunkSize ,
201+ Overlap = overlap ,
202+ EncodingModel = TokenizerDefaults . DefaultEncoding
203+ } ;
204+
205+ var chunks = _chunker . Chunk ( slices , config ) ;
206+
207+ Assert . True ( chunks . Count >= 2 , "Should produce multiple chunks" ) ;
208+ Assert . All ( chunks . SkipLast ( 1 ) , chunk => Assert . Equal ( chunkSize , chunk . TokenCount ) ) ;
209+ }
66210}
0 commit comments