@@ -82,9 +82,22 @@ public void testSplitText() {
8282 assertThat (chunks .get (3 ).getText ())
8383 .isEqualTo ("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting." );
8484
85- // Verify that the same, merged metadata is copied to all chunks.
86- assertThat (chunks .get (0 ).getMetadata ()).isEqualTo (chunks .get (1 ).getMetadata ());
87- assertThat (chunks .get (2 ).getMetadata ()).isEqualTo (chunks .get (3 ).getMetadata ());
85+ // Verify that the original metadata is copied to all chunks (including
86+ // chunk-specific fields)
87+ assertThat (chunks .get (0 ).getMetadata ()).containsKeys ("key1" , "key2" , "parent_document_id" , "chunk_index" ,
88+ "total_chunks" );
89+ assertThat (chunks .get (1 ).getMetadata ()).containsKeys ("key1" , "key2" , "parent_document_id" , "chunk_index" ,
90+ "total_chunks" );
91+ assertThat (chunks .get (2 ).getMetadata ()).containsKeys ("key2" , "key3" , "parent_document_id" , "chunk_index" ,
92+ "total_chunks" );
93+ assertThat (chunks .get (3 ).getMetadata ()).containsKeys ("key2" , "key3" , "parent_document_id" , "chunk_index" ,
94+ "total_chunks" );
95+
96+ // Verify chunk indices are correct
97+ assertThat (chunks .get (0 ).getMetadata ().get ("chunk_index" )).isEqualTo (0 );
98+ assertThat (chunks .get (1 ).getMetadata ().get ("chunk_index" )).isEqualTo (1 );
99+ assertThat (chunks .get (2 ).getMetadata ().get ("chunk_index" )).isEqualTo (0 );
100+ assertThat (chunks .get (3 ).getMetadata ().get ("chunk_index" )).isEqualTo (1 );
88101 assertThat (chunks .get (0 ).getMetadata ()).containsKeys ("key1" , "key2" ).doesNotContainKeys ("key3" );
89102 assertThat (chunks .get (2 ).getMetadata ()).containsKeys ("key2" , "key3" ).doesNotContainKeys ("key1" );
90103
@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
148161 @ Test
149162 public void pageWithChunkSplit () {
150163 // given
151-
152164 var doc1 = new Document ("1In the end, writing arises when man realizes that memory is not enough."
153165 + "1The most oppressive thing about the labyrinth is that you are constantly "
154166 + "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting." ,
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
236248 assertThat (chunks .get (0 ).getText ()).isEqualTo ("In the end, writing arises when man" );
237249 assertThat (chunks .get (1 ).getText ()).isEqualTo (" realizes that memory is not enough." );
238250
239- // Verify that the same, merged metadata is copied to all chunks.
240- assertThat (chunks .get (0 ).getMetadata ()).isEqualTo (chunks .get (1 ).getMetadata ());
241- assertThat (chunks .get (1 ).getMetadata ()).containsKeys ("key1" );
251+ // Verify that the original metadata is copied to all chunks (with chunk-specific
252+ // fields)
253+ assertThat (chunks .get (0 ).getMetadata ()).containsKeys ("key1" , "parent_document_id" , "chunk_index" ,
254+ "total_chunks" );
255+ assertThat (chunks .get (1 ).getMetadata ()).containsKeys ("key1" , "parent_document_id" , "chunk_index" ,
256+ "total_chunks" );
257+
258+ // Verify chunk indices are different
259+ assertThat (chunks .get (0 ).getMetadata ().get ("chunk_index" )).isEqualTo (0 );
260+ assertThat (chunks .get (1 ).getMetadata ().get ("chunk_index" )).isEqualTo (1 );
242261
243262 // Verify that the content formatters are copied from the parents to the chunks.
244263 assertThat (chunks .get (0 ).getContentFormatter ()).isSameAs (contentFormatter );
245264 assertThat (chunks .get (1 ).getContentFormatter ()).isSameAs (contentFormatter );
246265 }
247266
267+ @ Test
268+ public void testScorePreservation () {
269+ // given
270+ Double originalScore = 0.95 ;
271+ var doc = Document .builder ()
272+ .text ("This is a test document that will be split into multiple chunks." )
273+ .metadata (Map .of ("source" , "test.txt" ))
274+ .score (originalScore )
275+ .build ();
276+
277+ // when
278+ List <Document > chunks = testTextSplitter .apply (List .of (doc ));
279+
280+ // then
281+ assertThat (chunks ).hasSize (2 );
282+ assertThat (chunks .get (0 ).getScore ()).isEqualTo (originalScore );
283+ assertThat (chunks .get (1 ).getScore ()).isEqualTo (originalScore );
284+ }
285+
286+ @ Test
287+ public void testParentDocumentTracking () {
288+ // given
289+ var doc1 = new Document ("First document content for testing splitting functionality." ,
290+ Map .of ("source" , "doc1.txt" ));
291+ var doc2 = new Document ("Second document content for testing splitting functionality." ,
292+ Map .of ("source" , "doc2.txt" ));
293+
294+ String originalId1 = doc1 .getId ();
295+ String originalId2 = doc2 .getId ();
296+
297+ // when
298+ List <Document > chunks = testTextSplitter .apply (List .of (doc1 , doc2 ));
299+
300+ // then
301+ assertThat (chunks ).hasSize (4 );
302+
303+ // Verify parent document tracking for doc1 chunks
304+ assertThat (chunks .get (0 ).getMetadata ().get ("parent_document_id" )).isEqualTo (originalId1 );
305+ assertThat (chunks .get (1 ).getMetadata ().get ("parent_document_id" )).isEqualTo (originalId1 );
306+
307+ // Verify parent document tracking for doc2 chunks
308+ assertThat (chunks .get (2 ).getMetadata ().get ("parent_document_id" )).isEqualTo (originalId2 );
309+ assertThat (chunks .get (3 ).getMetadata ().get ("parent_document_id" )).isEqualTo (originalId2 );
310+ }
311+
312+ @ Test
313+ public void testChunkMetadataInformation () {
314+ // given
315+ var doc = new Document ("This is a longer document that will be split into exactly two chunks for testing." ,
316+ Map .of ("source" , "test.txt" ));
317+
318+ // when
319+ List <Document > chunks = testTextSplitter .apply (List .of (doc ));
320+
321+ // then
322+ assertThat (chunks ).hasSize (2 );
323+
324+ // Verify chunk index and total chunks for first chunk
325+ assertThat (chunks .get (0 ).getMetadata ().get ("chunk_index" )).isEqualTo (0 );
326+ assertThat (chunks .get (0 ).getMetadata ().get ("total_chunks" )).isEqualTo (2 );
327+
328+ // Verify chunk index and total chunks for second chunk
329+ assertThat (chunks .get (1 ).getMetadata ().get ("chunk_index" )).isEqualTo (1 );
330+ assertThat (chunks .get (1 ).getMetadata ().get ("total_chunks" )).isEqualTo (2 );
331+
332+ // Verify original metadata is preserved
333+ assertThat (chunks .get (0 ).getMetadata ().get ("source" )).isEqualTo ("test.txt" );
334+ assertThat (chunks .get (1 ).getMetadata ().get ("source" )).isEqualTo ("test.txt" );
335+ }
336+
337+ @ Test
338+ public void testEnhancedMetadataWithMultipleDocuments () {
339+ // given
340+ var doc1 = Document .builder ()
341+ .text ("First document with score and metadata." )
342+ .metadata (Map .of ("type" , "article" , "priority" , "high" ))
343+ .score (0.8 )
344+ .build ();
345+
346+ var doc2 = Document .builder ()
347+ .text ("Second document with different score." )
348+ .metadata (Map .of ("type" , "report" , "priority" , "medium" ))
349+ .score (0.6 )
350+ .build ();
351+
352+ String originalId1 = doc1 .getId ();
353+ String originalId2 = doc2 .getId ();
354+
355+ // when
356+ List <Document > chunks = testTextSplitter .apply (List .of (doc1 , doc2 ));
357+
358+ // then
359+ assertThat (chunks ).hasSize (4 );
360+
361+ // Verify first document chunks
362+ for (int i = 0 ; i < 2 ; i ++) {
363+ Document chunk = chunks .get (i );
364+ assertThat (chunk .getScore ()).isEqualTo (0.8 );
365+ assertThat (chunk .getMetadata ().get ("parent_document_id" )).isEqualTo (originalId1 );
366+ assertThat (chunk .getMetadata ().get ("chunk_index" )).isEqualTo (i );
367+ assertThat (chunk .getMetadata ().get ("total_chunks" )).isEqualTo (2 );
368+ assertThat (chunk .getMetadata ().get ("type" )).isEqualTo ("article" );
369+ assertThat (chunk .getMetadata ().get ("priority" )).isEqualTo ("high" );
370+ }
371+
372+ // Verify second document chunks
373+ for (int i = 2 ; i < 4 ; i ++) {
374+ Document chunk = chunks .get (i );
375+ assertThat (chunk .getScore ()).isEqualTo (0.6 );
376+ assertThat (chunk .getMetadata ().get ("parent_document_id" )).isEqualTo (originalId2 );
377+ assertThat (chunk .getMetadata ().get ("chunk_index" )).isEqualTo (i - 2 );
378+ assertThat (chunk .getMetadata ().get ("total_chunks" )).isEqualTo (2 );
379+ assertThat (chunk .getMetadata ().get ("type" )).isEqualTo ("report" );
380+ assertThat (chunk .getMetadata ().get ("priority" )).isEqualTo ("medium" );
381+ }
382+ }
383+
248384}
0 commit comments