From f7c1f5e5b01de1cb44c901e4b6cbabbd24b09c85 Mon Sep 17 00:00:00 2001 From: Hyeri1ee Date: Sat, 20 Sep 2025 04:06:45 +0900 Subject: [PATCH 1/3] GH-4428: improve TextSplitter with property preservation and tracking Signed-off-by: Hyeri1ee --- .../ai/document/id/IdGenerator.java | 2 +- .../ai/transformer/splitter/TextSplitter.java | 33 +++- .../splitter/TextSplitterTests.java | 150 +++++++++++++++++- 3 files changed, 169 insertions(+), 16 deletions(-) diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java index 2251016f6d1..a06e55b4ed5 100644 --- a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java +++ b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java @@ -26,7 +26,7 @@ public interface IdGenerator { /** * Generate a unique ID for the given content. Note: some generator, such as the - * random generator might not dependant on or use the content parameters. + * random generator might not dependent on or use the content parameters. * @param contents the content to generate an ID for. * @return the generated ID. */ diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java index 793c3fb7589..be118a9955b 100644 --- a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java +++ b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java @@ -63,18 +63,22 @@ private List doSplitDocuments(List documents) { List texts = new ArrayList<>(); List> metadataList = new ArrayList<>(); List formatters = new ArrayList<>(); + List scores = new ArrayList<>(); + List originalIds = new ArrayList<>(); for (Document doc : documents) { texts.add(doc.getText()); metadataList.add(doc.getMetadata()); formatters.add(doc.getContentFormatter()); + scores.add(doc.getScore()); + originalIds.add(doc.getId()); } - return createDocuments(texts, formatters, metadataList); + return createDocuments(texts, formatters, metadataList, scores, originalIds); } private List createDocuments(List texts, List formatters, - List> metadataList) { + List> metadataList, List scores, List originalIds) { // Process the data in a column oriented way and recreate the Document List documents = new ArrayList<>(); @@ -82,25 +86,38 @@ private List createDocuments(List texts, List metadata = metadataList.get(i); + Double originalScore = scores.get(i); + String originalId = originalIds.get(i); + List chunks = splitText(text); if (chunks.size() > 1) { logger.info("Splitting up document into " + chunks.size() + " chunks."); } - for (String chunk : chunks) { - // only primitive values are in here - - Map metadataCopy = metadata.entrySet() + + for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) { + String chunk = chunks.get(chunkIndex); + + Map enhancedMetadata = metadata.entrySet() .stream() .filter(e -> e.getKey() != null && e.getValue() != null) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - Document newDoc = new Document(chunk, metadataCopy); + + enhancedMetadata.put("parent_document_id", originalId); + enhancedMetadata.put("chunk_index", chunkIndex); + enhancedMetadata.put("total_chunks", chunks.size()); + + Document newDoc = Document.builder() + .text(chunk) + .metadata(enhancedMetadata) + .score(originalScore) + .build(); if (this.copyContentFormatter) { // Transfer the content-formatter of the parent to the chunked - // documents it was slit into. + // documents it was split into. newDoc.setContentFormatter(formatters.get(i)); } - // TODO copy over other properties. documents.add(newDoc); } } diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java index 88fee22e92d..c45ef0650ac 100644 --- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java +++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java @@ -82,9 +82,22 @@ public void testSplitText() { assertThat(chunks.get(3).getText()) .isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting."); - // Verify that the same, merged metadata is copied to all chunks. - assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); - assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata()); + // Verify that the original metadata is copied to all chunks (excluding + // chunk-specific fields) + assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + + // Verify chunk indices are correct + assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1); + assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1); assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3"); assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1"); @@ -148,7 +161,6 @@ public void pageNoChunkSplit() { @Test public void pageWithChunkSplit() { // given - var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough." + "1The most oppressive thing about the labyrinth is that you are constantly " + "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.", @@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() { assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man"); assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough."); - // Verify that the same, merged metadata is copied to all chunks. - assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); - assertThat(chunks.get(1).getMetadata()).containsKeys("key1"); + // Verify that the original metadata is copied to all chunks (with chunk-specific + // fields) + assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index", + "total_chunks"); + + // Verify chunk indices are different + assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1); // Verify that the content formatters are copied from the parents to the chunks. assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter); assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter); } + @Test + public void testScorePreservation() { + // given + Double originalScore = 0.95; + var doc = Document.builder() + .text("This is a test document that will be split into multiple chunks.") + .metadata(Map.of("source", "test.txt")) + .score(originalScore) + .build(); + + // when + List chunks = testTextSplitter.apply(List.of(doc)); + + // then + assertThat(chunks).hasSize(2); + assertThat(chunks.get(0).getScore()).isEqualTo(originalScore); + assertThat(chunks.get(1).getScore()).isEqualTo(originalScore); + } + + @Test + public void testParentDocumentTracking() { + // given + var doc1 = new Document("First document content for testing splitting functionality.", + Map.of("source", "doc1.txt")); + var doc2 = new Document("Second document content for testing splitting functionality.", + Map.of("source", "doc2.txt")); + + String originalId1 = doc1.getId(); + String originalId2 = doc2.getId(); + + // when + List chunks = testTextSplitter.apply(List.of(doc1, doc2)); + + // then + assertThat(chunks).hasSize(4); + + // Verify parent document tracking for doc1 chunks + assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1); + assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1); + + // Verify parent document tracking for doc2 chunks + assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2); + assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2); + } + + @Test + public void testChunkMetadataInformation() { + // given + var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.", + Map.of("source", "test.txt")); + + // when + List chunks = testTextSplitter.apply(List.of(doc)); + + // then + assertThat(chunks).hasSize(2); + + // Verify chunk index and total chunks for first chunk + assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2); + + // Verify chunk index and total chunks for second chunk + assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1); + assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2); + + // Verify original metadata is preserved + assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt"); + assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt"); + } + + @Test + public void testEnhancedMetadataWithMultipleDocuments() { + // given + var doc1 = Document.builder() + .text("First document with score and metadata.") + .metadata(Map.of("type", "article", "priority", "high")) + .score(0.8) + .build(); + + var doc2 = Document.builder() + .text("Second document with different score.") + .metadata(Map.of("type", "report", "priority", "medium")) + .score(0.6) + .build(); + + String originalId1 = doc1.getId(); + String originalId2 = doc2.getId(); + + // when + List chunks = testTextSplitter.apply(List.of(doc1, doc2)); + + // then + assertThat(chunks).hasSize(4); + + // Verify first document chunks + for (int i = 0; i < 2; i++) { + Document chunk = chunks.get(i); + assertThat(chunk.getScore()).isEqualTo(0.8); + assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1); + assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i); + assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2); + assertThat(chunk.getMetadata().get("type")).isEqualTo("article"); + assertThat(chunk.getMetadata().get("priority")).isEqualTo("high"); + } + + // Verify second document chunks + for (int i = 2; i < 4; i++) { + Document chunk = chunks.get(i); + assertThat(chunk.getScore()).isEqualTo(0.6); + assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2); + assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2); + assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2); + assertThat(chunk.getMetadata().get("type")).isEqualTo("report"); + assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium"); + } + } + } From 82baa4adada7f98026134f5cd4ea711b04faa27b Mon Sep 17 00:00:00 2001 From: Hyeri1ee Date: Sat, 20 Sep 2025 04:47:12 +0900 Subject: [PATCH 2/3] Fix test comments and update TokenTextSplitterTest for enhanced metadata - Correct comment: 'excluding' -> 'including' chunk-specific fields - Update TokenTextSplitterTest to handle new metadata fields - Ensure all tests pass with enhanced TextSplitter functionality Signed-off-by: Hyeri1ee --- .../splitter/TextSplitterTests.java | 2 +- .../splitter/TokenTextSplitterTest.java | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java index c45ef0650ac..66ee5f4e9f5 100644 --- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java +++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java @@ -82,7 +82,7 @@ public void testSplitText() { assertThat(chunks.get(3).getText()) .isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting."); - // Verify that the original metadata is copied to all chunks (excluding + // Verify that the original metadata is copied to all chunks (including // chunk-specific fields) assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", "total_chunks"); diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java index e803c8a4e40..96c58f3fa9a 100644 --- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java +++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java @@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() { assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but"); assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting"); - // Verify that the same, merged metadata is copied to all chunks. - assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); - assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata()); + // Verify that the original metadata is copied to all chunks (including + // chunk-specific fields) + assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index", + "total_chunks"); + + // Verify chunk indices are correct + assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1); + assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0); + assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1); assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3"); assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1"); From c1dbb977f189b4c95cd64e33b0e98d39695561a8 Mon Sep 17 00:00:00 2001 From: Eric Bottard Date: Thu, 9 Oct 2025 13:07:15 +0200 Subject: [PATCH 3/3] Fix the typo fix Signed-off-by: Eric Bottard --- .../java/org/springframework/ai/document/id/IdGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java index a06e55b4ed5..515e0bf797b 100644 --- a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java +++ b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java @@ -26,7 +26,7 @@ public interface IdGenerator { /** * Generate a unique ID for the given content. Note: some generator, such as the - * random generator might not dependent on or use the content parameters. + * random generator might not depend on or use the content parameters. * @param contents the content to generate an ID for. * @return the generated ID. */