From f7c1f5e5b01de1cb44c901e4b6cbabbd24b09c85 Mon Sep 17 00:00:00 2001
From: Hyeri1ee <haerizian10@gmail.com>
Date: Sat, 20 Sep 2025 04:06:45 +0900
Subject: [PATCH 1/3] GH-4428: improve TextSplitter with property preservation
 and tracking

Signed-off-by: Hyeri1ee <haerizian10@gmail.com>
---
 .../ai/document/id/IdGenerator.java           |   2 +-
 .../ai/transformer/splitter/TextSplitter.java |  33 +++-
 .../splitter/TextSplitterTests.java           | 150 +++++++++++++++++-
 3 files changed, 169 insertions(+), 16 deletions(-)
diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
index 2251016f6d1..a06e55b4ed5 100644
--- a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
+++ b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
@@ -26,7 +26,7 @@ public interface IdGenerator {
 
 	/**
 	 * Generate a unique ID for the given content. Note: some generator, such as the
-	 * random generator might not dependant on or use the content parameters.
+	 * random generator might not dependent on or use the content parameters.
 	 * @param contents the content to generate an ID for.
 	 * @return the generated ID.
 	 */
diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
index 793c3fb7589..be118a9955b 100644
--- a/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
+++ b/spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java
@@ -63,18 +63,22 @@ private List<Document> doSplitDocuments(List<Document> documents) {
 		List<String> texts = new ArrayList<>();
 		List<Map<String, Object>> metadataList = new ArrayList<>();
 		List<ContentFormatter> formatters = new ArrayList<>();
+		List<Double> scores = new ArrayList<>();
+		List<String> originalIds = new ArrayList<>();
 
 		for (Document doc : documents) {
 			texts.add(doc.getText());
 			metadataList.add(doc.getMetadata());
 			formatters.add(doc.getContentFormatter());
+			scores.add(doc.getScore());
+			originalIds.add(doc.getId());
 		}
 
-		return createDocuments(texts, formatters, metadataList);
+		return createDocuments(texts, formatters, metadataList, scores, originalIds);
 	}
 
 	private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
-			List<Map<String, Object>> metadataList) {
+			List<Map<String, Object>> metadataList, List<Double> scores, List<String> originalIds) {
 
 		// Process the data in a column oriented way and recreate the Document
 		List<Document> documents = new ArrayList<>();
@@ -82,25 +86,38 @@ private List<Document> createDocuments(List<String> texts, List<ContentFormatter
 		for (int i = 0; i < texts.size(); i++) {
 			String text = texts.get(i);
 			Map<String, Object> metadata = metadataList.get(i);
+			Double originalScore = scores.get(i);
+			String originalId = originalIds.get(i);
+
 			List<String> chunks = splitText(text);
 			if (chunks.size() > 1) {
 				logger.info("Splitting up document into " + chunks.size() + " chunks.");
 			}
-			for (String chunk : chunks) {
-				// only primitive values are in here -
-				Map<String, Object> metadataCopy = metadata.entrySet()
+
+			for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
+				String chunk = chunks.get(chunkIndex);
+
+				Map<String, Object> enhancedMetadata = metadata.entrySet()
 					.stream()
 					.filter(e -> e.getKey() != null && e.getValue() != null)
 					.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
-				Document newDoc = new Document(chunk, metadataCopy);
+
+				enhancedMetadata.put("parent_document_id", originalId);
+				enhancedMetadata.put("chunk_index", chunkIndex);
+				enhancedMetadata.put("total_chunks", chunks.size());
+
+				Document newDoc = Document.builder()
+					.text(chunk)
+					.metadata(enhancedMetadata)
+					.score(originalScore)
+					.build();
 
 				if (this.copyContentFormatter) {
 					// Transfer the content-formatter of the parent to the chunked
-					// documents it was slit into.
+					// documents it was split into.
 					newDoc.setContentFormatter(formatters.get(i));
 				}
 
-				// TODO copy over other properties.
 				documents.add(newDoc);
 			}
 		}
diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
index 88fee22e92d..c45ef0650ac 100644
--- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
+++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
@@ -82,9 +82,22 @@ public void testSplitText() {
 		assertThat(chunks.get(3).getText())
 			.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
 
-		// Verify that the same, merged metadata is copied to all chunks.
-		assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-		assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
+		// Verify that the original metadata is copied to all chunks (excluding
+		// chunk-specific fields)
+		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+
+		// Verify chunk indices are correct
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+		assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
 		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
 
@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
 	@Test
 	public void pageWithChunkSplit() {
 		// given
-
 		var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
 				+ "1The most oppressive thing about the labyrinth is that you are constantly "
 				+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
 		assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man");
 		assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough.");
 
-		// Verify that the same, merged metadata is copied to all chunks.
-		assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-		assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
+		// Verify that the original metadata is copied to all chunks (with chunk-specific
+		// fields)
+		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
+				"total_chunks");
+
+		// Verify chunk indices are different
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
 
 		// Verify that the content formatters are copied from the parents to the chunks.
 		assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
 		assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
 	}
 
+	@Test
+	public void testScorePreservation() {
+		// given
+		Double originalScore = 0.95;
+		var doc = Document.builder()
+			.text("This is a test document that will be split into multiple chunks.")
+			.metadata(Map.of("source", "test.txt"))
+			.score(originalScore)
+			.build();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc));
+
+		// then
+		assertThat(chunks).hasSize(2);
+		assertThat(chunks.get(0).getScore()).isEqualTo(originalScore);
+		assertThat(chunks.get(1).getScore()).isEqualTo(originalScore);
+	}
+
+	@Test
+	public void testParentDocumentTracking() {
+		// given
+		var doc1 = new Document("First document content for testing splitting functionality.",
+				Map.of("source", "doc1.txt"));
+		var doc2 = new Document("Second document content for testing splitting functionality.",
+				Map.of("source", "doc2.txt"));
+
+		String originalId1 = doc1.getId();
+		String originalId2 = doc2.getId();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
+
+		// then
+		assertThat(chunks).hasSize(4);
+
+		// Verify parent document tracking for doc1 chunks
+		assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+		assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+
+		// Verify parent document tracking for doc2 chunks
+		assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
+		assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
+	}
+
+	@Test
+	public void testChunkMetadataInformation() {
+		// given
+		var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.",
+				Map.of("source", "test.txt"));
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc));
+
+		// then
+		assertThat(chunks).hasSize(2);
+
+		// Verify chunk index and total chunks for first chunk
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2);
+
+		// Verify chunk index and total chunks for second chunk
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+		assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2);
+
+		// Verify original metadata is preserved
+		assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt");
+		assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt");
+	}
+
+	@Test
+	public void testEnhancedMetadataWithMultipleDocuments() {
+		// given
+		var doc1 = Document.builder()
+			.text("First document with score and metadata.")
+			.metadata(Map.of("type", "article", "priority", "high"))
+			.score(0.8)
+			.build();
+
+		var doc2 = Document.builder()
+			.text("Second document with different score.")
+			.metadata(Map.of("type", "report", "priority", "medium"))
+			.score(0.6)
+			.build();
+
+		String originalId1 = doc1.getId();
+		String originalId2 = doc2.getId();
+
+		// when
+		List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
+
+		// then
+		assertThat(chunks).hasSize(4);
+
+		// Verify first document chunks
+		for (int i = 0; i < 2; i++) {
+			Document chunk = chunks.get(i);
+			assertThat(chunk.getScore()).isEqualTo(0.8);
+			assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1);
+			assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i);
+			assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
+			assertThat(chunk.getMetadata().get("type")).isEqualTo("article");
+			assertThat(chunk.getMetadata().get("priority")).isEqualTo("high");
+		}
+
+		// Verify second document chunks
+		for (int i = 2; i < 4; i++) {
+			Document chunk = chunks.get(i);
+			assertThat(chunk.getScore()).isEqualTo(0.6);
+			assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2);
+			assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2);
+			assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
+			assertThat(chunk.getMetadata().get("type")).isEqualTo("report");
+			assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium");
+		}
+	}
+
 }

From 82baa4adada7f98026134f5cd4ea711b04faa27b Mon Sep 17 00:00:00 2001
From: Hyeri1ee <haerizian10@gmail.com>
Date: Sat, 20 Sep 2025 04:47:12 +0900
Subject: [PATCH 2/3] Fix test comments and update TokenTextSplitterTest for
 enhanced metadata

- Correct comment: 'excluding' -> 'including' chunk-specific fields
- Update TokenTextSplitterTest to handle new metadata fields
- Ensure all tests pass with enhanced TextSplitter functionality

Signed-off-by: Hyeri1ee <haerizian10@gmail.com>
---
 .../splitter/TextSplitterTests.java           |  2 +-
 .../splitter/TokenTextSplitterTest.java       | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
index c45ef0650ac..66ee5f4e9f5 100644
--- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
+++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
@@ -82,7 +82,7 @@ public void testSplitText() {
 		assertThat(chunks.get(3).getText())
 			.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
 
-		// Verify that the original metadata is copied to all chunks (excluding
+		// Verify that the original metadata is copied to all chunks (including
 		// chunk-specific fields)
 		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
 				"total_chunks");
diff --git a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java
index e803c8a4e40..96c58f3fa9a 100644
--- a/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java
+++ b/spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java
@@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() {
 		assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but");
 		assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
 
-		// Verify that the same, merged metadata is copied to all chunks.
-		assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
-		assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
+		// Verify that the original metadata is copied to all chunks (including
+		// chunk-specific fields)
+		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+		assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
+				"total_chunks");
+
+		// Verify chunk indices are correct
+		assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
+		assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
+		assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
 
 		assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
 		assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");

From c1dbb977f189b4c95cd64e33b0e98d39695561a8 Mon Sep 17 00:00:00 2001
From: Eric Bottard <eric.bottard@broadcom.com>
Date: Thu, 9 Oct 2025 13:07:15 +0200
Subject: [PATCH 3/3] Fix the typo fix

Signed-off-by: Eric Bottard <eric.bottard@broadcom.com>
---
 .../java/org/springframework/ai/document/id/IdGenerator.java    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
index a06e55b4ed5..515e0bf797b 100644
--- a/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
+++ b/spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java
@@ -26,7 +26,7 @@ public interface IdGenerator {
 
 	/**
 	 * Generate a unique ID for the given content. Note: some generator, such as the
-	 * random generator might not dependent on or use the content parameters.
+	 * random generator might not depend on or use the content parameters.
 	 * @param contents the content to generate an ID for.
 	 * @return the generated ID.
 	 */