From 6fcafb32bb4739d8166fc80dc7a499fb6783191f Mon Sep 17 00:00:00 2001 From: ogbozoyan Date: Sun, 20 Oct 2024 23:37:09 +0300 Subject: [PATCH 1/2] fix: fixed possible NPE while splitting document chunks --- .../ai/transformer/splitter/TextSplitter.java | 8 +++-- .../splitter/TextSplitterTests.java | 33 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java index 6f856510139..d5af648e59c 100644 --- a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java +++ b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java @@ -87,8 +87,12 @@ private List createDocuments(List texts, List metadataCopy = metadata.entrySet() - .stream() - .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue())); + .stream() + .filter(e -> e.getKey() != null && e.getValue() != null) + .collect(Collectors.toMap( + Map.Entry::getKey, + Map.Entry::getValue + )); Document newDoc = new Document(chunk, metadataCopy); if (this.copyContentFormatter) { diff --git a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java index ed6044da5e2..a5caf706ee0 100644 --- a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java +++ b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java @@ -42,7 +42,7 @@ protected List splitText(String text) { List chunks = new ArrayList<>(); chunks.add(text.substring(0, chuckSize)); - chunks.add(text.substring(chuckSize, text.length())); + chunks.add(text.substring(chuckSize)); return chunks; } @@ -213,4 +213,35 @@ public void pageWithChunkSplit() { () -> 
assertThat(splitedDocument.get(3).getMetadata().get("page_number")).isEqualTo(3)); } + @Test + public void testSplitTextWithNullMetadata() { + + var contentFormatter = DefaultContentFormatter.defaultConfig(); + + var doc = new Document("In the end, writing arises when man realizes that memory is not enough."); + + doc.getMetadata().put("key1", "value1"); + doc.getMetadata().put("key2", null); + + doc.setContentFormatter(contentFormatter); + + List chunks = testTextSplitter.apply(List.of(doc)); + + assertThat(testTextSplitter.isCopyContentFormatter()).isTrue(); + + assertThat(chunks).hasSize(2); + + // Doc chunks: + assertThat(chunks.get(0).getContent()).isEqualTo("In the end, writing arises when man"); + assertThat(chunks.get(1).getContent()).isEqualTo(" realizes that memory is not enough."); + + // Verify that both chunks carry identical metadata and that the non-null entry (key1) is kept. + assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata()); + assertThat(chunks.get(1).getMetadata()).containsKeys("key1"); + + // Verify that the content formatters are copied from the parents to the chunks. 
+ assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter); + assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter); + } + } From 3d30403b376123d8e1dc763ef6c7a53d35a68ae9 Mon Sep 17 00:00:00 2001 From: ogbozoyan Date: Tue, 22 Oct 2024 13:31:28 +0300 Subject: [PATCH 2/2] fix: fixed possible NPE while splitting document chunks --- .../ai/transformer/splitter/TextSplitter.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java index d5af648e59c..809fc556b8f 100644 --- a/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java +++ b/spring-ai-core/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java @@ -87,12 +87,9 @@ private List createDocuments(List texts, List metadataCopy = metadata.entrySet() - .stream() - .filter(e -> e.getKey() != null && e.getValue() != null) - .collect(Collectors.toMap( - Map.Entry::getKey, - Map.Entry::getValue - )); + .stream() + .filter(e -> e.getKey() != null && e.getValue() != null) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); Document newDoc = new Document(chunk, metadataCopy); if (this.copyContentFormatter) {