Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,12 @@ private List<Document> createDocuments(List<String> texts, List<ContentFormatter
for (String chunk : chunks) {
// only primitive values are in here -
Map<String, Object> metadataCopy = metadata.entrySet()
.stream()
.collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));
.stream()
.filter(e -> e.getKey() != null && e.getValue() != null)
.collect(Collectors.toMap(
Map.Entry::getKey,
Map.Entry::getValue
));
Document newDoc = new Document(chunk, metadataCopy);

if (this.copyContentFormatter) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ protected List<String> splitText(String text) {
List<String> chunks = new ArrayList<>();

chunks.add(text.substring(0, chuckSize));
chunks.add(text.substring(chuckSize, text.length()));
chunks.add(text.substring(chuckSize));

return chunks;
}
Expand Down Expand Up @@ -213,4 +213,35 @@ public void pageWithChunkSplit() {
() -> assertThat(splitedDocument.get(3).getMetadata().get("page_number")).isEqualTo(3));
}

/**
 * Splits a document whose metadata holds one regular entry and one entry with a
 * {@code null} value, then checks that the split succeeds, that both chunks carry
 * the same metadata, that the regular entry is kept, that the {@code null}-valued
 * entry is not copied, and that the content formatter is propagated to every chunk.
 */
@Test
public void testSplitTextWithNullMetadata() {

	var contentFormatter = DefaultContentFormatter.defaultConfig();

	var doc = new Document("In the end, writing arises when man realizes that memory is not enough.");

	doc.getMetadata().put("key1", "value1");
	// The null value is the case under test: copying the metadata must not fail on it.
	doc.getMetadata().put("key2", null);

	doc.setContentFormatter(contentFormatter);

	List<Document> chunks = testTextSplitter.apply(List.of(doc));

	assertThat(testTextSplitter.isCopyContentFormatter()).isTrue();

	assertThat(chunks).hasSize(2);

	// Doc chunks:
	assertThat(chunks.get(0).getContent()).isEqualTo("In the end, writing arises when man");
	assertThat(chunks.get(1).getContent()).isEqualTo(" realizes that memory is not enough.");

	// Verify that both chunks carry the same metadata.
	assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
	// The non-null entry must survive the copy ...
	assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
	// ... while the null-valued entry must be dropped instead of being copied.
	assertThat(chunks.get(0).getMetadata()).doesNotContainKey("key2");
	assertThat(chunks.get(1).getMetadata()).doesNotContainKey("key2");

	// Verify that the content formatters are copied from the parents to the chunks.
	assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
	assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
}

}
Loading