From 6dc825e112250c6f323ff0ab2f5b01895e8b994e Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 18:13:13 +0200 Subject: [PATCH 01/11] Start working with Markdown document reader --- document-readers/markdown-reader/pom.xml | 46 ++++++++ .../markdown/MarkdownDocumentReader.java | 103 ++++++++++++++++++ .../markdown/MarkdownDocumentReaderTest.java | 35 ++++++ .../src/test/resources/only-headers.md | 20 ++++ pom.xml | 2 + 5 files changed, 206 insertions(+) create mode 100644 document-readers/markdown-reader/pom.xml create mode 100644 document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java create mode 100644 document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java create mode 100644 document-readers/markdown-reader/src/test/resources/only-headers.md diff --git a/document-readers/markdown-reader/pom.xml b/document-readers/markdown-reader/pom.xml new file mode 100644 index 00000000000..5922ea2b4ed --- /dev/null +++ b/document-readers/markdown-reader/pom.xml @@ -0,0 +1,46 @@ + + + 4.0.0 + + org.springframework.ai + spring-ai + 1.0.0-SNAPSHOT + ../../pom.xml + + spring-ai-markdown-document-reader + jar + Spring AI Document Reader - Markdown + Spring AI Markdown document reader + https://github.com/spring-projects/spring-ai + + + https://github.com/spring-projects/spring-ai + git://github.com/spring-projects/spring-ai.git + git@github.com:spring-projects/spring-ai.git + + + + + org.springframework.ai + spring-ai-core + ${parent.version} + + + + org.commonmark + commonmark + ${commonmark.version} + + + + + org.springframework.boot + spring-boot-starter-test + test + + + + + diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java new file mode 100644 
index 00000000000..f10d379cb40 --- /dev/null +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -0,0 +1,103 @@ +package org.springframework.ai.reader.markdown; + +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.Heading; +import org.commonmark.node.Node; +import org.commonmark.node.Text; +import org.commonmark.parser.Parser; +import org.springframework.ai.document.Document; +import org.springframework.ai.document.DocumentReader; +import org.springframework.core.io.DefaultResourceLoader; +import org.springframework.core.io.Resource; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +/** + * Reads the given Markdown resource and groups headers paragraphs into + * {@link Document}'s. + * + * @author Piotr Olaszewski + */ +public class MarkdownDocumentReader implements DocumentReader { + + private final Resource markdownResource; + + private final Parser parser; + + public MarkdownDocumentReader(String markdownResource) { + this(new DefaultResourceLoader().getResource(markdownResource)); + } + + public MarkdownDocumentReader(Resource markdownResource) { + this.markdownResource = markdownResource; + this.parser = Parser.builder().build(); + } + + @Override + public List get() { + try (var input = markdownResource.getInputStream()) { + Node node = parser.parseReader(new InputStreamReader(input)); + + DocumentVisitor documentVisitor = new DocumentVisitor(); + node.accept(documentVisitor); + + return documentVisitor.getDocuments(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + static class DocumentVisitor extends AbstractVisitor { + + private final List documents = new ArrayList<>(); + + private final List currentParagraphs = new ArrayList<>(); + + private Document.Builder currentDocumentBuilder; + + @Override + public void visit(Heading heading) { + buildAndFlush(); + + 
currentDocumentBuilder = Document.builder(); + + super.visit(heading); + } + + @Override + public void visit(Text text) { + if (text.getParent() instanceof Heading heading) { + currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel())) + .withMetadata("title", text.getLiteral()); + } + else { + currentParagraphs.add(text.getLiteral()); + } + + super.visit(text); + } + + public List getDocuments() { + buildAndFlush(); + + return documents; + } + + private void buildAndFlush() { + if (!currentParagraphs.isEmpty()) { + String content = String.join("", currentParagraphs); + + Document document = currentDocumentBuilder.withContent(content).build(); + documents.add(document); + + currentParagraphs.clear(); + } + } + + } + +} diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java new file mode 100644 index 00000000000..5fa0f791413 --- /dev/null +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -0,0 +1,35 @@ +package org.springframework.ai.reader.markdown; + +import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; + +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.groups.Tuple.tuple; + +/** + * @author Piotr Olaszewski + */ +class MarkdownDocumentReaderTest { + + @Test + void testOnlyHeadersWithParagraphs() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md"); + + List documents = reader.get(); + + assertThat(documents).hasSize(4) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "Header 1a"), + "Lorem ipsum dolor sit amet, consectetur adipiscing 
elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sednisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "header_1", "title", "Header 1b"), + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sedsollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), + tuple(Map.of("category", "header_2", "title", "Header 2b"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapienodio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), + tuple(Map.of("category", "header_2", "title", "Header 2c"), + "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + +} diff --git a/document-readers/markdown-reader/src/test/resources/only-headers.md b/document-readers/markdown-reader/src/test/resources/only-headers.md new file mode 100644 index 00000000000..81c770e875a --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/only-headers.md @@ -0,0 +1,20 @@ +# Header 1a + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +# Header 1b + +Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed +sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh. + +## Header 2b + +Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien +odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. 
+ +# Header 1c + +## Header 2c + +Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. diff --git a/pom.xml b/pom.xml index d7f9e6b4c60..874ee60e424 100644 --- a/pom.xml +++ b/pom.xml @@ -23,6 +23,7 @@ spring-ai-spring-boot-testcontainers spring-ai-spring-cloud-bindings + document-readers/markdown-reader document-readers/pdf-reader document-readers/tika-reader @@ -186,6 +187,7 @@ 1.9.1 0.5.0 2.10.1 + 0.22.0 5.3.1 From b18d91a0817b4410f7b23f99319def3e22a0be99 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 18:39:21 +0200 Subject: [PATCH 02/11] Add documents content for text with formatting --- .../markdown/MarkdownDocumentReader.java | 13 ++++++++---- .../markdown/MarkdownDocumentReaderTest.java | 21 ++++++++++++++++--- .../src/test/resources/with-formatting.md | 9 ++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) create mode 100644 document-readers/markdown-reader/src/test/resources/with-formatting.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index f10d379cb40..9503825647c 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -1,9 +1,6 @@ package org.springframework.ai.reader.markdown; -import org.commonmark.node.AbstractVisitor; -import org.commonmark.node.Heading; -import org.commonmark.node.Node; -import org.commonmark.node.Text; +import org.commonmark.node.*; import org.commonmark.parser.Parser; import org.springframework.ai.document.Document; import org.springframework.ai.document.DocumentReader; @@ -68,6 +65,14 @@ public void visit(Heading 
heading) { super.visit(heading); } + @Override + public void visit(SoftLineBreak softLineBreak) { + if (!currentParagraphs.isEmpty()) { + currentParagraphs.add(" "); + } + super.visit(softLineBreak); + } + @Override public void visit(Text text) { if (text.getParent() instanceof Heading heading) { diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 5fa0f791413..33f76b4ddd3 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -23,13 +23,28 @@ void testOnlyHeadersWithParagraphs() { assertThat(documents).hasSize(4) .extracting(Document::getMetadata, Document::getContent) .containsOnly(tuple(Map.of("category", "header_1", "title", "Header 1a"), - "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sednisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), tuple(Map.of("category", "header_1", "title", "Header 1b"), - "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sedsollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. 
Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), tuple(Map.of("category", "header_2", "title", "Header 2b"), - "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapienodio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), tuple(Map.of("category", "header_2", "title", "Header 2c"), "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); } + @Test + void testWithFormatting() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/with-formatting.md"); + + List documents = reader.get(); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of("category", "header_3", "title", "Header 3"), + "Aenean eu leo eu nibh tristique posuere quis quis massa.")); + + } + } diff --git a/document-readers/markdown-reader/src/test/resources/with-formatting.md b/document-readers/markdown-reader/src/test/resources/with-formatting.md new file mode 100644 index 00000000000..963743ece30 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/with-formatting.md @@ -0,0 +1,9 @@ +# This is a fancy header name + +Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. 
Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum*** +dignissim. + +### Header 3 + +Aenean eu leo eu nibh tristique _posuere quis quis massa_. From 412c3024c20c93af6df361df03e53b292c4a72f9 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 20:14:58 +0200 Subject: [PATCH 03/11] Handle horizontal rules --- .../markdown/MarkdownDocumentReader.java | 38 +++++++++++++--- .../config/MarkdownDocumentReaderConfig.java | 40 +++++++++++++++++ .../markdown/MarkdownDocumentReaderTest.java | 45 +++++++++++++++++++ .../src/test/resources/horizontal-rules.md | 27 +++++++++++ 4 files changed, 144 insertions(+), 6 deletions(-) create mode 100644 document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java create mode 100644 document-readers/markdown-reader/src/test/resources/horizontal-rules.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 9503825647c..f8d3fe74b3a 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -4,6 +4,7 @@ import org.commonmark.parser.Parser; import org.springframework.ai.document.Document; import org.springframework.ai.document.DocumentReader; +import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.Resource; @@ -22,14 +23,21 @@ public class MarkdownDocumentReader implements DocumentReader { private final Resource markdownResource; + private final MarkdownDocumentReaderConfig config; + private final Parser parser; 
public MarkdownDocumentReader(String markdownResource) { - this(new DefaultResourceLoader().getResource(markdownResource)); + this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig()); + } + + public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) { + this(new DefaultResourceLoader().getResource(markdownResource), config); } - public MarkdownDocumentReader(Resource markdownResource) { + public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) { this.markdownResource = markdownResource; + this.config = config; this.parser = Parser.builder().build(); } @@ -38,7 +46,7 @@ public List get() { try (var input = markdownResource.getInputStream()) { Node node = parser.parseReader(new InputStreamReader(input)); - DocumentVisitor documentVisitor = new DocumentVisitor(); + DocumentVisitor documentVisitor = new DocumentVisitor(config); node.accept(documentVisitor); return documentVisitor.getDocuments(); @@ -54,17 +62,34 @@ static class DocumentVisitor extends AbstractVisitor { private final List currentParagraphs = new ArrayList<>(); + private final MarkdownDocumentReaderConfig config; + private Document.Builder currentDocumentBuilder; - @Override - public void visit(Heading heading) { - buildAndFlush(); + public DocumentVisitor(MarkdownDocumentReaderConfig config) { + this.config = config; + } + @Override + public void visit(org.commonmark.node.Document document) { currentDocumentBuilder = Document.builder(); + super.visit(document); + } + @Override + public void visit(Heading heading) { + buildAndFlush(); super.visit(heading); } + @Override + public void visit(ThematicBreak thematicBreak) { + if (config.horizontalRuleCreateDocument) { + buildAndFlush(); + } + super.visit(thematicBreak); + } + @Override public void visit(SoftLineBreak softLineBreak) { if (!currentParagraphs.isEmpty()) { @@ -101,6 +126,7 @@ private void buildAndFlush() { 
currentParagraphs.clear(); } + currentDocumentBuilder = Document.builder(); } } diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java new file mode 100644 index 00000000000..93cb2a64191 --- /dev/null +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java @@ -0,0 +1,40 @@ +package org.springframework.ai.reader.markdown.config; + +/** + * @author Piotr Olaszewski + */ +public class MarkdownDocumentReaderConfig { + + public final boolean horizontalRuleCreateDocument; + + public MarkdownDocumentReaderConfig(Builder builder) { + horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; + } + + public static MarkdownDocumentReaderConfig defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static class Builder { + + private boolean horizontalRuleCreateDocument = false; + + private Builder() { + } + + public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) { + this.horizontalRuleCreateDocument = horizontalRuleCreateDocument; + return this; + } + + public MarkdownDocumentReaderConfig build() { + return new MarkdownDocumentReaderConfig(this); + } + + } + +} diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 33f76b4ddd3..5542d1844d2 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ 
b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -2,6 +2,7 @@ import org.junit.jupiter.api.Test; import org.springframework.ai.document.Document; +import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; import java.util.List; import java.util.Map; @@ -44,7 +45,51 @@ void testWithFormatting() { "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), tuple(Map.of("category", "header_3", "title", "Header 3"), "Aenean eu leo eu nibh tristique posuere quis quis massa.")); + } + + @Test + void testDocumentDividedViaHorizontalRules() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/horizontal-rules.md", config); + + List documents = reader.get(); + + assertThat(documents).hasSize(7) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida."), + tuple(Map.of(), + "Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of(), + "Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna."), + tuple(Map.of(), + "Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit amet sagittis."), + tuple(Map.of(), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."), + tuple(Map.of(), + "Aenean quis vulputate mi. 
Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus."), + tuple(Map.of(), "Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi.")); + } + + @Test + void testDocumentNotDividedViaHorizontalRulesWhenIsDisabled() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withHorizontalRuleCreateDocument(false) + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/horizontal-rules.md", config); + + List documents = reader.get(); + + assertThat(documents).hasSize(1); + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit") + .endsWith("Phasellus eget tellus sed nibh ornare interdum eu eu mi."); } } diff --git a/document-readers/markdown-reader/src/test/resources/horizontal-rules.md b/document-readers/markdown-reader/src/test/resources/horizontal-rules.md new file mode 100644 index 00000000000..f7affefc124 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/horizontal-rules.md @@ -0,0 +1,27 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. + +--- + +Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu +elementum dignissim. + +*** +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis +et magna. + +* * * + +Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit +amet sagittis. + +***** + +Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula. + +--------------------------------------- + +Aenean quis vulputate mi. 
Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus. + +- - - + +Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi. From bf606c847d707aedb361b5b7995a990c7b916e44 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 20:39:15 +0200 Subject: [PATCH 04/11] Handle hard line break --- .../ai/reader/markdown/MarkdownDocumentReader.java | 8 ++++++++ .../markdown/MarkdownDocumentReaderTest.java | 14 ++++++++++++++ .../markdown-reader/src/test/resources/simple.md | 8 ++++++++ 3 files changed, 30 insertions(+) create mode 100644 document-readers/markdown-reader/src/test/resources/simple.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index f8d3fe74b3a..b44c615e66f 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -98,6 +98,14 @@ public void visit(SoftLineBreak softLineBreak) { super.visit(softLineBreak); } + @Override + public void visit(HardLineBreak hardLineBreak) { + if (!currentParagraphs.isEmpty()) { + currentParagraphs.add(" "); + } + super.visit(hardLineBreak); + } + @Override public void visit(Text text) { if (text.getParent() instanceof Heading heading) { diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 5542d1844d2..27282e7ea47 100644 --- 
a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -92,4 +92,18 @@ void testDocumentNotDividedViaHorizontalRulesWhenIsDisabled() { .endsWith("Phasellus eget tellus sed nibh ornare interdum eu eu mi."); } + @Test + void testSimpleMarkdownDocumentWithHardAndSoftLineBreaks() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/simple.md"); + + List documents = reader.get(); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."); + } + } diff --git a/document-readers/markdown-reader/src/test/resources/simple.md b/document-readers/markdown-reader/src/test/resources/simple.md new file mode 100644 index 00000000000..3275c89b8fc --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/simple.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim. + +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. 
Vestibulum nec eros non felis fermentum posuere eget ac risus. + +Aenean eu leo eu nibh tristique posuere quis quis massa.\ +Nullam lacinia luctus sem ut vehicula. + From b3303d0da35a5acd873fac5527d3a3a191e853d4 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 20:43:36 +0200 Subject: [PATCH 05/11] Handle hard line break - refactor --- .../ai/reader/markdown/MarkdownDocumentReader.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index b44c615e66f..63830699ae1 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -92,17 +92,13 @@ public void visit(ThematicBreak thematicBreak) { @Override public void visit(SoftLineBreak softLineBreak) { - if (!currentParagraphs.isEmpty()) { - currentParagraphs.add(" "); - } + lineBreakToSpaceTranslate(); super.visit(softLineBreak); } @Override public void visit(HardLineBreak hardLineBreak) { - if (!currentParagraphs.isEmpty()) { - currentParagraphs.add(" "); - } + lineBreakToSpaceTranslate(); super.visit(hardLineBreak); } @@ -137,6 +133,12 @@ private void buildAndFlush() { currentDocumentBuilder = Document.builder(); } + private void lineBreakToSpaceTranslate() { + if (!currentParagraphs.isEmpty()) { + currentParagraphs.add(" "); + } + } + } } From 90ed6f3a057f7222120f814cc1d387c673150a22 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 20:46:46 +0200 Subject: [PATCH 06/11] Handle hard line break - refactor --- .../ai/reader/markdown/MarkdownDocumentReader.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 63830699ae1..0a81eb466bb 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -92,13 +92,13 @@ public void visit(ThematicBreak thematicBreak) { @Override public void visit(SoftLineBreak softLineBreak) { - lineBreakToSpaceTranslate(); + translateLineBreakToSpace(); super.visit(softLineBreak); } @Override public void visit(HardLineBreak hardLineBreak) { - lineBreakToSpaceTranslate(); + translateLineBreakToSpace(); super.visit(hardLineBreak); } @@ -133,7 +133,7 @@ private void buildAndFlush() { currentDocumentBuilder = Document.builder(); } - private void lineBreakToSpaceTranslate() { + private void translateLineBreakToSpace() { if (!currentParagraphs.isEmpty()) { currentParagraphs.add(" "); } From aeae2ad41e562f053ee867fd6fda0fdbe36a79a0 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 21:29:09 +0200 Subject: [PATCH 07/11] Handle inline and block codes --- .../markdown/MarkdownDocumentReader.java | 23 ++++++++ .../config/MarkdownDocumentReaderConfig.java | 10 ++++ .../markdown/MarkdownDocumentReaderTest.java | 57 +++++++++++++++++++ .../src/test/resources/code.md | 25 ++++++++ 4 files changed, 115 insertions(+) create mode 100644 document-readers/markdown-reader/src/test/resources/code.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 0a81eb466bb..97b233ff428 100644 --- 
a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -102,6 +102,29 @@ public void visit(HardLineBreak hardLineBreak) { super.visit(hardLineBreak); } + @Override + public void visit(Code code) { + currentParagraphs.add(code.getLiteral()); + currentDocumentBuilder.withMetadata("code", "inline"); + super.visit(code); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + if (!config.includeCodeBlock) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + currentParagraphs.add(fencedCodeBlock.getLiteral()); + currentDocumentBuilder.withMetadata("code", "block"); + currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo()); + + buildAndFlush(); + + super.visit(fencedCodeBlock); + } + @Override public void visit(Text text) { if (text.getParent() instanceof Heading heading) { diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java index 93cb2a64191..80498da4508 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java @@ -7,8 +7,11 @@ public class MarkdownDocumentReaderConfig { public final boolean horizontalRuleCreateDocument; + public final boolean includeCodeBlock; + public MarkdownDocumentReaderConfig(Builder builder) { horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; + includeCodeBlock = builder.includeCodeBlock; } public static MarkdownDocumentReaderConfig defaultConfig() { @@ -23,6 +26,8 @@ public static 
class Builder { private boolean horizontalRuleCreateDocument = false; + private boolean includeCodeBlock = false; + private Builder() { } @@ -31,6 +36,11 @@ public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocu return this; } + public Builder withIncludeCodeBlock(boolean includeCodeBlock) { + this.includeCodeBlock = includeCodeBlock; + return this; + } + public MarkdownDocumentReaderConfig build() { return new MarkdownDocumentReaderConfig(this); } diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 27282e7ea47..032b6fb143b 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -106,4 +106,61 @@ void testSimpleMarkdownDocumentWithHardAndSoftLineBreaks() { "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.Aenean eu leo eu nibh tristique posuere quis quis massa. 
Nullam lacinia luctus sem ut vehicula."); } + @Test + void testCode() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/code.md", config); + + List documents = reader.get(); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()).isEqualTo("This is a Java sample application:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "code", "block")); + assertThat(document.getContent()).startsWith("package com.example.demo;") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("code", "inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()) + .isEqualTo("Another possibility is to set block code without specific highlighting:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "code", "block")); + assertThat(document.getContent()).isEqualTo("./mvnw spring-javaformat:apply\n"); + }); + } + + @Test + void testCodeWhenCodeBlockShouldNotBeSeparatedDocument() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withHorizontalRuleCreateDocument(true) + .withIncludeCodeBlock(true) + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/code.md", config); + + List documents = reader.get(); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "code", "block")); + assertThat(document.getContent()).startsWith("This is a 
Java sample application: package com.example.demo") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("code", "inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "code", "block")); + assertThat(document.getContent()).isEqualTo( + "Another possibility is to set block code without specific highlighting: ./mvnw spring-javaformat:apply\n"); + }); + } + } diff --git a/document-readers/markdown-reader/src/test/resources/code.md b/document-readers/markdown-reader/src/test/resources/code.md new file mode 100644 index 00000000000..31d7c7b0319 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/code.md @@ -0,0 +1,25 @@ +This is a Java sample application: + +```java +package com.example.demo; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +@SpringBootApplication +public class DemoApplication { + public static void main(String[] args) { + SpringApplication.run(DemoApplication.class, args); + } +} +``` + +Markdown also provides the possibility to `use inline code formatting throughout` the entire sentence. 
+ +--- + +Another possibility is to set block code without specific highlighting: + +``` +./mvnw spring-javaformat:apply +``` From 28634d9d8ab5368821a351fbc683fd153362c014 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Tue, 23 Jul 2024 21:47:50 +0200 Subject: [PATCH 08/11] Handle blockquote --- .../markdown/MarkdownDocumentReader.java | 15 ++++++- .../config/MarkdownDocumentReaderConfig.java | 10 +++++ .../markdown/MarkdownDocumentReaderTest.java | 44 ++++++++++++++++--- .../src/test/resources/blockquote.md | 8 ++++ 4 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 document-readers/markdown-reader/src/test/resources/blockquote.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 97b233ff428..9dd8cb22009 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -102,10 +102,21 @@ public void visit(HardLineBreak hardLineBreak) { super.visit(hardLineBreak); } + @Override + public void visit(BlockQuote blockQuote) { + if (!config.includeBlockquote) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + currentDocumentBuilder.withMetadata("category", "blockquote"); + super.visit(blockQuote); + } + @Override public void visit(Code code) { currentParagraphs.add(code.getLiteral()); - currentDocumentBuilder.withMetadata("code", "inline"); + currentDocumentBuilder.withMetadata("category", "code_inline"); super.visit(code); } @@ -117,7 +128,7 @@ public void visit(FencedCodeBlock fencedCodeBlock) { translateLineBreakToSpace(); currentParagraphs.add(fencedCodeBlock.getLiteral()); - currentDocumentBuilder.withMetadata("code", "block"); + 
currentDocumentBuilder.withMetadata("category", "code_block"); currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo()); buildAndFlush(); diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java index 80498da4508..015182ecfa6 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java @@ -9,9 +9,12 @@ public class MarkdownDocumentReaderConfig { public final boolean includeCodeBlock; + public final boolean includeBlockquote; + public MarkdownDocumentReaderConfig(Builder builder) { horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; includeCodeBlock = builder.includeCodeBlock; + includeBlockquote = builder.includeBlockquote; } public static MarkdownDocumentReaderConfig defaultConfig() { @@ -28,6 +31,8 @@ public static class Builder { private boolean includeCodeBlock = false; + private boolean includeBlockquote = false; + private Builder() { } @@ -41,6 +46,11 @@ public Builder withIncludeCodeBlock(boolean includeCodeBlock) { return this; } + public Builder withIncludeBlockquote(boolean includeBlockquote) { + this.includeBlockquote = includeBlockquote; + return this; + } + public MarkdownDocumentReaderConfig build() { return new MarkdownDocumentReaderConfig(this); } diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 032b6fb143b..5880bb29341 100644 --- 
a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -120,11 +120,11 @@ void testCode() { assertThat(document.getMetadata()).isEqualTo(Map.of()); assertThat(document.getContent()).isEqualTo("This is a Java sample application:"); }, document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "code", "block")); + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); assertThat(document.getContent()).startsWith("package com.example.demo;") .contains("SpringApplication.run(DemoApplication.class, args);"); }, document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("code", "inline")); + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); assertThat(document.getContent()).isEqualTo( "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); }, document -> { @@ -132,7 +132,7 @@ void testCode() { assertThat(document.getContent()) .isEqualTo("Another possibility is to set block code without specific highlighting:"); }, document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "code", "block")); + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); assertThat(document.getContent()).isEqualTo("./mvnw spring-javaformat:apply\n"); }); } @@ -149,18 +149,50 @@ void testCodeWhenCodeBlockShouldNotBeSeparatedDocument() { List documents = reader.get(); assertThat(documents).satisfiesExactly(document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "code", "block")); + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); assertThat(document.getContent()).startsWith("This is a Java sample application: package 
com.example.demo") .contains("SpringApplication.run(DemoApplication.class, args);"); }, document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("code", "inline")); + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); assertThat(document.getContent()).isEqualTo( "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); }, document -> { - assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "code", "block")); + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); assertThat(document.getContent()).isEqualTo( "Another possibility is to set block code without specific highlighting: ./mvnw spring-javaformat:apply\n"); }); } + @Test + void testBlockquote() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/blockquote.md"); + + List documents = reader.get(); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "blockquote"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. 
Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testBlockquoteWhenBlockquoteShouldNotBeSeparatedDocument() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withIncludeBlockquote(true) + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/blockquote.md", config); + + List documents = reader.get(); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("category", "blockquote")); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."); + } + } diff --git a/document-readers/markdown-reader/src/test/resources/blockquote.md b/document-readers/markdown-reader/src/test/resources/blockquote.md new file mode 100644 index 00000000000..d92ac44f6cd --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/blockquote.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget +> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a +> porttitor. 
Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum +> suscipit. + From f90cde4d16b98bdc48365ec9e64d33d9cf16788c Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Wed, 24 Jul 2024 08:24:17 +0200 Subject: [PATCH 09/11] Handle ordered and unordered lists --- .../reader/markdown/MarkdownDocumentReader.java | 6 ++++++ .../markdown/MarkdownDocumentReaderTest.java | 14 ++++++++++++++ .../markdown-reader/src/test/resources/lists.md | 17 +++++++++++++++++ 3 files changed, 37 insertions(+) create mode 100644 document-readers/markdown-reader/src/test/resources/lists.md diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 9dd8cb22009..51b0408838f 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -102,6 +102,12 @@ public void visit(HardLineBreak hardLineBreak) { super.visit(hardLineBreak); } + @Override + public void visit(ListItem listItem) { + translateLineBreakToSpace(); + super.visit(listItem); + } + @Override public void visit(BlockQuote blockQuote) { if (!config.includeBlockquote) { diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 5880bb29341..83c6c561def 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -195,4 
+195,18 @@ void testBlockquoteWhenBlockquoteShouldNotBeSeparatedDocument() { "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."); } + @Test + void testLists() { + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/lists.md"); + + List documents = reader.get(); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_2", "title", "Ordered list"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor."), + tuple(Map.of("category", "header_2", "title", "Unordered list"), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus tortor. 
Etiam facilisis enim in egestas dictum.")); + } + } diff --git a/document-readers/markdown-reader/src/test/resources/lists.md b/document-readers/markdown-reader/src/test/resources/lists.md new file mode 100644 index 00000000000..f82e7e34521 --- /dev/null +++ b/document-readers/markdown-reader/src/test/resources/lists.md @@ -0,0 +1,17 @@ +## Ordered list + +1. Lorem ipsum dolor sit *amet*, consectetur adipiscing elit. **Curabitur** diam eros, laoreet sit _amet_ cursus vitae, + varius sed nisi. +2. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. +3. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget + sapien odio. + 1. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum + suscipit. + 2. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. + +## Unordered list + +* Aenean eu leo eu nibh tristique posuere quis quis massa. +* Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus + tortor. + * Etiam facilisis enim in egestas dictum. 
From 5a2b9673da26614829c41db485397d24e8507d96 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Wed, 24 Jul 2024 08:58:39 +0200 Subject: [PATCH 10/11] Add JavaDocs --- .../markdown/MarkdownDocumentReader.java | 22 +++++++++++-- .../config/MarkdownDocumentReaderConfig.java | 33 +++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index 51b0408838f..391cb267916 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -14,17 +14,28 @@ import java.util.List; /** - * Reads the given Markdown resource and groups headers paragraphs into - * {@link Document}'s. + * Reads the given Markdown resource and groups headers, paragraphs, or text divided by + * horizontal lines (depending on the + * {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into + * {@link Document}s. * * @author Piotr Olaszewski */ public class MarkdownDocumentReader implements DocumentReader { + /** + * The resource pointing to the Markdown document. + */ private final Resource markdownResource; + /** + * Configuration for the parsing process. + */ private final MarkdownDocumentReaderConfig config; + /** + * Markdown parser. + */ private final Parser parser; public MarkdownDocumentReader(String markdownResource) { @@ -41,6 +52,10 @@ public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderC this.parser = Parser.builder().build(); } + /** + * Extracts and returns a list of documents from the resource.
+ * @return List of extracted {@link Document} + */ @Override public List get() { try (var input = markdownResource.getInputStream()) { @@ -56,6 +71,9 @@ public List get() { } } + /** + * A convenient class for visiting handled nodes in the Markdown document. + */ static class DocumentVisitor extends AbstractVisitor { private final List documents = new ArrayList<>(); diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java index 015182ecfa6..9b802aa428d 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java @@ -1,6 +1,11 @@ package org.springframework.ai.reader.markdown.config; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.markdown.MarkdownDocumentReader; + /** + * Common configuration for the {@link MarkdownDocumentReader}. + * * @author Piotr Olaszewski */ public class MarkdownDocumentReaderConfig { @@ -17,6 +22,9 @@ public MarkdownDocumentReaderConfig(Builder builder) { includeBlockquote = builder.includeBlockquote; } + /** + * @return the default configuration + */ public static MarkdownDocumentReaderConfig defaultConfig() { return builder().build(); } @@ -36,21 +44,46 @@ public static class Builder { private Builder() { } + /** + * Text divided by horizontal lines will create new {@link Document}s. The default + * is {@code false}, meaning text separated by horizontal lines won't create a new + * document. 
+ * @param horizontalRuleCreateDocument flag to determine whether new documents are + * created from text divided by horizontal line + * @return this builder + */ public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) { this.horizontalRuleCreateDocument = horizontalRuleCreateDocument; return this; } + /** + * Whether to include code blocks in {@link Document}s. The default is + * {@code false}, which means all code blocks are in separate documents. + * @param includeCodeBlock flag to include code block into paragraph document or + * create new with code only + * @return this builder + */ public Builder withIncludeCodeBlock(boolean includeCodeBlock) { this.includeCodeBlock = includeCodeBlock; return this; } + /** + * Whether to include blockquotes in {@link Document}s. The default is + * {@code false}, which means all blockquotes are in separate documents. + * @param includeBlockquote flag to include blockquotes into paragraph document or + * create new with blockquote only + * @return this builder + */ public Builder withIncludeBlockquote(boolean includeBlockquote) { this.includeBlockquote = includeBlockquote; return this; } + /** + * @return the immutable configuration + */ public MarkdownDocumentReaderConfig build() { return new MarkdownDocumentReaderConfig(this); } From 4a71e01a09484b1b544af6113bbe781d2db02dc0 Mon Sep 17 00:00:00 2001 From: Piotr Olaszewski Date: Wed, 24 Jul 2024 09:15:33 +0200 Subject: [PATCH 11/11] Introduce additional metadata --- .../markdown/MarkdownDocumentReader.java | 7 ++++- .../config/MarkdownDocumentReaderConfig.java | 30 +++++++++++++++++++ .../markdown/MarkdownDocumentReaderTest.java | 18 +++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java index
391cb267916..7ed8aa6b548 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java @@ -183,7 +183,12 @@ private void buildAndFlush() { if (!currentParagraphs.isEmpty()) { String content = String.join("", currentParagraphs); - Document document = currentDocumentBuilder.withContent(content).build(); + Document.Builder builder = currentDocumentBuilder.withContent(content); + + config.additionalMetadata.forEach(builder::withMetadata); + + Document document = builder.build(); + documents.add(document); currentParagraphs.clear(); diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java index 9b802aa428d..d5ad3ec58ce 100644 --- a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java +++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java @@ -2,6 +2,10 @@ import org.springframework.ai.document.Document; import org.springframework.ai.reader.markdown.MarkdownDocumentReader; +import org.springframework.util.Assert; + +import java.util.HashMap; +import java.util.Map; /** * Common configuration for the {@link MarkdownDocumentReader}. 
@@ -16,10 +20,13 @@ public class MarkdownDocumentReaderConfig { public final boolean includeBlockquote; + public final Map additionalMetadata; + public MarkdownDocumentReaderConfig(Builder builder) { horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; includeCodeBlock = builder.includeCodeBlock; includeBlockquote = builder.includeBlockquote; + additionalMetadata = builder.additionalMetadata; } /** @@ -41,6 +48,8 @@ public static class Builder { private boolean includeBlockquote = false; + private Map additionalMetadata = new HashMap<>(); + private Builder() { } @@ -81,6 +90,27 @@ public Builder withIncludeBlockquote(boolean includeBlockquote) { return this; } + /** + * Adds this additional metadata to all built {@link Document}s. + * @return this builder + */ + public Builder withAdditionalMetadata(String key, Object value) { + Assert.notNull(key, "key must not be null"); + Assert.notNull(value, "value must not be null"); + this.additionalMetadata.put(key, value); + return this; + } + + /** + * Adds this additional metadata to all built {@link Document}s.
+ * @return this builder + */ + public Builder withAdditionalMetadata(Map additionalMetadata) { + Assert.notNull(additionalMetadata, "additionalMetadata must not be null"); + this.additionalMetadata = additionalMetadata; + return this; + } + /** * @return the immutable configuration */ diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java index 83c6c561def..739dbbd709b 100644 --- a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java +++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java @@ -209,4 +209,22 @@ void testLists() { "Aenean eu leo eu nibh tristique posuere quis quis massa. Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus tortor. Etiam facilisis enim in egestas dictum.")); } + @Test + void testWithAdditionalMetadata() { + MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder() + .withAdditionalMetadata("service", "some-service-name") + .withAdditionalMetadata("env", "prod") + .build(); + + MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/simple.md", config); + + List documents = reader.get(); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("service", "some-service-name", "env", "prod")); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit."); + } + }