Skip to content

Commit 9bdc49b

Browse files
committed
feat: markdown reader supports reading folders and multiple files
Signed-off-by: yuluo-yx <[email protected]>
1 parent 5864255 commit 9bdc49b

File tree

6 files changed

+239
-18
lines changed

6 files changed

+239
-18
lines changed

document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java

Lines changed: 135 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,15 @@
1616

1717
package org.springframework.ai.reader.markdown;
1818

19+
import java.io.File;
20+
import java.io.FileNotFoundException;
1921
import java.io.IOException;
22+
import java.io.InputStream;
2023
import java.io.InputStreamReader;
24+
import java.net.URL;
2125
import java.util.ArrayList;
2226
import java.util.List;
27+
import java.util.stream.Collectors;
2328

2429
import org.commonmark.node.AbstractVisitor;
2530
import org.commonmark.node.BlockQuote;
@@ -33,27 +38,37 @@
3338
import org.commonmark.node.Text;
3439
import org.commonmark.node.ThematicBreak;
3540
import org.commonmark.parser.Parser;
41+
import org.slf4j.Logger;
42+
import org.slf4j.LoggerFactory;
3643

3744
import org.springframework.ai.document.Document;
3845
import org.springframework.ai.document.DocumentReader;
3946
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
4047
import org.springframework.core.io.DefaultResourceLoader;
4148
import org.springframework.core.io.Resource;
49+
import org.springframework.util.Assert;
4250

4351
/**
4452
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
4553
* horizontal lines (depending on the
4654
* {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
4755
* {@link Document}s.
56+
* Currently, only Markdown resource files in the ClassPath path are supported,
57+
* and Markdown files can be read in the way of directory path configuration.
58+
* See {@link org.springframework.ai.reader.markdown.MarkdownDocumentReaderTest#testDirPathSingle()}
59+
* {@link org.springframework.ai.reader.markdown.MarkdownDocumentReaderTest#testMultipleMarkdownFiles()}
4860
*
4961
* @author Piotr Olaszewski
62+
* @author shown.Ji
5063
*/
5164
public class MarkdownDocumentReader implements DocumentReader {
5265

66+
private final static Logger logger = LoggerFactory.getLogger(MarkdownDocumentReader.class);
67+
5368
/**
5469
* The resources pointing to the Markdown documents.
5570
*/
56-
private final Resource markdownResource;
71+
private final List<Resource> markdownResources;
5772

5873
/**
5974
* Configuration to a parsing process.
@@ -67,27 +82,56 @@ public class MarkdownDocumentReader implements DocumentReader {
6782

6883
/**
6984
* Create a new {@link MarkdownDocumentReader} instance.
70-
* @param markdownResource the resource to read
85+
* @param markdownResourcePath the markdown file resource path to read
7186
*/
72-
public MarkdownDocumentReader(String markdownResource) {
73-
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
87+
public MarkdownDocumentReader(String markdownResourcePath) {
88+
this(loadResources(loadResourcePaths(markdownResourcePath)), MarkdownDocumentReaderConfig.defaultConfig());
7489
}
7590

7691
/**
7792
* Create a new {@link MarkdownDocumentReader} instance.
78-
* @param markdownResource the resource to read
93+
* @param markdownResourcePath the resource path
7994
* @param config the configuration to use
8095
*/
81-
public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
82-
this(new DefaultResourceLoader().getResource(markdownResource), config);
96+
public MarkdownDocumentReader(String markdownResourcePath, MarkdownDocumentReaderConfig config) {
97+
this(loadResources(loadResourcePaths(markdownResourcePath)), config);
8398
}
8499

85100
/**
86101
* Create a new {@link MarkdownDocumentReader} instance.
87-
* @param markdownResource the resource to read
102+
* @param markdownResourcePaths the resources paths to read
103+
*/
104+
public MarkdownDocumentReader(List<String> markdownResourcePaths) {
105+
this(loadResources(markdownResourcePaths), MarkdownDocumentReaderConfig.defaultConfig());
106+
}
107+
108+
/**
109+
* Create a new {@link MarkdownDocumentReader} instance.
110+
* @param markdownResource the markdown file resource to read
111+
*/
112+
public MarkdownDocumentReader(Resource markdownResource) {
113+
this(markdownResource, MarkdownDocumentReaderConfig.defaultConfig());
114+
}
115+
116+
/**
117+
* Create a new {@link MarkdownDocumentReader} instance.
118+
* @param markdownResource the markdown file resource to read
119+
* @param config the configuration to use
88120
*/
89121
public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
90-
this.markdownResource = markdownResource;
122+
this(List.of(markdownResource), config);
123+
}
124+
125+
/**
126+
* Create a new {@link MarkdownDocumentReader} instance.
127+
* @param markdownResource the markdown file resources to read
128+
* @param config the configuration to use
129+
*/
130+
public MarkdownDocumentReader(List<Resource> markdownResource, MarkdownDocumentReaderConfig config) {
131+
132+
Assert.notEmpty(markdownResource, "Markdown resource must not be empty");
133+
134+
this.markdownResources = markdownResource;
91135
this.config = config;
92136
this.parser = Parser.builder().build();
93137
}
@@ -98,17 +142,34 @@ public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderC
98142
*/
99143
@Override
100144
public List<Document> get() {
101-
try (var input = this.markdownResource.getInputStream()) {
102-
Node node = this.parser.parseReader(new InputStreamReader(input));
103145

104-
DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
105-
node.accept(documentVisitor);
146+
return this.markdownResources.stream()
147+
.flatMap(markdownResource -> getDocuments(markdownResource).stream())
148+
.collect(Collectors.toList());
149+
}
106150

107-
return documentVisitor.getDocuments();
108-
}
109-
catch (IOException e) {
110-
throw new RuntimeException(e);
151+
private List<Document> getDocuments(Resource markdownResource) {
152+
153+
List<Document> documents;
154+
try {
155+
if (markdownResource.isFile() && !markdownResource.exists()) {
156+
throw new FileNotFoundException("Resource does not exist: " + markdownResource.getFilename());
157+
}
158+
159+
logger.debug("Attempting to read resource: " + markdownResource.getDescription());
160+
try (InputStream input = markdownResource.getInputStream()) {
161+
Node node = this.parser.parseReader(new InputStreamReader(input));
162+
163+
DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
164+
node.accept(documentVisitor);
165+
166+
documents = documentVisitor.getDocuments();
167+
}
168+
} catch (IOException e) {
169+
logger.error("Error reading markdown resource: " + e.getMessage(), e);
170+
throw new RuntimeException("Error reading markdown resource", e);
111171
}
172+
return documents;
112173
}
113174

114175
/**
@@ -207,7 +268,7 @@ public void visit(FencedCodeBlock fencedCodeBlock) {
207268
public void visit(Text text) {
208269
if (text.getParent() instanceof Heading heading) {
209270
this.currentDocumentBuilder.metadata("category", "header_%d".formatted(heading.getLevel()))
210-
.metadata("title", text.getLiteral());
271+
.metadata("title", text.getLiteral());
211272
}
212273
else {
213274
this.currentParagraphs.add(text.getLiteral());
@@ -247,4 +308,60 @@ private void translateLineBreakToSpace() {
247308

248309
}
249310

311+
/**
312+
* Load resources from the given paths.
313+
* @param markdownResourcePaths the resource paths to load
314+
* @return a list of Resources
315+
*/
316+
private static List<Resource> loadResources(List<String> markdownResourcePaths) {
317+
318+
DefaultResourceLoader resourceLoader = new DefaultResourceLoader();
319+
320+
return markdownResourcePaths.stream()
321+
.map(resourceLoader::getResource)
322+
.collect(Collectors.toList());
323+
}
324+
325+
/**
326+
* Load resource paths from the given path.
327+
* @param resourcePath markdown resource path
328+
* @return a list of resource paths
329+
*/
330+
private static List<String> loadResourcePaths(String resourcePath) {
331+
List<String> resources = new ArrayList<>();
332+
333+
if (resourcePath.startsWith("classpath:")) {
334+
String path = resourcePath.replace("classpath:", "");
335+
URL resourceURL = MarkdownDocumentReader.class.getResource(path);
336+
337+
if (resourceURL != null) {
338+
File file = new File(resourceURL.getFile());
339+
if (file.isDirectory()) {
340+
File[] files = file.listFiles((dir, name) -> name.endsWith(".md"));
341+
if (files != null) {
342+
for (File mdFile : files) {
343+
resources.add("classpath:" + mdFile.getName());
344+
}
345+
}
346+
} else if (file.exists() && file.getName().endsWith(".md")) {
347+
resources.add(resourcePath);
348+
}
349+
}
350+
} else {
351+
File file = new File(resourcePath);
352+
if (file.exists() && file.isDirectory()) {
353+
File[] files = file.listFiles((dir, name) -> name.endsWith(".md"));
354+
if (files != null) {
355+
for (File mdFile : files) {
356+
resources.add(mdFile.getAbsolutePath());
357+
}
358+
}
359+
} else if (file.exists() && file.getName().endsWith(".md")) {
360+
resources.add(file.getAbsolutePath());
361+
}
362+
}
363+
364+
return resources;
365+
}
366+
250367
}

document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,68 @@
2929

3030
/**
3131
* @author Piotr Olaszewski
32+
* @author shown.Ji
3233
*/
3334
class MarkdownDocumentReaderTest {
3435

36+
@Test
37+
void testDirPathSingle() {
38+
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-1");
39+
40+
List<Document> documents = reader.get();
41+
42+
assertThat(documents).hasSize(2)
43+
.extracting(Document::getMetadata, Document::getText)
44+
.containsOnly(tuple(Map.of(),
45+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
46+
tuple(Map.of("category", "blockquote"),
47+
"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
48+
}
49+
50+
@Test
51+
void testDirPathMultiple() {
52+
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/dir-test-2");
53+
List<Document> documents = reader.get();
54+
55+
assertThat(documents).hasSize(6)
56+
.extracting(Document::getMetadata, Document::getText)
57+
.containsOnly(
58+
tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
59+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."),
60+
tuple(Map.of("category", "header_3", "title", "Header 3"),
61+
"Aenean eu leo eu nibh tristique posuere quis quis massa."),
62+
tuple(Map.of("category", "header_1", "title", "Header 1a"),
63+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
64+
tuple(Map.of("category", "header_1", "title", "Header 1b"),
65+
"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."),
66+
tuple(Map.of("category", "header_2", "title", "Header 2b"),
67+
"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."),
68+
tuple(Map.of("category", "header_2", "title", "Header 2c"),
69+
"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
70+
}
71+
72+
@Test
73+
void testMultipleMarkdownFiles() {
74+
MarkdownDocumentReader reader = new MarkdownDocumentReader(List.of("classpath:/only-headers.md", "classpath:/with-formatting.md"));
75+
List<Document> documents = reader.get();
76+
77+
assertThat(documents).hasSize(6)
78+
.extracting(Document::getMetadata, Document::getText)
79+
.containsOnly(
80+
tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
81+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."),
82+
tuple(Map.of("category", "header_3", "title", "Header 3"),
83+
"Aenean eu leo eu nibh tristique posuere quis quis massa."),
84+
tuple(Map.of("category", "header_1", "title", "Header 1a"),
85+
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
86+
tuple(Map.of("category", "header_1", "title", "Header 1b"),
87+
"Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."),
88+
tuple(Map.of("category", "header_2", "title", "Header 2b"),
89+
"Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."),
90+
tuple(Map.of("category", "header_2", "title", "Header 2c"),
91+
"Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
92+
}
93+
3594
@Test
3695
void testOnlyHeadersWithParagraphs() {
3796
MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md");
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
2+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
3+
4+
> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
5+
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
6+
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
7+
> suscipit.
8+
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
2+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
3+
4+
> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
5+
> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
6+
> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
7+
> suscipit.
8+
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Header 1a
2+
3+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
4+
nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
5+
6+
# Header 1b
7+
8+
Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed
9+
sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.
10+
11+
## Header 2b
12+
13+
Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien
14+
odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.
15+
16+
# Header 1c
17+
18+
## Header 2c
19+
20+
Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# This is a fancy header name
2+
3+
Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan
4+
tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum***
5+
dignissim.
6+
7+
### Header 3
8+
9+
Aenean eu leo eu nibh tristique _posuere quis quis massa_.

0 commit comments

Comments
 (0)