Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions document-readers/markdown-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for the Markdown DocumentReader: spring-ai-core plus the commonmark parser. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.springframework.ai</groupId>
		<artifactId>spring-ai</artifactId>
		<version>1.0.0-SNAPSHOT</version>
		<relativePath>../../pom.xml</relativePath>
	</parent>
	<artifactId>spring-ai-markdown-document-reader</artifactId>
	<packaging>jar</packaging>
	<name>Spring AI Document Reader - Markdown</name>
	<description>Spring AI Markdown document reader</description>
	<url>https://github.com/spring-projects/spring-ai</url>

	<scm>
		<url>https://github.com/spring-projects/spring-ai</url>
		<connection>git://github.com/spring-projects/spring-ai.git</connection>
		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
	</scm>

	<dependencies>
		<dependency>
			<groupId>org.springframework.ai</groupId>
			<artifactId>spring-ai-core</artifactId>
			<!-- ${parent.version} is a deprecated expression; use the project-prefixed form -->
			<version>${project.parent.version}</version>
		</dependency>

		<dependency>
			<groupId>org.commonmark</groupId>
			<artifactId>commonmark</artifactId>
			<!-- commonmark.version is not defined in this file; presumably set in the parent POM -->
			<version>${commonmark.version}</version>
		</dependency>

		<!-- TESTING -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>

	</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
package org.springframework.ai.reader.markdown;

import org.commonmark.node.*;
import org.commonmark.parser.Parser;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;

import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
* horizontal lines (depending on the
* {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
* {@link Document}s.
*
* @author Piotr Olaszewski
*/
public class MarkdownDocumentReader implements DocumentReader {

/**
* The resource points to the Markdown document.
*/
private final Resource markdownResource;

/**
* Configuration to a parsing process.
*/
private final MarkdownDocumentReaderConfig config;

/**
* Markdown parser.
*/
private final Parser parser;

public MarkdownDocumentReader(String markdownResource) {
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
}

public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
this(new DefaultResourceLoader().getResource(markdownResource), config);
}

public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
this.markdownResource = markdownResource;
this.config = config;
this.parser = Parser.builder().build();
}

/**
* Extracts and returns a list of documents from the resource.
* @return List of extracted {@link Document}
*/
@Override
public List<Document> get() {
try (var input = markdownResource.getInputStream()) {
Node node = parser.parseReader(new InputStreamReader(input));

DocumentVisitor documentVisitor = new DocumentVisitor(config);
node.accept(documentVisitor);

return documentVisitor.getDocuments();
}
catch (IOException e) {
throw new RuntimeException(e);
}
}

/**
* A convenient class for visiting handled nodes in the Markdown document.
*/
static class DocumentVisitor extends AbstractVisitor {

private final List<Document> documents = new ArrayList<>();

private final List<String> currentParagraphs = new ArrayList<>();

private final MarkdownDocumentReaderConfig config;

private Document.Builder currentDocumentBuilder;

public DocumentVisitor(MarkdownDocumentReaderConfig config) {
this.config = config;
}

@Override
public void visit(org.commonmark.node.Document document) {
currentDocumentBuilder = Document.builder();
super.visit(document);
}

@Override
public void visit(Heading heading) {
buildAndFlush();
super.visit(heading);
}

@Override
public void visit(ThematicBreak thematicBreak) {
if (config.horizontalRuleCreateDocument) {
buildAndFlush();
}
super.visit(thematicBreak);
}

@Override
public void visit(SoftLineBreak softLineBreak) {
translateLineBreakToSpace();
super.visit(softLineBreak);
}

@Override
public void visit(HardLineBreak hardLineBreak) {
translateLineBreakToSpace();
super.visit(hardLineBreak);
}

@Override
public void visit(ListItem listItem) {
translateLineBreakToSpace();
super.visit(listItem);
}

@Override
public void visit(BlockQuote blockQuote) {
if (!config.includeBlockquote) {
buildAndFlush();
}

translateLineBreakToSpace();
currentDocumentBuilder.withMetadata("category", "blockquote");
super.visit(blockQuote);
}

@Override
public void visit(Code code) {
currentParagraphs.add(code.getLiteral());
currentDocumentBuilder.withMetadata("category", "code_inline");
super.visit(code);
}

@Override
public void visit(FencedCodeBlock fencedCodeBlock) {
if (!config.includeCodeBlock) {
buildAndFlush();
}

translateLineBreakToSpace();
currentParagraphs.add(fencedCodeBlock.getLiteral());
currentDocumentBuilder.withMetadata("category", "code_block");
currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());

buildAndFlush();

super.visit(fencedCodeBlock);
}

@Override
public void visit(Text text) {
if (text.getParent() instanceof Heading heading) {
currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
.withMetadata("title", text.getLiteral());
}
else {
currentParagraphs.add(text.getLiteral());
}

super.visit(text);
}

public List<Document> getDocuments() {
buildAndFlush();

return documents;
}

private void buildAndFlush() {
if (!currentParagraphs.isEmpty()) {
String content = String.join("", currentParagraphs);

Document.Builder builder = currentDocumentBuilder.withContent(content);

config.additionalMetadata.forEach(builder::withMetadata);

Document document = builder.build();

documents.add(document);

currentParagraphs.clear();
}
currentDocumentBuilder = Document.builder();
}

private void translateLineBreakToSpace() {
if (!currentParagraphs.isEmpty()) {
currentParagraphs.add(" ");
}
}

}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package org.springframework.ai.reader.markdown.config;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.util.Assert;

import java.util.HashMap;
import java.util.Map;

/**
 * Common configuration for the {@link MarkdownDocumentReader}.
 *
 * @author Piotr Olaszewski
 */
public class MarkdownDocumentReaderConfig {

	/**
	 * Whether text divided by horizontal lines creates new {@link Document}s.
	 */
	public final boolean horizontalRuleCreateDocument;

	/**
	 * Whether code blocks are included in the surrounding paragraph {@link Document}
	 * instead of being placed in a separate one.
	 */
	public final boolean includeCodeBlock;

	/**
	 * Whether blockquotes are included in the surrounding paragraph {@link Document}
	 * instead of being placed in a separate one.
	 */
	public final boolean includeBlockquote;

	/**
	 * Metadata added to every built {@link Document}.
	 */
	public final Map<String, Object> additionalMetadata;

	/**
	 * Creates a configuration from the current state of the given builder.
	 * @param builder the builder holding the settings
	 */
	public MarkdownDocumentReaderConfig(Builder builder) {
		horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
		includeCodeBlock = builder.includeCodeBlock;
		includeBlockquote = builder.includeBlockquote;
		// Copy the map so that later changes made through the builder do not leak into
		// this configuration.
		additionalMetadata = new HashMap<>(builder.additionalMetadata);
	}

	/**
	 * @return the default configuration
	 */
	public static MarkdownDocumentReaderConfig defaultConfig() {
		return builder().build();
	}

	/**
	 * @return a new builder with all flags set to {@code false} and no additional
	 * metadata
	 */
	public static Builder builder() {
		return new Builder();
	}

	public static class Builder {

		private boolean horizontalRuleCreateDocument = false;

		private boolean includeCodeBlock = false;

		private boolean includeBlockquote = false;

		private Map<String, Object> additionalMetadata = new HashMap<>();

		private Builder() {
		}

		/**
		 * Text divided by horizontal lines will create new {@link Document}s. The default
		 * is {@code false}, meaning text separated by horizontal lines won't create a new
		 * document.
		 * @param horizontalRuleCreateDocument flag to determine whether new documents are
		 * created from text divided by horizontal line
		 * @return this builder
		 */
		public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) {
			this.horizontalRuleCreateDocument = horizontalRuleCreateDocument;
			return this;
		}

		/**
		 * Whether to include code blocks in {@link Document}s. The default is
		 * {@code false}, which means all code blocks are in separate documents.
		 * @param includeCodeBlock flag to include code block into paragraph document or
		 * create new with code only
		 * @return this builder
		 */
		public Builder withIncludeCodeBlock(boolean includeCodeBlock) {
			this.includeCodeBlock = includeCodeBlock;
			return this;
		}

		/**
		 * Whether to include blockquotes in {@link Document}s. The default is
		 * {@code false}, which means all blockquotes are in separate documents.
		 * @param includeBlockquote flag to include blockquotes into paragraph document or
		 * create new with blockquote only
		 * @return this builder
		 */
		public Builder withIncludeBlockquote(boolean includeBlockquote) {
			this.includeBlockquote = includeBlockquote;
			return this;
		}

		/**
		 * Adds a single metadata entry that will be attached to all built
		 * {@link Document}s.
		 * @param key the metadata key, must not be {@code null}
		 * @param value the metadata value, must not be {@code null}
		 * @return this builder
		 * @throws IllegalArgumentException if {@code key} or {@code value} is
		 * {@code null}
		 */
		public Builder withAdditionalMetadata(String key, Object value) {
			Assert.notNull(key, "key must not be null");
			Assert.notNull(value, "value must not be null");
			this.additionalMetadata.put(key, value);
			return this;
		}

		/**
		 * Replaces the metadata collected so far with the entries of the given map; they
		 * will be attached to all built {@link Document}s.
		 * @param additionalMetadata the metadata entries, must not be {@code null}
		 * @return this builder
		 * @throws IllegalArgumentException if {@code additionalMetadata} is {@code null}
		 */
		public Builder withAdditionalMetadata(Map<String, Object> additionalMetadata) {
			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
			// Copy instead of keeping the caller's map: the argument may be unmodifiable
			// (e.g. Map.of(...)), which would make a later
			// withAdditionalMetadata(key, value) fail, and the caller's map must not be
			// modified by this builder.
			this.additionalMetadata = new HashMap<>(additionalMetadata);
			return this;
		}

		/**
		 * @return a configuration holding a snapshot of this builder's current settings
		 */
		public MarkdownDocumentReaderConfig build() {
			return new MarkdownDocumentReaderConfig(this);
		}

	}

}
Loading