Skip to content

Commit 2879e6c

Browse files
committed
feat: add JSoup HTML document reader
This commit introduces the `JsoupDocumentReader` and `JsoupDocumentReaderConfig` classes, which provide functionality to read and parse HTML documents using the JSoup library. The reader supports: - Extracting text from specific HTML elements using CSS selectors. - Extracting all text from the body of the document. - Grouping text by element. - Extracting metadata, including the document title, meta tags, and link URLs. - Reading from various resource types (files, URLs, byte arrays). - Configurable character encoding, selector, separator, and metadata extraction. This new reader enhances Spring AI's ability to process web content and other HTML-based data sources.
1 parent c623264 commit 2879e6c

File tree

11 files changed

+759
-0
lines changed

11 files changed

+759
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Spring AI JSoup Document Reader
2+
3+
This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.
4+
5+
## Features
6+
7+
* **Flexible Text Extraction:**
8+
* Extract all text from the `<body>` of an HTML document.
9+
* Extract text from specific elements using CSS selectors.
10+
* Group text by element, creating a separate document for each selected element.
11+
* Combine text from multiple selected elements using a configurable separator.
12+
* **Metadata Extraction:**
13+
* Extract the document title.
14+
* Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
15+
* Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
16+
* **Configurable:**
17+
* Specify the character encoding (defaults to UTF-8).
18+
* Customize the CSS selector for element selection.
19+
* Configure the separator string for joining text from multiple elements.
20+
* Choose whether to extract all text or use element-based extraction.
21+
* Enable/disable link URL extraction.
22+
* Add additional metadata using configuration.
23+
* **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.
24+
25+
---
26+
27+
#### How to Build:
28+
```bash
29+
./mvnw -pl document-readers/jsoup-reader clean install
30+
```
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
~ Copyright 2025-2025 the original author or authors.
4+
~
5+
~ Licensed under the Apache License, Version 2.0 (the "License");
6+
~ you may not use this file except in compliance with the License.
7+
~ You may obtain a copy of the License at
8+
~
9+
~ https://www.apache.org/licenses/LICENSE-2.0
10+
~
11+
~ Unless required by applicable law or agreed to in writing, software
12+
~ distributed under the License is distributed on an "AS IS" BASIS,
13+
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
~ See the License for the specific language governing permissions and
15+
~ limitations under the License.
16+
-->
17+
18+
<project xmlns="http://maven.apache.org/POM/4.0.0"
19+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
20+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
21+
<modelVersion>4.0.0</modelVersion>
22+
<parent>
23+
<groupId>org.springframework.ai</groupId>
24+
<artifactId>spring-ai</artifactId>
25+
<version>1.0.0-SNAPSHOT</version>
26+
<relativePath>../../pom.xml</relativePath>
27+
</parent>
28+
29+
<artifactId>spring-ai-jsoup-document-reader</artifactId>
30+
<packaging>jar</packaging>
31+
<name>Spring AI Document Reader - HTML</name>
32+
<description>Spring AI HTML document reader</description>
33+
<url>https://github.com/spring-projects/spring-ai</url>
34+
35+
<scm>
36+
<url>https://github.com/spring-projects/spring-ai</url>
37+
<connection>git://github.com/spring-projects/spring-ai.git</connection>
38+
<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
39+
</scm>
40+
41+
<dependencies>
42+
<dependency>
43+
<groupId>org.springframework.ai</groupId>
44+
<artifactId>spring-ai-core</artifactId>
45+
<version>${project.parent.version}</version>
46+
</dependency>
47+
48+
<dependency>
49+
<groupId>org.jsoup</groupId>
50+
<artifactId>jsoup</artifactId>
51+
<version>1.18.3</version>
52+
</dependency>
53+
54+
<!-- TESTING -->
55+
<dependency>
56+
<groupId>org.springframework.boot</groupId>
57+
<artifactId>spring-boot-starter-test</artifactId>
58+
<scope>test</scope>
59+
</dependency>
60+
61+
</dependencies>
62+
63+
</project>
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
* Copyright 2025-2025 the original author or authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* https://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package org.springframework.ai.reader.jsoup;
18+
19+
import java.io.IOException;
20+
import java.io.InputStream;
21+
import java.util.ArrayList;
22+
import java.util.HashMap;
23+
import java.util.List;
24+
import java.util.Map;
25+
import java.util.stream.Collectors;
26+
27+
import org.jsoup.Jsoup;
28+
import org.jsoup.nodes.Element;
29+
import org.jsoup.select.Elements;
30+
31+
import org.springframework.ai.document.Document;
32+
import org.springframework.ai.document.DocumentReader;
33+
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
34+
import org.springframework.core.io.DefaultResourceLoader;
35+
import org.springframework.core.io.Resource;
36+
37+
/**
38+
* Reads HTML documents and extracts text content using JSoup.
39+
*
40+
* This reader provides options for selecting specific HTML elements to extract, handling
41+
* links, and extracting metadata. It leverages the JSoup library for parsing HTML.
42+
*
43+
* @see <a href="https://jsoup.org/">JSoup Website</a>
44+
* @author Alexandros Pappas
45+
*/
46+
public class JsoupDocumentReader implements DocumentReader {
47+
48+
private final Resource htmlResource;
49+
50+
private final JsoupDocumentReaderConfig config;
51+
52+
public JsoupDocumentReader(String htmlResource) {
53+
this(new DefaultResourceLoader().getResource(htmlResource));
54+
}
55+
56+
public JsoupDocumentReader(Resource htmlResource) {
57+
this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
58+
}
59+
60+
public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
61+
this(new DefaultResourceLoader().getResource(htmlResource), config);
62+
}
63+
64+
public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
65+
this.htmlResource = htmlResource;
66+
this.config = config;
67+
}
68+
69+
@Override
70+
public List<Document> get() {
71+
try (InputStream inputStream = htmlResource.getInputStream()) {
72+
org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");
73+
74+
List<Document> documents = new ArrayList<>();
75+
76+
if (this.config.allElements) {
77+
// Extract text from all elements and create a single document
78+
String allText = doc.body().text(); // .body to exclude head
79+
Document document = new Document(allText);
80+
addMetadata(doc, document);
81+
documents.add(document);
82+
}
83+
else if (this.config.groupByElement) {
84+
// Extract text on a per-element base using the defined selector.
85+
Elements selectedElements = doc.select(this.config.selector);
86+
for (Element element : selectedElements) {
87+
String elementText = element.text();
88+
Document document = new Document(elementText);
89+
addMetadata(doc, document);
90+
// Do not add metadata from element to avoid duplication.
91+
documents.add(document);
92+
}
93+
}
94+
else {
95+
// Extract text from specific elements based on the selector
96+
Elements elements = doc.select(this.config.selector);
97+
String text = elements.stream().map(Element::text).collect(Collectors.joining(this.config.separator));
98+
Document document = new Document(text);
99+
addMetadata(doc, document);
100+
documents.add(document);
101+
}
102+
103+
return documents;
104+
105+
}
106+
catch (IOException e) {
107+
throw new RuntimeException("Failed to read HTML resource: " + htmlResource, e);
108+
}
109+
}
110+
111+
private void addMetadata(org.jsoup.nodes.Document jsoupDoc, Document springDoc) {
112+
Map<String, Object> metadata = new HashMap<>();
113+
metadata.put("title", jsoupDoc.title());
114+
115+
for (String metaTag : this.config.metadataTags) {
116+
String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content");
117+
if (!value.isEmpty()) {
118+
metadata.put(metaTag, value);
119+
}
120+
}
121+
122+
if (this.config.includeLinkUrls) {
123+
Elements links = jsoupDoc.select("a[href]");
124+
List<String> linkUrls = links.stream().map(link -> link.attr("abs:href")).toList();
125+
metadata.put("linkUrls", linkUrls);
126+
}
127+
128+
// Use putAll to add all entries from additionalMetadata
129+
metadata.putAll(this.config.additionalMetadata);
130+
131+
// Add all collected metadata to the Spring Document
132+
springDoc.getMetadata().putAll(metadata);
133+
}
134+
135+
}

0 commit comments

Comments
 (0)