|
| 1 | +/* |
| 2 | + * Copyright 2025-2025 the original author or authors. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * https://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package org.springframework.ai.reader.jsoup; |
| 18 | + |
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
| 36 | + |
| 37 | +/** |
| 38 | + * Reads HTML documents and extracts text content using JSoup. |
| 39 | + * |
| 40 | + * This reader provides options for selecting specific HTML elements to extract, handling |
| 41 | + * links, and extracting metadata. It leverages the JSoup library for parsing HTML. |
| 42 | + * |
| 43 | + * @see <a href="https://jsoup.org/">JSoup Website</a> |
| 44 | + * @author Alexandros Pappas |
| 45 | + */ |
| 46 | +public class JsoupDocumentReader implements DocumentReader { |
| 47 | + |
| 48 | + private final Resource htmlResource; |
| 49 | + |
| 50 | + private final JsoupDocumentReaderConfig config; |
| 51 | + |
| 52 | + public JsoupDocumentReader(String htmlResource) { |
| 53 | + this(new DefaultResourceLoader().getResource(htmlResource)); |
| 54 | + } |
| 55 | + |
| 56 | + public JsoupDocumentReader(Resource htmlResource) { |
| 57 | + this(htmlResource, JsoupDocumentReaderConfig.defaultConfig()); |
| 58 | + } |
| 59 | + |
| 60 | + public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) { |
| 61 | + this(new DefaultResourceLoader().getResource(htmlResource), config); |
| 62 | + } |
| 63 | + |
| 64 | + public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) { |
| 65 | + this.htmlResource = htmlResource; |
| 66 | + this.config = config; |
| 67 | + } |
| 68 | + |
| 69 | + @Override |
| 70 | + public List<Document> get() { |
| 71 | + try (InputStream inputStream = htmlResource.getInputStream()) { |
| 72 | + org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, ""); |
| 73 | + |
| 74 | + List<Document> documents = new ArrayList<>(); |
| 75 | + |
| 76 | + if (this.config.allElements) { |
| 77 | + // Extract text from all elements and create a single document |
| 78 | + String allText = doc.body().text(); // .body to exclude head |
| 79 | + Document document = new Document(allText); |
| 80 | + addMetadata(doc, document); |
| 81 | + documents.add(document); |
| 82 | + } |
| 83 | + else if (this.config.groupByElement) { |
| 84 | + // Extract text on a per-element base using the defined selector. |
| 85 | + Elements selectedElements = doc.select(this.config.selector); |
| 86 | + for (Element element : selectedElements) { |
| 87 | + String elementText = element.text(); |
| 88 | + Document document = new Document(elementText); |
| 89 | + addMetadata(doc, document); |
| 90 | + // Do not add metadata from element to avoid duplication. |
| 91 | + documents.add(document); |
| 92 | + } |
| 93 | + } |
| 94 | + else { |
| 95 | + // Extract text from specific elements based on the selector |
| 96 | + Elements elements = doc.select(this.config.selector); |
| 97 | + String text = elements.stream().map(Element::text).collect(Collectors.joining(this.config.separator)); |
| 98 | + Document document = new Document(text); |
| 99 | + addMetadata(doc, document); |
| 100 | + documents.add(document); |
| 101 | + } |
| 102 | + |
| 103 | + return documents; |
| 104 | + |
| 105 | + } |
| 106 | + catch (IOException e) { |
| 107 | + throw new RuntimeException("Failed to read HTML resource: " + htmlResource, e); |
| 108 | + } |
| 109 | + } |
| 110 | + |
| 111 | + private void addMetadata(org.jsoup.nodes.Document jsoupDoc, Document springDoc) { |
| 112 | + Map<String, Object> metadata = new HashMap<>(); |
| 113 | + metadata.put("title", jsoupDoc.title()); |
| 114 | + |
| 115 | + for (String metaTag : this.config.metadataTags) { |
| 116 | + String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content"); |
| 117 | + if (!value.isEmpty()) { |
| 118 | + metadata.put(metaTag, value); |
| 119 | + } |
| 120 | + } |
| 121 | + |
| 122 | + if (this.config.includeLinkUrls) { |
| 123 | + Elements links = jsoupDoc.select("a[href]"); |
| 124 | + List<String> linkUrls = links.stream().map(link -> link.attr("abs:href")).toList(); |
| 125 | + metadata.put("linkUrls", linkUrls); |
| 126 | + } |
| 127 | + |
| 128 | + // Use putAll to add all entries from additionalMetadata |
| 129 | + metadata.putAll(this.config.additionalMetadata); |
| 130 | + |
| 131 | + // Add all collected metadata to the Spring Document |
| 132 | + springDoc.getMetadata().putAll(metadata); |
| 133 | + } |
| 134 | + |
| 135 | +} |
0 commit comments