Skip to content

Commit 50e8615

Browse files
WOONBEspring-builds
authored andcommitted
fix getTextBetweenParagraphs to check for invalid page numbers in outline items
Fix ParagraphPdfDocumentReader to reliably extract text from PDFs with imperfect outlines and coordinate edge cases Test : Add test to validate ParagraphPdfDocumentReader to skip Invalid Outline Fixes #3421 Signed-off-by: WOONBE <[email protected]> (cherry picked from commit d92a2ea)
1 parent 35894f3 commit 50e8615

File tree

3 files changed

+101
-49
lines changed

3 files changed

+101
-49
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2024 the original author or authors.
2+
* Copyright 2023-2025 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -46,6 +46,7 @@
4646
* The paragraphs are grouped into {@link Document} objects.
4747
*
4848
* @author Christian Tzolov
49+
* @author Heonwoo Kim
4950
*/
5051
public class ParagraphPdfDocumentReader implements DocumentReader {
5152

@@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
127128
*/
128129
@Override
129130
public List<Document> get() {
130-
131131
var paragraphs = this.paragraphTextExtractor.flatten();
132-
133-
List<Document> documents = new ArrayList<>(paragraphs.size());
134-
135-
if (!CollectionUtils.isEmpty(paragraphs)) {
136-
logger.info("Start processing paragraphs from PDF");
137-
Iterator<Paragraph> itr = paragraphs.iterator();
138-
139-
var current = itr.next();
140-
141-
if (!itr.hasNext()) {
142-
documents.add(toDocument(current, current));
143-
}
144-
else {
145-
while (itr.hasNext()) {
146-
var next = itr.next();
147-
Document document = toDocument(current, next);
148-
if (document != null && StringUtils.hasText(document.getText())) {
149-
documents.add(toDocument(current, next));
150-
}
151-
current = next;
152-
}
132+
List<Document> documents = new ArrayList<>();
133+
if (CollectionUtils.isEmpty(paragraphs)) {
134+
return documents;
135+
}
136+
logger.info("Start processing paragraphs from PDF");
137+
for (int i = 0; i < paragraphs.size(); i++) {
138+
Paragraph from = paragraphs.get(i);
139+
Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
140+
Document document = toDocument(from, to);
141+
if (document != null && StringUtils.hasText(document.getText())) {
142+
documents.add(document);
153143
}
154144
}
155145
logger.info("End processing paragraphs from PDF");
@@ -173,17 +163,27 @@ protected Document toDocument(Paragraph from, Paragraph to) {
173163
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
174164
document.getMetadata().put(METADATA_TITLE, from.title());
175165
document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
176-
document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
166+
document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber());
177167
document.getMetadata().put(METADATA_LEVEL, from.level());
178168
document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
179169
}
180170

181171
public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {
182172

173+
if (fromParagraph.startPageNumber() < 1) {
174+
logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}",
175+
fromParagraph.title(), fromParagraph.startPageNumber());
176+
return "";
177+
}
178+
183179
// Page started from index 0, while PDFBOx getPage return them from index 1.
184180
int startPage = fromParagraph.startPageNumber() - 1;
185181
int endPage = toParagraph.startPageNumber() - 1;
186182

183+
if (fromParagraph == toParagraph || endPage < startPage) {
184+
endPage = startPage;
185+
}
186+
187187
try {
188188

189189
StringBuilder sb = new StringBuilder();
@@ -194,39 +194,37 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
194194
for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
195195

196196
var page = this.document.getPage(pageNumber);
197+
float pageHeight = page.getMediaBox().getHeight();
197198

198-
int fromPosition = fromParagraph.position();
199-
int toPosition = toParagraph.position();
200-
201-
if (this.config.reversedParagraphPosition) {
202-
fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
203-
toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
204-
}
205-
206-
int x0 = (int) page.getMediaBox().getLowerLeftX();
207-
int xW = (int) page.getMediaBox().getWidth();
199+
int fromPos = fromParagraph.position();
200+
int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;
208201

209-
int y0 = (int) page.getMediaBox().getLowerLeftY();
210-
int yW = (int) page.getMediaBox().getHeight();
202+
int x = (int) page.getMediaBox().getLowerLeftX();
203+
int w = (int) page.getMediaBox().getWidth();
204+
int y, h;
211205

212-
if (pageNumber == startPage) {
213-
y0 = fromPosition;
214-
yW = (int) page.getMediaBox().getHeight() - y0;
206+
if (pageNumber == startPage && pageNumber == endPage) {
207+
y = toPos;
208+
h = fromPos - toPos;
215209
}
216-
if (pageNumber == endPage) {
217-
yW = toPosition - y0;
210+
else if (pageNumber == startPage) {
211+
y = 0;
212+
h = fromPos;
218213
}
219-
220-
if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
221-
yW = yW - this.config.pageBottomMargin;
214+
else if (pageNumber == endPage) {
215+
y = toPos;
216+
h = (int) pageHeight - toPos;
217+
}
218+
else {
219+
y = 0;
220+
h = (int) pageHeight;
222221
}
223222

224-
if (y0 == 0) {
225-
y0 = y0 + this.config.pageTopMargin;
226-
yW = yW - this.config.pageTopMargin;
223+
if (h < 0) {
224+
h = 0;
227225
}
228226

229-
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
227+
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
230228
pdfTextStripper.extractRegions(page);
231229
var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
232230
if (StringUtils.hasText(text)) {

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2024 the original author or authors.
2+
* Copyright 2023-2025 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,15 +16,32 @@
1616

1717
package org.springframework.ai.reader.pdf;
1818

19+
import java.io.ByteArrayOutputStream;
20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.util.List;
23+
24+
import org.apache.pdfbox.Loader;
25+
import org.apache.pdfbox.pdmodel.PDDocument;
26+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
27+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
28+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
1929
import org.junit.jupiter.api.Test;
2030

31+
import org.springframework.ai.document.Document;
2132
import org.springframework.ai.reader.ExtractedTextFormatter;
2233
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
34+
import org.springframework.core.io.ByteArrayResource;
35+
import org.springframework.core.io.ClassPathResource;
36+
import org.springframework.core.io.Resource;
2337

38+
import static org.assertj.core.api.Assertions.assertThat;
2439
import static org.assertj.core.api.Assertions.assertThatThrownBy;
40+
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
2541

2642
/**
2743
* @author Christian Tzolov
44+
* @author Heonwoo Kim
2845
*/
2946
public class ParagraphPdfDocumentReaderTests {
3047

@@ -50,4 +67,41 @@ public void testPdfWithoutToc() {
5067

5168
}
5269

70+
@Test
71+
void shouldSkipInvalidOutline() throws IOException {
72+
73+
Resource basePdfResource = new ClassPathResource("sample3.pdf");
74+
75+
PDDocument documentToModify;
76+
try (InputStream inputStream = basePdfResource.getInputStream()) {
77+
78+
byte[] pdfBytes = inputStream.readAllBytes();
79+
80+
documentToModify = Loader.loadPDF(pdfBytes);
81+
}
82+
PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();
83+
if (outline != null && outline.getFirstChild() != null) {
84+
PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling();
85+
if (chapter2OutlineItem != null) {
86+
87+
chapter2OutlineItem.setDestination((PDDestination) null);
88+
}
89+
}
90+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
91+
documentToModify.save(baos);
92+
documentToModify.close();
93+
94+
Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray());
95+
96+
ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
97+
PdfDocumentReaderConfig.defaultConfig());
98+
99+
List<Document> documents = assertDoesNotThrow(() -> reader.get());
100+
101+
assertThat(documents).isNotNull();
102+
assertThat(documents).hasSize(2);
103+
assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
104+
assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
105+
}
106+
53107
}
Binary file not shown.

0 commit comments

Comments
 (0)