Skip to content

Commit 2a9cd05

Browse files
committed
Fix ParagraphPdfDocumentReader to reliably extract text from PDFs with imperfect outlines and coordinate edge cases
Signed-off-by: WOONBE <[email protected]>
1 parent cc3a246 commit 2a9cd05

File tree

4 files changed

+67
-55
lines changed

4 files changed

+67
-55
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java

Lines changed: 36 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -128,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
128128
*/
129129
@Override
130130
public List<Document> get() {
131-
132131
var paragraphs = this.paragraphTextExtractor.flatten();
133-
134-
List<Document> documents = new ArrayList<>(paragraphs.size());
135-
136-
if (!CollectionUtils.isEmpty(paragraphs)) {
137-
logger.info("Start processing paragraphs from PDF");
138-
Iterator<Paragraph> itr = paragraphs.iterator();
139-
140-
var current = itr.next();
141-
142-
if (!itr.hasNext()) {
143-
documents.add(toDocument(current, current));
144-
}
145-
else {
146-
while (itr.hasNext()) {
147-
var next = itr.next();
148-
Document document = toDocument(current, next);
149-
if (document != null && StringUtils.hasText(document.getText())) {
150-
documents.add(toDocument(current, next));
151-
}
152-
current = next;
153-
}
132+
List<Document> documents = new ArrayList<>();
133+
if (CollectionUtils.isEmpty(paragraphs)) {
134+
return documents;
135+
}
136+
logger.info("Start processing paragraphs from PDF");
137+
for (int i = 0; i < paragraphs.size(); i++) {
138+
Paragraph from = paragraphs.get(i);
139+
Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
140+
Document document = toDocument(from, to);
141+
if (document != null && StringUtils.hasText(document.getText())) {
142+
documents.add(document);
154143
}
155144
}
156145
logger.info("End processing paragraphs from PDF");
@@ -174,7 +163,7 @@ protected Document toDocument(Paragraph from, Paragraph to) {
174163
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
175164
document.getMetadata().put(METADATA_TITLE, from.title());
176165
document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
177-
document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
166+
document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber());
178167
document.getMetadata().put(METADATA_LEVEL, from.level());
179168
document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
180169
}
@@ -192,7 +181,7 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
192181
int startPage = fromParagraph.startPageNumber() - 1;
193182
int endPage = toParagraph.startPageNumber() - 1;
194183

195-
if (endPage < 0) {
184+
if (fromParagraph == toParagraph || endPage < startPage) {
196185
endPage = startPage;
197186
}
198187

@@ -206,39 +195,32 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
206195
for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
207196

208197
var page = this.document.getPage(pageNumber);
209-
210-
int fromPosition = fromParagraph.position();
211-
int toPosition = toParagraph.position();
212-
213-
if (this.config.reversedParagraphPosition) {
214-
fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
215-
toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
198+
float pageHeight = page.getMediaBox().getHeight();
199+
200+
int fromPos = fromParagraph.position();
201+
int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;
202+
203+
int x = (int) page.getMediaBox().getLowerLeftX();
204+
int w = (int) page.getMediaBox().getWidth();
205+
int y, h;
206+
207+
if (pageNumber == startPage && pageNumber == endPage) {
208+
y = toPos;
209+
h = fromPos - toPos;
210+
} else if (pageNumber == startPage) {
211+
y = 0;
212+
h = fromPos;
213+
} else if (pageNumber == endPage) {
214+
y = toPos;
215+
h = (int) pageHeight - toPos;
216+
} else {
217+
y = 0;
218+
h = (int) pageHeight;
216219
}
217220

218-
int x0 = (int) page.getMediaBox().getLowerLeftX();
219-
int xW = (int) page.getMediaBox().getWidth();
220-
221-
int y0 = (int) page.getMediaBox().getLowerLeftY();
222-
int yW = (int) page.getMediaBox().getHeight();
223-
224-
if (pageNumber == startPage) {
225-
y0 = fromPosition;
226-
yW = (int) page.getMediaBox().getHeight() - y0;
227-
}
228-
if (pageNumber == endPage) {
229-
yW = toPosition - y0;
230-
}
231-
232-
if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
233-
yW = yW - this.config.pageBottomMargin;
234-
}
235-
236-
if (y0 == 0) {
237-
y0 = y0 + this.config.pageTopMargin;
238-
yW = yW - this.config.pageTopMargin;
239-
}
221+
if (h < 0) h = 0;
240222

241-
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
223+
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
242224
pdfTextStripper.extractRegions(page);
243225
var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
244226
if (StringUtils.hasText(text)) {

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ protected Paragraph generateParagraphs(Paragraph parentParagraph, PDOutlineNode
117117
if (nextSiblingNumber < 0) {
118118
nextSiblingNumber = getPageNumber(current.getLastChild());
119119
}
120-
121120
var paragraphPosition = (current.getDestination() instanceof PDPageXYZDestination)
122121
? ((PDPageXYZDestination) current.getDestination()).getTop() : 0;
123122

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,43 @@
1616

1717
package org.springframework.ai.reader.pdf;
1818

19+
20+
import org.apache.pdfbox.Loader;
21+
import org.apache.pdfbox.io.RandomAccessRead;
22+
import org.apache.pdfbox.pdmodel.PDDocument;
23+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
24+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
25+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
1926
import org.junit.jupiter.api.Test;
2027

28+
import org.junit.jupiter.api.extension.ExtendWith;
29+
import org.mockito.Mockito;
30+
import org.mockito.Spy;
31+
import org.mockito.junit.jupiter.MockitoExtension;
32+
import org.springframework.ai.document.Document;
2133
import org.springframework.ai.reader.ExtractedTextFormatter;
34+
35+
import org.springframework.ai.reader.pdf.config.ParagraphManager;
2236
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
37+
import org.springframework.core.io.ByteArrayResource;
38+
import org.springframework.core.io.ClassPathResource;
39+
import org.springframework.core.io.Resource;
40+
import org.springframework.test.util.ReflectionTestUtils;
41+
42+
import java.io.ByteArrayOutputStream;
43+
import java.io.IOException;
44+
import java.io.InputStream;
45+
import java.util.List;
2346

47+
import static org.assertj.core.api.Assertions.assertThat;
2448
import static org.assertj.core.api.Assertions.assertThatThrownBy;
49+
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
50+
import static org.mockito.Mockito.doReturn;
51+
import static org.mockito.Mockito.when;
2552

2653
/**
2754
* @author Christian Tzolov
55+
* @author Heonwoo Kim
2856
*/
2957
public class ParagraphPdfDocumentReaderTests {
3058

@@ -50,4 +78,7 @@ public void testPdfWithoutToc() {
5078

5179
}
5280

81+
82+
83+
5384
}
1.26 KB
Binary file not shown.

0 commit comments

Comments
 (0)