Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2023-2024 the original author or authors.
* Copyright 2023-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -46,6 +46,7 @@
* The paragraphs are grouped into {@link Document} objects.
*
* @author Christian Tzolov
* @author Heonwoo Kim
*/
public class ParagraphPdfDocumentReader implements DocumentReader {

Expand Down Expand Up @@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
*/
@Override
public List<Document> get() {

var paragraphs = this.paragraphTextExtractor.flatten();

List<Document> documents = new ArrayList<>(paragraphs.size());

if (!CollectionUtils.isEmpty(paragraphs)) {
logger.info("Start processing paragraphs from PDF");
Iterator<Paragraph> itr = paragraphs.iterator();

var current = itr.next();

if (!itr.hasNext()) {
documents.add(toDocument(current, current));
}
else {
while (itr.hasNext()) {
var next = itr.next();
Document document = toDocument(current, next);
if (document != null && StringUtils.hasText(document.getText())) {
documents.add(toDocument(current, next));
}
current = next;
}
List<Document> documents = new ArrayList<>();
if (CollectionUtils.isEmpty(paragraphs)) {
return documents;
}
logger.info("Start processing paragraphs from PDF");
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph from = paragraphs.get(i);
Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
Document document = toDocument(from, to);
if (document != null && StringUtils.hasText(document.getText())) {
documents.add(document);
}
}
logger.info("End processing paragraphs from PDF");
Expand All @@ -173,17 +163,27 @@ protected Document toDocument(Paragraph from, Paragraph to) {
/**
 * Populates the metadata of {@code document} with paragraph-level attributes: title,
 * start page, end page, level and the name of the resource file held by this reader.
 * @param from the paragraph that supplies the title, start page and level
 * @param to the paragraph passed alongside {@code from}; read only by the first of the
 * two end-page writes below
 * @param document the document whose metadata is updated in place
 */
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
document.getMetadata().put(METADATA_TITLE, from.title());
document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
// NOTE(review): METADATA_END_PAGE is written twice in a row with different values.
// This looks like the removed/added pair of a diff rendered without +/- markers;
// presumably only the from.endPageNumber() variant is meant to remain. If both
// statements did execute, the second would replace the first (assuming
// getMetadata() returns a regular Map) - confirm against the actual file.
document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber());
document.getMetadata().put(METADATA_LEVEL, from.level());
document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
}

public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {

if (fromParagraph.startPageNumber() < 1) {
logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}",
fromParagraph.title(), fromParagraph.startPageNumber());
return "";
}

// Paragraph page numbers are 1-based, while PDFBox getPage() takes a 0-based index.
int startPage = fromParagraph.startPageNumber() - 1;
int endPage = toParagraph.startPageNumber() - 1;

if (fromParagraph == toParagraph || endPage < startPage) {
endPage = startPage;
}

try {

StringBuilder sb = new StringBuilder();
Expand All @@ -194,39 +194,37 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {

var page = this.document.getPage(pageNumber);
float pageHeight = page.getMediaBox().getHeight();

int fromPosition = fromParagraph.position();
int toPosition = toParagraph.position();

if (this.config.reversedParagraphPosition) {
fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
}

int x0 = (int) page.getMediaBox().getLowerLeftX();
int xW = (int) page.getMediaBox().getWidth();
int fromPos = fromParagraph.position();
int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;

int y0 = (int) page.getMediaBox().getLowerLeftY();
int yW = (int) page.getMediaBox().getHeight();
int x = (int) page.getMediaBox().getLowerLeftX();
int w = (int) page.getMediaBox().getWidth();
int y, h;

if (pageNumber == startPage) {
y0 = fromPosition;
yW = (int) page.getMediaBox().getHeight() - y0;
if (pageNumber == startPage && pageNumber == endPage) {
y = toPos;
h = fromPos - toPos;
}
if (pageNumber == endPage) {
yW = toPosition - y0;
else if (pageNumber == startPage) {
y = 0;
h = fromPos;
}

if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
yW = yW - this.config.pageBottomMargin;
else if (pageNumber == endPage) {
y = toPos;
h = (int) pageHeight - toPos;
}
else {
y = 0;
h = (int) pageHeight;
}

if (y0 == 0) {
y0 = y0 + this.config.pageTopMargin;
yW = yW - this.config.pageTopMargin;
if (h < 0) {
h = 0;
}

pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
pdfTextStripper.extractRegions(page);
var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
if (StringUtils.hasText(text)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2023-2024 the original author or authors.
* Copyright 2023-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,15 +16,32 @@

package org.springframework.ai.reader.pdf;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.junit.jupiter.api.Test;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.ExtractedTextFormatter;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;

/**
* @author Christian Tzolov
* @author Heonwoo Kim
*/
public class ParagraphPdfDocumentReaderTests {

Expand All @@ -50,4 +67,41 @@ public void testPdfWithoutToc() {

}

/**
 * Verifies that an outline entry without a destination does not make the reader fail.
 * The second top-level outline item of {@code sample3.pdf} has its destination removed,
 * the modified PDF is re-serialized in memory, and the reader is expected to return
 * exactly two documents titled "Chapter 1" and "Chapter 3".
 * @throws IOException if the fixture cannot be read or the modified PDF cannot be saved
 */
@Test
void shouldSkipInvalidOutline() throws IOException {

	Resource basePdfResource = new ClassPathResource("sample3.pdf");

	byte[] pdfBytes;
	try (InputStream inputStream = basePdfResource.getInputStream()) {
		pdfBytes = inputStream.readAllBytes();
	}

	ByteArrayOutputStream baos = new ByteArrayOutputStream();

	// try-with-resources guarantees the PDDocument is closed even when the outline
	// manipulation or save() throws.
	try (PDDocument documentToModify = Loader.loadPDF(pdfBytes)) {
		PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();

		// Fail fast with a clear message if the fixture lacks the expected outline,
		// instead of silently leaving the PDF unmodified.
		assertThat(outline).as("sample3.pdf must contain a document outline").isNotNull();

		PDOutlineItem firstOutlineItem = outline.getFirstChild();
		assertThat(firstOutlineItem).as("outline must have a first item").isNotNull();

		PDOutlineItem chapter2OutlineItem = firstOutlineItem.getNextSibling();
		assertThat(chapter2OutlineItem).as("outline must have a second item").isNotNull();

		// The cast selects the PDDestination overload of setDestination.
		chapter2OutlineItem.setDestination((PDDestination) null);

		documentToModify.save(baos);
	}

	Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray());

	ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
			PdfDocumentReaderConfig.defaultConfig());

	List<Document> documents = assertDoesNotThrow(() -> reader.get());

	assertThat(documents).isNotNull();
	assertThat(documents).hasSize(2);
	assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
	assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
}

}
Binary file not shown.