Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
* The paragraphs are grouped into {@link Document} objects.
*
* @author Christian Tzolov
* @author Heonwoo Kim
*/
public class ParagraphPdfDocumentReader implements DocumentReader {

Expand Down Expand Up @@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
*/
@Override
public List<Document> get() {

var paragraphs = this.paragraphTextExtractor.flatten();

List<Document> documents = new ArrayList<>(paragraphs.size());

if (!CollectionUtils.isEmpty(paragraphs)) {
logger.info("Start processing paragraphs from PDF");
Iterator<Paragraph> itr = paragraphs.iterator();

var current = itr.next();

if (!itr.hasNext()) {
documents.add(toDocument(current, current));
}
else {
while (itr.hasNext()) {
var next = itr.next();
Document document = toDocument(current, next);
if (document != null && StringUtils.hasText(document.getText())) {
documents.add(toDocument(current, next));
}
current = next;
}
List<Document> documents = new ArrayList<>();
if (CollectionUtils.isEmpty(paragraphs)) {
return documents;
}
logger.info("Start processing paragraphs from PDF");
for (int i = 0; i < paragraphs.size(); i++) {
Paragraph from = paragraphs.get(i);
Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
Document document = toDocument(from, to);
if (document != null && StringUtils.hasText(document.getText())) {
documents.add(document);
}
}
logger.info("End processing paragraphs from PDF");
Expand All @@ -173,17 +163,28 @@ protected Document toDocument(Paragraph from, Paragraph to) {
/**
 * Populates the metadata of the given document from the paragraph it was built from.
 * <p>
 * Title, start page, end page and level are all read from {@code from}; the file
 * name comes from this reader's {@code resourceFileName}.
 * @param from the paragraph whose title, page range and level are recorded
 * @param to the paragraph that follows {@code from}; not read here, kept so the
 * signature stays compatible with existing callers and overrides
 * @param document the document whose metadata map is updated in place
 */
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
	document.getMetadata().put(METADATA_TITLE, from.title());
	document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
	// The end page is the paragraph's own end page. An earlier store of
	// to.startPageNumber() under this same key was immediately overwritten.
	document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber());
	document.getMetadata().put(METADATA_LEVEL, from.level());
	document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
}

public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {

if (fromParagraph.startPageNumber() < 1) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this an intended line break?

logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}",
fromParagraph.title(), fromParagraph.startPageNumber());
return "";
}

// Paragraph page numbers are 1-based, while PDFBox getPage() expects a 0-based index.
int startPage = fromParagraph.startPageNumber() - 1;
int endPage = toParagraph.startPageNumber() - 1;

if (fromParagraph == toParagraph || endPage < startPage) {
endPage = startPage;
}

try {

StringBuilder sb = new StringBuilder();
Expand All @@ -194,39 +195,36 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {

var page = this.document.getPage(pageNumber);
float pageHeight = page.getMediaBox().getHeight();

int fromPosition = fromParagraph.position();
int toPosition = toParagraph.position();

if (this.config.reversedParagraphPosition) {
fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
}
int fromPos = fromParagraph.position();
int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;

int x0 = (int) page.getMediaBox().getLowerLeftX();
int xW = (int) page.getMediaBox().getWidth();
int x = (int) page.getMediaBox().getLowerLeftX();
int w = (int) page.getMediaBox().getWidth();
int y, h;

int y0 = (int) page.getMediaBox().getLowerLeftY();
int yW = (int) page.getMediaBox().getHeight();

if (pageNumber == startPage) {
y0 = fromPosition;
yW = (int) page.getMediaBox().getHeight() - y0;
if (pageNumber == startPage && pageNumber == endPage) {
y = toPos;
h = fromPos - toPos;
}
if (pageNumber == endPage) {
yW = toPosition - y0;
else if (pageNumber == startPage) {
y = 0;
h = fromPos;
}

if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
yW = yW - this.config.pageBottomMargin;
else if (pageNumber == endPage) {
y = toPos;
h = (int) pageHeight - toPos;
}

if (y0 == 0) {
y0 = y0 + this.config.pageTopMargin;
yW = yW - this.config.pageTopMargin;
else {
y = 0;
h = (int) pageHeight;
}

pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
if (h < 0)
h = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add curly braces to the if statement


pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
pdfTextStripper.extractRegions(page);
var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
if (StringUtils.hasText(text)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ protected Paragraph generateParagraphs(Paragraph parentParagraph, PDOutlineNode
if (nextSiblingNumber < 0) {
nextSiblingNumber = getPageNumber(current.getLastChild());
}

var paragraphPosition = (current.getDestination() instanceof PDPageXYZDestination)
? ((PDPageXYZDestination) current.getDestination()).getTop() : 0;

Expand Down
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please rearrange the import statements in the following order:

  1. java.* packages
  2. Other packages
  3. org.springframework.* packages
  4. Static imports

For reference, you can refer to the example in this file:
OpenAiApiIT.java.

Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,42 @@

package org.springframework.ai.reader.pdf;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.junit.jupiter.api.Test;

import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mockito;
import org.mockito.Spy;
import org.mockito.junit.jupiter.MockitoExtension;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.ExtractedTextFormatter;

import org.springframework.ai.reader.pdf.config.ParagraphManager;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import org.springframework.test.util.ReflectionTestUtils;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.when;

/**
* @author Christian Tzolov
* @author Heonwoo Kim
*/
public class ParagraphPdfDocumentReaderTests {

Expand All @@ -50,4 +77,41 @@ public void testPdfWithoutToc() {

}

/**
 * Verifies that the reader does not throw when an outline entry has no destination,
 * and that the entry is skipped.
 * <p>
 * The test loads {@code sample3.pdf}, clears the destination of the second top-level
 * outline item, re-serializes the PDF in memory and reads it back. Only the documents
 * titled "Chapter 1" and "Chapter 3" are expected.
 * @throws IOException if the sample PDF cannot be read or re-serialized
 */
@Test
void shouldSkipInvalidOutline() throws IOException {

	Resource basePdfResource = new ClassPathResource("sample3.pdf");

	byte[] corruptedPdfBytes;
	// Both resources are managed by try-with-resources so the PDDocument is closed
	// even when reading the catalog or saving throws.
	try (InputStream inputStream = basePdfResource.getInputStream();
			PDDocument documentToModify = Loader.loadPDF(inputStream.readAllBytes())) {

		PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();
		if (outline != null && outline.getFirstChild() != null) {
			PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling();
			if (chapter2OutlineItem != null) {
				// Remove the destination so the outline entry no longer points at a page.
				chapter2OutlineItem.setDestination((PDDestination) null);
			}
		}

		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		documentToModify.save(baos);
		corruptedPdfBytes = baos.toByteArray();
	}

	Resource corruptedPdfResource = new ByteArrayResource(corruptedPdfBytes);

	ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
			PdfDocumentReaderConfig.defaultConfig());

	List<Document> documents = assertDoesNotThrow(() -> reader.get());

	assertThat(documents).isNotNull();
	assertThat(documents).hasSize(2);
	assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
	assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
}

}
Binary file not shown.