diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index 95863fff649..d8be2179675 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 the original author or authors. + * Copyright 2023-2025 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,7 @@ * The paragraphs are grouped into {@link Document} objects. * * @author Christian Tzolov + * @author Heonwoo Kim */ public class ParagraphPdfDocumentReader implements DocumentReader { @@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig */ @Override public List get() { - var paragraphs = this.paragraphTextExtractor.flatten(); - - List documents = new ArrayList<>(paragraphs.size()); - - if (!CollectionUtils.isEmpty(paragraphs)) { - logger.info("Start processing paragraphs from PDF"); - Iterator itr = paragraphs.iterator(); - - var current = itr.next(); - - if (!itr.hasNext()) { - documents.add(toDocument(current, current)); - } - else { - while (itr.hasNext()) { - var next = itr.next(); - Document document = toDocument(current, next); - if (document != null && StringUtils.hasText(document.getText())) { - documents.add(toDocument(current, next)); - } - current = next; - } + List documents = new ArrayList<>(); + if (CollectionUtils.isEmpty(paragraphs)) { + return documents; + } + logger.info("Start processing paragraphs from PDF"); + for (int i = 0; i < paragraphs.size(); i++) { + Paragraph from = paragraphs.get(i); + Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from; + Document document = toDocument(from, to); + if (document != null && StringUtils.hasText(document.getText())) { + documents.add(document); } } logger.info("End processing paragraphs from PDF"); @@ -173,17 +163,27 @@ protected Document toDocument(Paragraph from, Paragraph to) { protected void addMetadata(Paragraph from, Paragraph to, Document document) { document.getMetadata().put(METADATA_TITLE, from.title()); document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber()); - document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber()); + document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber()); document.getMetadata().put(METADATA_LEVEL, from.level()); document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName); } public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) { + if (fromParagraph.startPageNumber() < 1) { + logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}", + fromParagraph.title(), fromParagraph.startPageNumber()); + return ""; + } + // Page started from index 0, while PDFBOx getPage return them from index 1. int startPage = fromParagraph.startPageNumber() - 1; int endPage = toParagraph.startPageNumber() - 1; + if (fromParagraph == toParagraph || endPage < startPage) { + endPage = startPage; + } + try { StringBuilder sb = new StringBuilder(); @@ -194,39 +194,37 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) { var page = this.document.getPage(pageNumber); + float pageHeight = page.getMediaBox().getHeight(); - int fromPosition = fromParagraph.position(); - int toPosition = toParagraph.position(); - - if (this.config.reversedParagraphPosition) { - fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition); - toPosition = (int) (page.getMediaBox().getHeight() - toPosition); - } - - int x0 = (int) page.getMediaBox().getLowerLeftX(); - int xW = (int) page.getMediaBox().getWidth(); + int fromPos = fromParagraph.position(); + int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0; - int y0 = (int) page.getMediaBox().getLowerLeftY(); - int yW = (int) page.getMediaBox().getHeight(); + int x = (int) page.getMediaBox().getLowerLeftX(); + int w = (int) page.getMediaBox().getWidth(); + int y, h; - if (pageNumber == startPage) { - y0 = fromPosition; - yW = (int) page.getMediaBox().getHeight() - y0; + if (pageNumber == startPage && pageNumber == endPage) { + y = toPos; + h = fromPos - toPos; } - if (pageNumber == endPage) { - yW = toPosition - y0; + else if (pageNumber == startPage) { + y = 0; + h = fromPos; } - - if ((y0 + yW) == (int) page.getMediaBox().getHeight()) { - yW = yW - this.config.pageBottomMargin; + else if (pageNumber == endPage) { + y = toPos; + h = (int) pageHeight - toPos; + } + else { + y = 0; + h = (int) pageHeight; } - if (y0 == 0) { - y0 = y0 + this.config.pageTopMargin; - yW = yW - this.config.pageTopMargin; + if (h < 0) { + h = 0; } - pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW)); + pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h)); pdfTextStripper.extractRegions(page); var text = pdfTextStripper.getTextForRegion("pdfPageRegion"); if (StringUtils.hasText(text)) { diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java index b514f690e11..2e3351957cb 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 the original author or authors. + * Copyright 2023-2025 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,15 +16,32 @@ package org.springframework.ai.reader.pdf; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; import org.springframework.ai.reader.ExtractedTextFormatter; import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; +import org.springframework.core.io.ByteArrayResource; +import org.springframework.core.io.ClassPathResource; +import org.springframework.core.io.Resource; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; /** * @author Christian Tzolov + * @author Heonwoo Kim */ public class ParagraphPdfDocumentReaderTests { @@ -50,4 +67,41 @@ public void testPdfWithoutToc() { } + @Test + void shouldSkipInvalidOutline() throws IOException { + + Resource basePdfResource = new ClassPathResource("sample3.pdf"); + + PDDocument documentToModify; + try (InputStream inputStream = basePdfResource.getInputStream()) { + + byte[] pdfBytes = inputStream.readAllBytes(); + + documentToModify = Loader.loadPDF(pdfBytes); + } + PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline(); + if (outline != null && outline.getFirstChild() != null) { + PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling(); + if (chapter2OutlineItem != null) { + + chapter2OutlineItem.setDestination((PDDestination) null); + } + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + documentToModify.save(baos); + documentToModify.close(); + + Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray()); + + ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource, + PdfDocumentReaderConfig.defaultConfig()); + + List documents = assertDoesNotThrow(() -> reader.get()); + + assertThat(documents).isNotNull(); + assertThat(documents).hasSize(2); + assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1"); + assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3"); + } + } diff --git a/document-readers/pdf-reader/src/test/resources/sample3.pdf b/document-readers/pdf-reader/src/test/resources/sample3.pdf new file mode 100644 index 00000000000..8ed8b40633c Binary files /dev/null and b/document-readers/pdf-reader/src/test/resources/sample3.pdf differ