From cc3a246973b8e0271655ab8c1d75d515085c496b Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 14:23:12 +0900 Subject: [PATCH 1/7] fix getTextBetweenParagraphs to check for invalid page numbers in outline items Signed-off-by: WOONBE --- .../autoconfigure/OpenAiEmbeddingProperties.java | 9 +++++++++ .../ai/reader/pdf/ParagraphPdfDocumentReader.java | 12 ++++++++++++ 2 files changed, 21 insertions(+) diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java index fba25fdfbdd..fe49b2d5ea8 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java @@ -34,6 +34,8 @@ public class OpenAiEmbeddingProperties extends OpenAiParentProperties { private String embeddingsPath = DEFAULT_EMBEDDINGS_PATH; + private int maxBatchChars = 160000; + @NestedConfigurationProperty private OpenAiEmbeddingOptions options = OpenAiEmbeddingOptions.builder().model(DEFAULT_EMBEDDING_MODEL).build(); @@ -61,4 +63,11 @@ public void setEmbeddingsPath(String embeddingsPath) { this.embeddingsPath = embeddingsPath; } + public int getMaxBatchChars() { + return this.maxBatchChars; + } + public void setMaxBatchChars(int maxBatchChars) { + this.maxBatchChars = maxBatchChars; + } + } diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index 95863fff649..9dd23f816b3 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -46,6 +46,7 @@ * The paragraphs are grouped into {@link Document} objects. * * @author Christian Tzolov + * @author Heonwoo Kim */ public class ParagraphPdfDocumentReader implements DocumentReader { @@ -180,10 +181,21 @@ protected void addMetadata(Paragraph from, Paragraph to, Document document) { public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) { + if (fromParagraph.startPageNumber() < 1) { + + logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}", + fromParagraph.title(), fromParagraph.startPageNumber()); + return ""; + } + // Page started from index 0, while PDFBOx getPage return them from index 1. int startPage = fromParagraph.startPageNumber() - 1; int endPage = toParagraph.startPageNumber() - 1; + if (endPage < 0) { + endPage = startPage; + } + try { StringBuilder sb = new StringBuilder(); From 2a9cd05ae9871d8980beb590b4c46dd033cd8adc Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 16:38:47 +0900 Subject: [PATCH 2/7] Fix ParagraphPdfDocumentReader to reliably extract text from PDFs with imperfect outlines and coordinate edge cases Signed-off-by: WOONBE --- .../pdf/ParagraphPdfDocumentReader.java | 90 +++++++----------- .../reader/pdf/config/ParagraphManager.java | 1 - .../pdf/ParagraphPdfDocumentReaderTests.java | 31 ++++++ .../pdf-reader/src/test/resources/sample3.pdf | Bin 0 -> 1295 bytes 4 files changed, 67 insertions(+), 55 deletions(-) create mode 100644 document-readers/pdf-reader/src/test/resources/sample3.pdf diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index 9dd23f816b3..e02ff1318f5 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -128,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig */ @Override public List get() { - var paragraphs = this.paragraphTextExtractor.flatten(); - - List documents = new ArrayList<>(paragraphs.size()); - - if (!CollectionUtils.isEmpty(paragraphs)) { - logger.info("Start processing paragraphs from PDF"); - Iterator itr = paragraphs.iterator(); - - var current = itr.next(); - - if (!itr.hasNext()) { - documents.add(toDocument(current, current)); - } - else { - while (itr.hasNext()) { - var next = itr.next(); - Document document = toDocument(current, next); - if (document != null && StringUtils.hasText(document.getText())) { - documents.add(toDocument(current, next)); - } - current = next; - } + List documents = new ArrayList<>(); + if (CollectionUtils.isEmpty(paragraphs)) { + return documents; + } + logger.info("Start processing paragraphs from PDF"); + for (int i = 0; i < paragraphs.size(); i++) { + Paragraph from = paragraphs.get(i); + Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from; + Document document = toDocument(from, to); + if (document != null && StringUtils.hasText(document.getText())) { + documents.add(document); } } logger.info("End processing paragraphs from PDF"); @@ -174,7 +163,7 @@ protected Document toDocument(Paragraph from, Paragraph to) { protected void addMetadata(Paragraph from, Paragraph to, Document document) { document.getMetadata().put(METADATA_TITLE, from.title()); document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber()); - document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber()); + document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber()); document.getMetadata().put(METADATA_LEVEL, from.level()); document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName); } @@ -192,7 +181,7 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara int startPage = fromParagraph.startPageNumber() - 1; int endPage = toParagraph.startPageNumber() - 1; - if (endPage < 0) { + if (fromParagraph == toParagraph || endPage < startPage) { endPage = startPage; } @@ -206,39 +195,32 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) { var page = this.document.getPage(pageNumber); - - int fromPosition = fromParagraph.position(); - int toPosition = toParagraph.position(); - - if (this.config.reversedParagraphPosition) { - fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition); - toPosition = (int) (page.getMediaBox().getHeight() - toPosition); + float pageHeight = page.getMediaBox().getHeight(); + + int fromPos = fromParagraph.position(); + int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0; + + int x = (int) page.getMediaBox().getLowerLeftX(); + int w = (int) page.getMediaBox().getWidth(); + int y, h; + + if (pageNumber == startPage && pageNumber == endPage) { + y = toPos; + h = fromPos - toPos; + } else if (pageNumber == startPage) { + y = 0; + h = fromPos; + } else if (pageNumber == endPage) { + y = toPos; + h = (int) pageHeight - toPos; + } else { + y = 0; + h = (int) pageHeight; } - int x0 = (int) page.getMediaBox().getLowerLeftX(); - int xW = (int) page.getMediaBox().getWidth(); - - int y0 = (int) page.getMediaBox().getLowerLeftY(); - int yW = (int) page.getMediaBox().getHeight(); - - if (pageNumber == startPage) { - y0 = fromPosition; - yW = (int) page.getMediaBox().getHeight() - y0; - } - if (pageNumber == endPage) { - yW = toPosition - y0; - } - - if ((y0 + yW) == (int) page.getMediaBox().getHeight()) { - yW = yW - this.config.pageBottomMargin; - } - - if (y0 == 0) { - y0 = y0 + this.config.pageTopMargin; - yW = yW - this.config.pageTopMargin; - } + if (h < 0) h = 0; - pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW)); + pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h)); pdfTextStripper.extractRegions(page); var text = pdfTextStripper.getTextForRegion("pdfPageRegion"); if (StringUtils.hasText(text)) { diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java index f3e5eb69cc0..27d35d593f2 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java @@ -117,7 +117,6 @@ protected Paragraph generateParagraphs(Paragraph parentParagraph, PDOutlineNode if (nextSiblingNumber < 0) { nextSiblingNumber = getPageNumber(current.getLastChild()); } - var paragraphPosition = (current.getDestination() instanceof PDPageXYZDestination) ? ((PDPageXYZDestination) current.getDestination()).getTop() : 0; diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java index b514f690e11..2d82abce365 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java @@ -16,15 +16,43 @@ package org.springframework.ai.reader.pdf; + +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; +import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; +import org.mockito.Spy; +import org.mockito.junit.jupiter.MockitoExtension; +import org.springframework.ai.document.Document; import org.springframework.ai.reader.ExtractedTextFormatter; + +import org.springframework.ai.reader.pdf.config.ParagraphManager; import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; +import org.springframework.core.io.ByteArrayResource; +import org.springframework.core.io.ClassPathResource; +import org.springframework.core.io.Resource; +import org.springframework.test.util.ReflectionTestUtils; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.when; /** * @author Christian Tzolov + * @author Heonwoo Kim */ public class ParagraphPdfDocumentReaderTests { @@ -50,4 +78,7 @@ public void testPdfWithoutToc() { } + + + } diff --git a/document-readers/pdf-reader/src/test/resources/sample3.pdf b/document-readers/pdf-reader/src/test/resources/sample3.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8ed8b40633c853d6c288738f9f679b2fc12e7763 GIT binary patch literal 1295 zcmY!laBZ^4=fsl4ocwey{jk)c;>`R! z1$~fe{eZ;u)M5oApzR4$;V;*z4 UskiaEtx4=BGBF4+V1G10 zM347*&yl7{SCm+twOc3jgs`69GilMEDVLs1I`hZIw{?P!-gN;zh7&RZKr?_2fEWjJ ziXq6cFu%dvVnPMCC_~(G|Hv7e)(4aREb@Z8WL_`75jK}#3l(a(BpKq8=YRG*nRDq8 za+oZh$KQp;B}Q1n#Mr_d7We*1S-~Z_T>5?rhCnfFkp>hjDlSnlG_XXCzpKHwy^a_N z{GIxF1%IB=OI6JiO}lq-+?y8qp^d-Pd`;seyXQQ} za0s|^I0iOxEJ;4Hpe13!7PqDY8WUVJR67KO-WRo{7`W*0K35FmDT+I>pfQX`?Sfj< zB!gKCI69}@xGx^Qo}1gy@AS>9+c=v-bXdpJt2A#?$$_ z#P6J*=knRr^GdK)7xRn>RmW78tu*lSKCK!O9q5x2dgbYq%)@*VVmqJz<=WM2Jx{Q? zhBKStWA2(me(u#?G1vO{@k&onV9QM1JvVu_ZS?zPFYhflktX2&bIXjKHGG9{KQ-?u z@_z9Bu4?o$qE>eVYIN{?;8k_d3U=OC~Aro`3ajfVs(|JNtg5UAg9y z7Z!SY)wAEB$CuW|S-sGBk=>Qx>1ZEtuV~Gb{fwW_*(}Ev*~VBhfH9YTP<}oz2EiFu z-_u1Q+Q!M!)Y!t()WE{j*u>1q(#gf##mLpf&CuAw%*n;b($r3Yu##9<8j1)?P23WaC`1tVjil5mA+Ltr6d2xMYQT_{C@=!w&e4hAAF7ia!p;XLV8 zJu6_VQo;R4IekmcXL(sQPc%r_vTg^bv60y8&ghK`7u+hp)5pwvx9%(Rzh2glSc0Rt XB(bQZq6p|ZLjwb2E>%@me>W}wgn{YA literal 0 HcmV?d00001 From aabfda5167ddfc36c7f7154bd8a2acaca2867f55 Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 16:46:14 +0900 Subject: [PATCH 3/7] Test : Add test to validate ParagraphPdfDocumentReader to skip Invalid Outline Signed-off-by: WOONBE --- .../OpenAiEmbeddingProperties.java | 1 + .../pdf/ParagraphPdfDocumentReader.java | 12 ++++--- .../pdf/ParagraphPdfDocumentReaderTests.java | 35 ++++++++++++++++++- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java index fe49b2d5ea8..a295847157f 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java @@ -66,6 +66,7 @@ public void setEmbeddingsPath(String embeddingsPath) { public int getMaxBatchChars() { return this.maxBatchChars; } + public void setMaxBatchChars(int maxBatchChars) { this.maxBatchChars = maxBatchChars; } diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index e02ff1318f5..9f699bae894 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -207,18 +207,22 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara if (pageNumber == startPage && pageNumber == endPage) { y = toPos; h = fromPos - toPos; - } else if (pageNumber == startPage) { + } + else if (pageNumber == startPage) { y = 0; h = fromPos; - } else if (pageNumber == endPage) { + } + else if (pageNumber == endPage) { y = toPos; h = (int) pageHeight - toPos; - } else { + } + else { y = 0; h = (int) pageHeight; } - if (h < 0) h = 0; + if (h < 0) + h = 0; pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h)); pdfTextStripper.extractRegions(page); diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java index 2d82abce365..2e314536215 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java @@ -16,7 +16,6 @@ package org.springframework.ai.reader.pdf; - import org.apache.pdfbox.Loader; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.pdmodel.PDDocument; @@ -78,7 +77,41 @@ public void testPdfWithoutToc() { } + @Test + void shouldSkipInvalidOutline() throws IOException { + + Resource basePdfResource = new ClassPathResource("sample3.pdf"); + + PDDocument documentToModify; + try (InputStream inputStream = basePdfResource.getInputStream()) { + + byte[] pdfBytes = inputStream.readAllBytes(); + documentToModify = Loader.loadPDF(pdfBytes); + } + PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline(); + if (outline != null && outline.getFirstChild() != null) { + PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling(); + if (chapter2OutlineItem != null) { + chapter2OutlineItem.setDestination((PDDestination) null); + } + } + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + documentToModify.save(baos); + documentToModify.close(); + + Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray()); + + ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource, + PdfDocumentReaderConfig.defaultConfig()); + + List documents = assertDoesNotThrow(() -> reader.get()); + + assertThat(documents).isNotNull(); + assertThat(documents).hasSize(2); + assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1"); + assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3"); + } } From 88ffff412c7cdf979cbbec36fd4d7fd73256fe21 Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 17:12:35 +0900 Subject: [PATCH 4/7] Rollback OpenAiEmbeddingProperties Signed-off-by: WOONBE --- .../autoconfigure/OpenAiEmbeddingProperties.java | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java index a295847157f..fba25fdfbdd 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java @@ -34,8 +34,6 @@ public class OpenAiEmbeddingProperties extends OpenAiParentProperties { private String embeddingsPath = DEFAULT_EMBEDDINGS_PATH; - private int maxBatchChars = 160000; - @NestedConfigurationProperty private OpenAiEmbeddingOptions options = OpenAiEmbeddingOptions.builder().model(DEFAULT_EMBEDDING_MODEL).build(); @@ -63,12 +61,4 @@ public void setEmbeddingsPath(String embeddingsPath) { this.embeddingsPath = embeddingsPath; } - public int getMaxBatchChars() { - return this.maxBatchChars; - } - - public void setMaxBatchChars(int maxBatchChars) { - this.maxBatchChars = maxBatchChars; - } - } From 8fb85404bba1ee78e4b68af3603e4ee965d12819 Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 22:19:22 +0900 Subject: [PATCH 5/7] Style: Reorder imports and remove unnecessary line break Signed-off-by: WOONBE --- .../pdf/ParagraphPdfDocumentReader.java | 1 - .../reader/pdf/config/ParagraphManager.java | 1 + .../pdf/ParagraphPdfDocumentReaderTests.java | 20 +++++-------------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index 9f699bae894..e999dfbda70 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -171,7 +171,6 @@ protected void addMetadata(Paragraph from, Paragraph to, Document document) { public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) { if (fromParagraph.startPageNumber() < 1) { - logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}", fromParagraph.title(), fromParagraph.startPageNumber()); return ""; diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java index 27d35d593f2..f3e5eb69cc0 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/config/ParagraphManager.java @@ -117,6 +117,7 @@ protected Paragraph generateParagraphs(Paragraph parentParagraph, PDOutlineNode if (nextSiblingNumber < 0) { nextSiblingNumber = getPageNumber(current.getLastChild()); } + var paragraphPosition = (current.getDestination() instanceof PDPageXYZDestination) ? ((PDPageXYZDestination) current.getDestination()).getTop() : 0; diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java index 2e314536215..1f1f6af6190 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java @@ -16,38 +16,28 @@ package org.springframework.ai.reader.pdf; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mockito; -import org.mockito.Spy; -import org.mockito.junit.jupiter.MockitoExtension; import org.springframework.ai.document.Document; import org.springframework.ai.reader.ExtractedTextFormatter; - -import org.springframework.ai.reader.pdf.config.ParagraphManager; import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig; import org.springframework.core.io.ByteArrayResource; import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.Resource; -import org.springframework.test.util.ReflectionTestUtils; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.List; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; -import static org.mockito.Mockito.doReturn; -import static org.mockito.Mockito.when; /** * @author Christian Tzolov From 02107b85ee79231e9821dbf80c3f45e3a71d44d0 Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 22:23:12 +0900 Subject: [PATCH 6/7] Add curly braces to single-line if statement Signed-off-by: WOONBE --- .../ai/reader/pdf/ParagraphPdfDocumentReader.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index e999dfbda70..bbbc214c45e 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -220,8 +220,9 @@ else if (pageNumber == endPage) { h = (int) pageHeight; } - if (h < 0) + if (h < 0) { h = 0; + } pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h)); pdfTextStripper.extractRegions(page); From 291d515592bd3b40af623dcb4d1a5784f5bc0c81 Mon Sep 17 00:00:00 2001 From: WOONBE Date: Fri, 20 Jun 2025 22:42:27 +0900 Subject: [PATCH 7/7] style : modified copyright year Signed-off-by: WOONBE --- .../ai/reader/pdf/ParagraphPdfDocumentReader.java | 2 +- .../ai/reader/pdf/ParagraphPdfDocumentReaderTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java index bbbc214c45e..d8be2179675 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 the original author or authors. + * Copyright 2023-2025 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java index 1f1f6af6190..2e3351957cb 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java @@ -1,5 +1,5 @@ /* - * Copyright 2023-2024 the original author or authors. + * Copyright 2023-2025 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.