Skip to content

Commit aabfda5

Browse files
committed
Test: Add test to validate that ParagraphPdfDocumentReader skips an invalid outline
Signed-off-by: WOONBE <[email protected]>
1 parent 2a9cd05 commit aabfda5

File tree

3 files changed: 43 additions and 5 deletions.

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiEmbeddingProperties.java

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ public void setEmbeddingsPath(String embeddingsPath) {
6666
public int getMaxBatchChars() {
6767
return this.maxBatchChars;
6868
}
69+
6970
public void setMaxBatchChars(int maxBatchChars) {
7071
this.maxBatchChars = maxBatchChars;
7172
}

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -207,18 +207,22 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
207207
if (pageNumber == startPage && pageNumber == endPage) {
208208
y = toPos;
209209
h = fromPos - toPos;
210-
} else if (pageNumber == startPage) {
210+
}
211+
else if (pageNumber == startPage) {
211212
y = 0;
212213
h = fromPos;
213-
} else if (pageNumber == endPage) {
214+
}
215+
else if (pageNumber == endPage) {
214216
y = toPos;
215217
h = (int) pageHeight - toPos;
216-
} else {
218+
}
219+
else {
217220
y = 0;
218221
h = (int) pageHeight;
219222
}
220223

221-
if (h < 0) h = 0;
224+
if (h < 0)
225+
h = 0;
222226

223227
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
224228
pdfTextStripper.extractRegions(page);

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,6 @@
1616

1717
package org.springframework.ai.reader.pdf;
1818

19-
2019
import org.apache.pdfbox.Loader;
2120
import org.apache.pdfbox.io.RandomAccessRead;
2221
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -78,7 +77,41 @@ public void testPdfWithoutToc() {
7877

7978
}
8079

80+
@Test
81+
void shouldSkipInvalidOutline() throws IOException {
82+
83+
Resource basePdfResource = new ClassPathResource("sample3.pdf");
84+
85+
PDDocument documentToModify;
86+
try (InputStream inputStream = basePdfResource.getInputStream()) {
87+
88+
byte[] pdfBytes = inputStream.readAllBytes();
8189

90+
documentToModify = Loader.loadPDF(pdfBytes);
91+
}
92+
PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();
93+
if (outline != null && outline.getFirstChild() != null) {
94+
PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling();
95+
if (chapter2OutlineItem != null) {
8296

97+
chapter2OutlineItem.setDestination((PDDestination) null);
98+
}
99+
}
100+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
101+
documentToModify.save(baos);
102+
documentToModify.close();
103+
104+
Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray());
105+
106+
ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
107+
PdfDocumentReaderConfig.defaultConfig());
108+
109+
List<Document> documents = assertDoesNotThrow(() -> reader.get());
110+
111+
assertThat(documents).isNotNull();
112+
assertThat(documents).hasSize(2);
113+
assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
114+
assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
115+
}
83116

84117
}

0 commit comments

Comments (0)