Skip to content

Commit 6e33c84

Browse files
committed
Fix PDF document grouping logic to ensure correct page distribution according to pagesPerDocument
1 parent 3f79373 commit 6e33c84

File tree

2 files changed

+13
-1
lines changed

2 files changed

+13
-1
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,7 @@ public List<Document> get() {
119119
pagesPerDocument++;
120120

121121
if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
122-
&& pagesPerDocument >= this.config.pagesPerDocument) {
122+
&& pagesPerDocument > this.config.pagesPerDocument) {
123123
pagesPerDocument = 0;
124124

125125
var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,4 +71,16 @@ void testIndexOutOfBound() {
7171
assertThat(documents).hasSize(64);
7272
}
7373

74+
@Test
75+
void testPagesPerDocument() {
76+
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
77+
PdfDocumentReaderConfig.builder()
78+
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
79+
.withPagesPerDocument(32)
80+
.build())
81+
.get();
82+
83+
assertThat(documents).hasSize(2);
84+
}
85+
7486
}

0 commit comments

Comments
 (0)