Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -44,6 +45,7 @@
* pageBottomMargin = 0
*
* @author Christian Tzolov
* @author Fu Jian
*/
public class PagePdfDocumentReader implements DocumentReader {

Expand Down Expand Up @@ -96,74 +98,70 @@ public List<Document> get() {
try {
var pdfTextStripper = new PDFLayoutTextStripperByArea();

int pageNumber = 0;
int pagesPerDocument = 0;
int startPageNumber = pageNumber;
int pageNumber = 1;
int startPageNumber = 1;

List<String> pageTextGroupList = new ArrayList<>();

int totalPages = this.document.getDocumentCatalog().getPages().getCount();
int logFrequency = totalPages > 10 ? totalPages / 10 : 1; // if less than 10
// pages, print
// each iteration
int counter = 0;

PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next();
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
lastPage = page;
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
logger.info("Processing PDF page: {}", (counter + 1));
}
counter++;
PDPageTree pages = this.document.getDocumentCatalog().getPages();
int totalPages = pages.getCount();
int logFrequency = totalPages > 10 ? totalPages / 10 : 1;

pagesPerDocument++;
int pagesPerDocument = getPagesPerDocument(totalPages);
for (PDPage page : pages) {
if ((pageNumber - 1) % logFrequency == 0) {
logger.info("Processing PDF page: {}", pageNumber);
}

if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
&& pagesPerDocument >= this.config.pagesPerDocument) {
pagesPerDocument = 0;
handleSinglePage(page, pageNumber, pdfTextStripper, pageTextGroupList);

var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
if (StringUtils.hasText(aggregatedPageTextGroup)) {
readDocuments.add(toDocument(page, aggregatedPageTextGroup, startPageNumber, pageNumber));
if (pageNumber % pagesPerDocument == 0 || pageNumber == totalPages) {
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()),
startPageNumber, pageNumber));
pageTextGroupList.clear();
}
pageTextGroupList.clear();

startPageNumber = pageNumber + 1;
}
int x0 = (int) page.getMediaBox().getLowerLeftX();
int xW = (int) page.getMediaBox().getWidth();

int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
int yW = (int) page.getMediaBox().getHeight()
- (this.config.pageTopMargin + this.config.pageBottomMargin);

pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
pdfTextStripper.extractRegions(page);
var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);

if (StringUtils.hasText(pageText)) {

pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);

pageTextGroupList.add(pageText);
}
pageNumber++;
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
}
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
startPageNumber, pageNumber));
}
logger.info("Processing {} pages", totalPages);
return readDocuments;

logger.info("Processed total {} pages", totalPages);
return readDocuments;
}
catch (IOException e) {
throw new RuntimeException(e);
}
}

protected Document toDocument(PDPage page, String docText, int startPageNumber, int endPageNumber) {
private void handleSinglePage(PDPage page, int pageNumber, PDFLayoutTextStripperByArea pdfTextStripper,
List<String> pageTextGroupList) throws IOException {
int x0 = (int) page.getMediaBox().getLowerLeftX();
int xW = (int) page.getMediaBox().getWidth();

int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
int yW = (int) page.getMediaBox().getHeight() - (this.config.pageTopMargin + this.config.pageBottomMargin);

pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
pdfTextStripper.extractRegions(page);
var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);

if (StringUtils.hasText(pageText)) {
pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);
pageTextGroupList.add(pageText);
}
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
}

private int getPagesPerDocument(int totalPages) {
if (this.config.pagesPerDocument == PdfDocumentReaderConfig.ALL_PAGES) {
return totalPages;
}
return this.config.pagesPerDocument;
}

protected Document toDocument(String docText, int startPageNumber, int endPageNumber) {
Document doc = new Document(docText);
doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber);
if (startPageNumber != endPageNumber) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
/**
* @author Christian Tzolov
* @author Tibor Tarnai
* @author Fu Jian
*/
class PagePdfDocumentReaderTests {

Expand Down Expand Up @@ -71,4 +72,43 @@ void testIndexOutOfBound() {
assertThat(documents).hasSize(64);
}

@Test
void testPagesPerDocument() {
// The test pdf contain 64 pages
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
PdfDocumentReaderConfig.builder()
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
.withPagesPerDocument(32)
.build())
.get();

assertThat(documents).hasSize(2);
}

@Test
void testPagesPerDocumentNotDivisible() {
// The test pdf contain 64 pages
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
PdfDocumentReaderConfig.builder()
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
.withPagesPerDocument(3)
.build())
.get();

assertThat(documents).hasSize(22);
}

@Test
void testAllPagesPerDocument() {
// The test pdf contain 64 pages
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
PdfDocumentReaderConfig.builder()
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
.withPagesPerDocument(0) // all pages into one document
.build())
.get();

assertThat(documents).hasSize(1);
}

}