- 
                Notifications
    
You must be signed in to change notification settings  - Fork 2k
 
GH 3421 : Fix silent failures in PDF outline processing #3623
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
cc3a246
              2a9cd05
              aabfda5
              88ffff4
              8fb8540
              02107b8
              291d515
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -46,6 +46,7 @@ | |
| * The paragraphs are grouped into {@link Document} objects. | ||
| * | ||
| * @author Christian Tzolov | ||
| * @author Heonwoo Kim | ||
| */ | ||
| public class ParagraphPdfDocumentReader implements DocumentReader { | ||
| 
     | 
||
| 
          
            
          
           | 
    @@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig | |
| */ | ||
| @Override | ||
| public List<Document> get() { | ||
| 
     | 
||
| var paragraphs = this.paragraphTextExtractor.flatten(); | ||
| 
     | 
||
| List<Document> documents = new ArrayList<>(paragraphs.size()); | ||
| 
     | 
||
| if (!CollectionUtils.isEmpty(paragraphs)) { | ||
| logger.info("Start processing paragraphs from PDF"); | ||
| Iterator<Paragraph> itr = paragraphs.iterator(); | ||
| 
     | 
||
| var current = itr.next(); | ||
| 
     | 
||
| if (!itr.hasNext()) { | ||
| documents.add(toDocument(current, current)); | ||
| } | ||
| else { | ||
| while (itr.hasNext()) { | ||
| var next = itr.next(); | ||
| Document document = toDocument(current, next); | ||
| if (document != null && StringUtils.hasText(document.getText())) { | ||
| documents.add(toDocument(current, next)); | ||
| } | ||
| current = next; | ||
| } | ||
| List<Document> documents = new ArrayList<>(); | ||
| if (CollectionUtils.isEmpty(paragraphs)) { | ||
| return documents; | ||
| } | ||
| logger.info("Start processing paragraphs from PDF"); | ||
| for (int i = 0; i < paragraphs.size(); i++) { | ||
| Paragraph from = paragraphs.get(i); | ||
| Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from; | ||
| Document document = toDocument(from, to); | ||
| if (document != null && StringUtils.hasText(document.getText())) { | ||
| documents.add(document); | ||
| } | ||
| } | ||
| logger.info("End processing paragraphs from PDF"); | ||
| 
        
          
        
         | 
    @@ -173,17 +163,28 @@ protected Document toDocument(Paragraph from, Paragraph to) { | |
| protected void addMetadata(Paragraph from, Paragraph to, Document document) { | ||
| document.getMetadata().put(METADATA_TITLE, from.title()); | ||
| document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber()); | ||
| document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber()); | ||
| document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber()); | ||
| document.getMetadata().put(METADATA_LEVEL, from.level()); | ||
| document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName); | ||
| } | ||
| 
     | 
||
| public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) { | ||
| 
     | 
||
| if (fromParagraph.startPageNumber() < 1) { | ||
| 
     | 
||
| logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}", | ||
| fromParagraph.title(), fromParagraph.startPageNumber()); | ||
| return ""; | ||
| } | ||
| 
     | 
||
| // Paragraph page numbers start at 1, while PDFBox getPage expects a 0-based page index. | ||
| int startPage = fromParagraph.startPageNumber() - 1; | ||
| int endPage = toParagraph.startPageNumber() - 1; | ||
| 
     | 
||
| if (fromParagraph == toParagraph || endPage < startPage) { | ||
| endPage = startPage; | ||
| } | ||
| 
     | 
||
| try { | ||
| 
     | 
||
| StringBuilder sb = new StringBuilder(); | ||
| 
        
          
        
         | 
    @@ -194,39 +195,36 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara | |
| for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) { | ||
| 
     | 
||
| var page = this.document.getPage(pageNumber); | ||
| float pageHeight = page.getMediaBox().getHeight(); | ||
| 
     | 
||
| int fromPosition = fromParagraph.position(); | ||
| int toPosition = toParagraph.position(); | ||
| 
     | 
||
| if (this.config.reversedParagraphPosition) { | ||
| fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition); | ||
| toPosition = (int) (page.getMediaBox().getHeight() - toPosition); | ||
| } | ||
| int fromPos = fromParagraph.position(); | ||
| int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0; | ||
| 
     | 
||
| int x0 = (int) page.getMediaBox().getLowerLeftX(); | ||
| int xW = (int) page.getMediaBox().getWidth(); | ||
| int x = (int) page.getMediaBox().getLowerLeftX(); | ||
| int w = (int) page.getMediaBox().getWidth(); | ||
| int y, h; | ||
| 
     | 
||
| int y0 = (int) page.getMediaBox().getLowerLeftY(); | ||
| int yW = (int) page.getMediaBox().getHeight(); | ||
| 
     | 
||
| if (pageNumber == startPage) { | ||
| y0 = fromPosition; | ||
| yW = (int) page.getMediaBox().getHeight() - y0; | ||
| if (pageNumber == startPage && pageNumber == endPage) { | ||
| y = toPos; | ||
| h = fromPos - toPos; | ||
| } | ||
| if (pageNumber == endPage) { | ||
| yW = toPosition - y0; | ||
| else if (pageNumber == startPage) { | ||
| y = 0; | ||
| h = fromPos; | ||
| } | ||
| 
     | 
||
| if ((y0 + yW) == (int) page.getMediaBox().getHeight()) { | ||
| yW = yW - this.config.pageBottomMargin; | ||
| else if (pageNumber == endPage) { | ||
| y = toPos; | ||
| h = (int) pageHeight - toPos; | ||
| } | ||
| 
     | 
||
| if (y0 == 0) { | ||
| y0 = y0 + this.config.pageTopMargin; | ||
| yW = yW - this.config.pageTopMargin; | ||
| else { | ||
| y = 0; | ||
| h = (int) pageHeight; | ||
| } | ||
| 
     | 
||
| pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW)); | ||
| if (h < 0) | ||
| h = 0; | ||
                
       | 
||
| 
     | 
||
| pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h)); | ||
| pdfTextStripper.extractRegions(page); | ||
| var text = pdfTextStripper.getTextForRegion("pdfPageRegion"); | ||
| if (StringUtils.hasText(text)) { | ||
| 
          
            
          
           | 
    ||
| 
                       There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please rearrange the import statements in the following order: 
 For reference, you can refer to the example in this file:  | 
            
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this an intended line break?