Skip to content

Commit 909b085

Browse files
committed
Fix NPE when a structure element references a page outside the page tree
For some reason, OCR in the Apryse SDK produces such files: there is a proper page in the page tree, but the structure elements reference an entirely different page that is not referenced in /Pages. In such cases we now assume that no marked content was found on that page.
1 parent 47797d8 commit 909b085

File tree

3 files changed

+47
-5
lines changed

3 files changed

+47
-5
lines changed

src/main/java/com/itextpdf/rups/view/itext/StructureTree.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ This file is part of the iText (R) project.
4747
import com.itextpdf.kernel.pdf.PdfName;
4848
import com.itextpdf.kernel.pdf.PdfNumber;
4949
import com.itextpdf.kernel.pdf.PdfObject;
50+
import com.itextpdf.kernel.pdf.PdfPage;
5051
import com.itextpdf.rups.controller.PdfReaderController;
5152
import com.itextpdf.rups.model.ObjectLoader;
5253
import com.itextpdf.rups.model.TreeNodeFactory;
@@ -134,15 +135,27 @@ TreeModel recalculateTreeModel() {
134135
return new DefaultTreeModel(root);
135136
}
136137

137-
private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary page) {
138-
final PdfIndirectReference ref = page.getIndirectReference();
138+
private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary pageDict) {
139+
final PdfIndirectReference ref = pageDict.getIndirectReference();
139140
Map<Integer, MarkedContentInfo> result = this.mciByPage.get(ref);
140141
if (result != null) {
141142
return result;
142143
}
143-
final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
144-
gatherer.processPageContent(this.loader.getFile().getPdfDocument().getPage(page));
145-
result = gatherer.getMarkedContentIndex();
144+
final PdfPage page = this.loader.getFile().getPdfDocument().getPage(pageDict);
145+
if (page != null) {
146+
final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
147+
gatherer.processPageContent(page);
148+
result = gatherer.getMarkedContentIndex();
149+
} else {
150+
/*
151+
* This can happen in weird cases, when there is a page in the
152+
* document, that is referenced within a structure element, but it
153+
* is absent in the page tree for some reason... OCR in Apryse SDK
154+
* does this, oddly enough. So we will just assume, that we could
155+
* not find anything there.
156+
*/
157+
result = Map.of();
158+
}
146159
this.mciByPage.put(ref, result);
147160
return result;
148161
}

src/test/java/com/itextpdf/rups/view/itext/StructureTreeTest.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,35 @@ void structElemsGeneratedByAcrobatTest()
127127
Assertions.assertEquals(0, secondTextNode.getChildCount());
128128
}
129129

130+
@Test
131+
void structElemsGeneratedByApryseSdkOcrTest()
132+
throws IOException, ExecutionException, InterruptedException {
133+
/*
134+
* For some reason when OCR in Apryse SDK creates the structure elem
135+
* tree, the elements there reference a page, which is absent in the
136+
* overall page tree. We shouldn't crash in such cases.
137+
*/
138+
final PdfFile pdfFile = PdfFile.open(
139+
new File(SOURCE_DIR + "ApryseSdkOcrTest.pdf")
140+
);
141+
142+
final StructureTreeNode rootNode = getStructureTreeRootNode(pdfFile);
143+
Assertions.assertEquals(1, rootNode.getChildCount());
144+
145+
final StructureTreeNode docNode = (StructureTreeNode) rootNode.getChildAt(0);
146+
Assertions.assertEquals("/Document", docNode.toString());
147+
Assertions.assertEquals(1, docNode.getChildCount());
148+
final StructureTreeNode floatNode = (StructureTreeNode) docNode.getChildAt(0);
149+
Assertions.assertEquals("/Float", floatNode.toString());
150+
Assertions.assertEquals(1, floatNode.getChildCount());
151+
final StructureTreeNode figureNode = (StructureTreeNode) floatNode.getChildAt(0);
152+
Assertions.assertEquals("/Figure", figureNode.toString());
153+
Assertions.assertEquals(1, figureNode.getChildCount());
154+
final StructureTreeNode mcidNode = (StructureTreeNode) figureNode.getChildAt(0);
155+
Assertions.assertEquals("0", mcidNode.toString());
156+
Assertions.assertEquals(0, mcidNode.getChildCount());
157+
}
158+
130159
private static StructureTreeNode getStructureTreeRootNode(IPdfFile pdfFile)
131160
throws ExecutionException, InterruptedException {
132161

Binary file not shown.

0 commit comments

Comments
 (0)