itext · iText-CI · Jan 6, 2026 · Jan 6, 2026
diff --git a/src/main/java/com/itextpdf/rups/view/itext/StructureTree.java b/src/main/java/com/itextpdf/rups/view/itext/StructureTree.java
@@ -47,6 +47,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.kernel.pdf.PdfName;
 import com.itextpdf.kernel.pdf.PdfNumber;
 import com.itextpdf.kernel.pdf.PdfObject;
+import com.itextpdf.kernel.pdf.PdfPage;
 import com.itextpdf.rups.controller.PdfReaderController;
 import com.itextpdf.rups.model.ObjectLoader;
 import com.itextpdf.rups.model.TreeNodeFactory;
@@ -134,15 +135,26 @@ TreeModel recalculateTreeModel() {
         return new DefaultTreeModel(root);
     }
 
-    private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary page) {
-        final PdfIndirectReference ref = page.getIndirectReference();
+    private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary pageDict) {
+        final PdfIndirectReference ref = pageDict.getIndirectReference();
         Map<Integer, MarkedContentInfo> result = this.mciByPage.get(ref);
         if (result != null) {
             return result;
         }
-        final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
-        gatherer.processPageContent(this.loader.getFile().getPdfDocument().getPage(page));
-        result = gatherer.getMarkedContentIndex();
+        final PdfPage page = this.loader.getFile().getPdfDocument().getPage(pageDict);
+        if (page != null) {
+            final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
+            gatherer.processPageContent(page);
+            result = gatherer.getMarkedContentIndex();
+        } else {
+            /*
+             * No PdfPage was found for this dictionary. This happens when a
+             * structure element references a page that is absent from the
+             * document's page tree. There is no content to scan then, so an
+             * empty index is cached for this page reference instead.
+             */
+            result = Map.of();
+        }
         this.mciByPage.put(ref, result);
         return result;
     }

diff --git a/src/main/java/com/itextpdf/rups/view/itext/treenodes/PdfObjectTreeNode.java b/src/main/java/com/itextpdf/rups/view/itext/treenodes/PdfObjectTreeNode.java
@@ -54,6 +54,7 @@ This file is part of the iText (R) project.
 import com.itextpdf.rups.view.icons.IconTreeNode;
 
 import java.util.Enumeration;
+import java.util.regex.Pattern;
 import javax.swing.tree.TreeNode;
 
 /**
@@ -73,6 +74,8 @@ public class PdfObjectTreeNode extends IconTreeNode implements IPdfContextMenuTa
     private static final String STREAM_ICON = "stream.png";
     private static final String STRING_ICON = "string.png";
 
+    private static final Pattern CAPTION_STRIP_REGEX = Pattern.compile("[\\x00-\\x08\\x0B-\\x1F]");
+
     /**
      * If the object is indirect, the number of the PDF object.
      */
@@ -369,7 +372,7 @@ public static String getCaption(PdfObject object) {
                 }
                 return String.format(Language.STREAM_OF_TYPE.getString(), type);
             case PdfObject.STRING:
-                return ((PdfString) object).toUnicodeString();
+                return toStrippedUnicodeString((PdfString) object);
             case PdfObject.DICTIONARY:
                 type = ((PdfDictionary) object).getAsName(PdfName.Type);
                 if (type == null) {
@@ -463,4 +466,16 @@ public boolean supportsSave() {
          */
         return object.isStream() || object.isString();
     }
+
+    /**
+     * Converts a PDF string with {@link PdfString#toUnicodeString()} and strips
+     * the control characters U+0000-U+001F from it, except tab and line feed.
+     *
+     * @param pdfString PDF string to convert to a Java string
+     *
+     * @return the converted string with the matched characters removed
+     */
+    private static String toStrippedUnicodeString(PdfString pdfString) {
+        return CAPTION_STRIP_REGEX.matcher(pdfString.toUnicodeString()).replaceAll("");
+    }
 }
diff --git a/src/main/java/com/itextpdf/rups/view/itext/treenodes/StructureTreeNode.java b/src/main/java/com/itextpdf/rups/view/itext/treenodes/StructureTreeNode.java
@@ -101,9 +101,13 @@ public PdfObjectTreeNode getCorrespondingPdfObjectNode() {
     }
 
     private static Object ingestDictionaryNode(PdfDictionary dict, PdfObjectTreeNode node) {
-        final Object userObj;
         final PdfObject dictType = dict.get(PdfName.Type, false);
-        if (PdfName.StructElem.equals(dictType)) {
+        if (PdfName.OBJR.equals(dictType)) {
+            return "OBJR => " + node.getPdfObject().getIndirectReference();
+        }
+        // Assuming StructElem from here, since the /Type key for structure
+        // elements is optional
+        if (PdfName.StructElem.equals(dictType) || dictType == null) {
             final StringBuilder buf = new StringBuilder();
             if (dict.get(PdfName.S, false) != null) {
                 buf.append(PdfObjectTreeNode.getCaption(dict.get(PdfName.S, false)));
@@ -113,15 +117,14 @@ private static Object ingestDictionaryNode(PdfDictionary dict, PdfObjectTreeNode
             }
             final PdfString actualText = dict.getAsString(PdfName.ActualText);
             if (actualText != null) {
-                formatExtractedText(buf, actualText.toUnicodeString());
+                formatExtractedText(buf, PdfObjectTreeNode.getCaption(actualText));
+            }
+            if (buf.length() > 0) {
+                return buf.toString();
             }
-            userObj = buf.toString();
-        } else if (PdfName.OBJR.equals(dictType)){
-            userObj = "OBJR => " + node.getPdfObject().getIndirectReference();
-        } else {
-            userObj = node;
         }
-        return userObj;
+        // Using just the node itself as fallback
+        return node;
     }
 
     protected static void formatExtractedText(StringBuilder base, String extractedText) {

diff --git a/src/test/java/com/itextpdf/rups/view/itext/StructureTreeTest.java b/src/test/java/com/itextpdf/rups/view/itext/StructureTreeTest.java
@@ -91,6 +91,71 @@ void extractMcidContentInStructureTreeWithActualTextTest()
         Assertions.assertEquals("0 [Olleh ]", nodeLabel);
     }
 
+    @Test
+    void structElemsGeneratedByAcrobatTest()
+            throws IOException, ExecutionException, InterruptedException {
+        /*
+         * Acrobat doesn't set the /Type key on the structure elements
+         * themselves. This test checks that the structure tree is still
+         * built with the expected node labels for such a document.
+         */
+        final PdfFile pdfFile = PdfFile.open(
+                new File(SOURCE_DIR + "AcrobatStructElemTest.pdf")
+        );
+
+        final StructureTreeNode rootNode = getStructureTreeRootNode(pdfFile);
+        Assertions.assertEquals(2, rootNode.getChildCount());
+
+        final StructureTreeNode firstPNode = (StructureTreeNode) rootNode.getChildAt(0);
+        Assertions.assertEquals("/P", firstPNode.toString());
+        Assertions.assertEquals(1, firstPNode.getChildCount());
+        final StructureTreeNode firstTextNode = (StructureTreeNode) firstPNode.getChildAt(0);
+        Assertions.assertEquals(
+                "0 [A paragraph created in Acrobat, tagged. ]",
+                firstTextNode.toString()
+        );
+        Assertions.assertEquals(0, firstTextNode.getChildCount());
+
+        final StructureTreeNode secondPNode = (StructureTreeNode) rootNode.getChildAt(1);
+        Assertions.assertEquals("/P -> #2 [Second Paragraph]", secondPNode.toString());
+        Assertions.assertEquals(1, secondPNode.getChildCount());
+        final StructureTreeNode secondTextNode = (StructureTreeNode) secondPNode.getChildAt(0);
+        Assertions.assertEquals(
+                "1 [A second paragraph. ]",
+                secondTextNode.toString()
+        );
+        Assertions.assertEquals(0, secondTextNode.getChildCount());
+    }
+
+    @Test
+    void structElemsUnexpectedPageRefTestTest()
+            throws IOException, ExecutionException, InterruptedException {
+        /*
+         * Some documents in the wild have a structure element tree whose
+         * elements reference a page that is absent from the overall page
+         * tree. Building the structure tree must not crash in such cases.
+         */
+        final PdfFile pdfFile = PdfFile.open(
+                new File(SOURCE_DIR + "UnexpectedPageRefTest.pdf")
+        );
+
+        final StructureTreeNode rootNode = getStructureTreeRootNode(pdfFile);
+        Assertions.assertEquals(1, rootNode.getChildCount());
+
+        final StructureTreeNode docNode = (StructureTreeNode) rootNode.getChildAt(0);
+        Assertions.assertEquals("/Document", docNode.toString());
+        Assertions.assertEquals(1, docNode.getChildCount());
+        final StructureTreeNode floatNode = (StructureTreeNode) docNode.getChildAt(0);
+        Assertions.assertEquals("/Float", floatNode.toString());
+        Assertions.assertEquals(1, floatNode.getChildCount());
+        final StructureTreeNode figureNode = (StructureTreeNode) floatNode.getChildAt(0);
+        Assertions.assertEquals("/Figure", figureNode.toString());
+        Assertions.assertEquals(1, figureNode.getChildCount());
+        final StructureTreeNode mcidNode = (StructureTreeNode) figureNode.getChildAt(0);
+        Assertions.assertEquals("0", mcidNode.toString());
+        Assertions.assertEquals(0, mcidNode.getChildCount());
+    }
+
     private static StructureTreeNode getStructureTreeRootNode(IPdfFile pdfFile)
             throws ExecutionException, InterruptedException {
 

diff --git a/src/test/resources/com/itextpdf/rups/controller/AcrobatStructElemTest.pdf b/src/test/resources/com/itextpdf/rups/controller/AcrobatStructElemTest.pdf
diff --git a/src/test/resources/com/itextpdf/rups/controller/UnexpectedPageRefTest.pdf b/src/test/resources/com/itextpdf/rups/controller/UnexpectedPageRefTest.pdf