Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions src/main/java/com/itextpdf/rups/view/itext/StructureTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ This file is part of the iText (R) project.
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.rups.controller.PdfReaderController;
import com.itextpdf.rups.model.ObjectLoader;
import com.itextpdf.rups.model.TreeNodeFactory;
Expand Down Expand Up @@ -134,15 +135,26 @@ TreeModel recalculateTreeModel() {
return new DefaultTreeModel(root);
}

private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary page) {
final PdfIndirectReference ref = page.getIndirectReference();
private Map<Integer, MarkedContentInfo> indexMarkedContentOnPage(PdfDictionary pageDict) {
final PdfIndirectReference ref = pageDict.getIndirectReference();
Map<Integer, MarkedContentInfo> result = this.mciByPage.get(ref);
if (result != null) {
return result;
}
final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
gatherer.processPageContent(this.loader.getFile().getPdfDocument().getPage(page));
result = gatherer.getMarkedContentIndex();
final PdfPage page = this.loader.getFile().getPdfDocument().getPage(pageDict);
if (page != null) {
final MarkedContentInfoGatherer gatherer = new MarkedContentInfoGatherer();
gatherer.processPageContent(page);
result = gatherer.getMarkedContentIndex();
} else {
/*
* This can happen in unusual cases where a page is referenced by a
* structure element but, for some reason, is absent from the
* document's page tree. In that case we simply assume that there is
* no marked content to be found on it.
*/
result = Map.of();
}
this.mciByPage.put(ref, result);
return result;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ This file is part of the iText (R) project.
import com.itextpdf.rups.view.icons.IconTreeNode;

import java.util.Enumeration;
import java.util.regex.Pattern;
import javax.swing.tree.TreeNode;

/**
Expand All @@ -73,6 +74,8 @@ public class PdfObjectTreeNode extends IconTreeNode implements IPdfContextMenuTa
private static final String STREAM_ICON = "stream.png";
private static final String STRING_ICON = "string.png";

/**
 * Matches the ASCII control characters in the ranges 0x00-0x08 and
 * 0x0B-0x1F. Horizontal tab (0x09) and line feed (0x0A) fall outside
 * both ranges and are therefore not matched.
 */
private static final Pattern CAPTION_STRIP_REGEX = Pattern.compile("[\\x00-\\x08\\x0B-\\x1F]");

/**
* If the object is indirect, the number of the PDF object.
*/
Expand Down Expand Up @@ -369,7 +372,7 @@ public static String getCaption(PdfObject object) {
}
return String.format(Language.STREAM_OF_TYPE.getString(), type);
case PdfObject.STRING:
return ((PdfString) object).toUnicodeString();
return toStrippedUnicodeString((PdfString) object);
case PdfObject.DICTIONARY:
type = ((PdfDictionary) object).getAsName(PdfName.Type);
if (type == null) {
Expand Down Expand Up @@ -463,4 +466,16 @@ public boolean supportsSave() {
*/
return object.isStream() || object.isString();
}

/**
 * Converts a PDF string to a Java string via
 * {@link PdfString#toUnicodeString()} and then deletes every character
 * matched by {@code CAPTION_STRIP_REGEX} (a subset of the non-printable
 * ASCII control characters).
 *
 * @param pdfString PDF string to convert to Java string
 *
 * @return the converted string with the matched characters removed
 */
private static String toStrippedUnicodeString(PdfString pdfString) {
    final String unicodeText = pdfString.toUnicodeString();
    // Replace each matched control character with nothing.
    return CAPTION_STRIP_REGEX.matcher(unicodeText).replaceAll("");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,13 @@ public PdfObjectTreeNode getCorrespondingPdfObjectNode() {
}

private static Object ingestDictionaryNode(PdfDictionary dict, PdfObjectTreeNode node) {
final Object userObj;
final PdfObject dictType = dict.get(PdfName.Type, false);
if (PdfName.StructElem.equals(dictType)) {
if (PdfName.OBJR.equals(dictType)) {
return "OBJR => " + node.getPdfObject().getIndirectReference();
}
// Assuming StructElem from here, since the /Type key for structure
// elements is optional
if (PdfName.StructElem.equals(dictType) || dictType == null) {
final StringBuilder buf = new StringBuilder();
if (dict.get(PdfName.S, false) != null) {
buf.append(PdfObjectTreeNode.getCaption(dict.get(PdfName.S, false)));
Expand All @@ -113,15 +117,14 @@ private static Object ingestDictionaryNode(PdfDictionary dict, PdfObjectTreeNode
}
final PdfString actualText = dict.getAsString(PdfName.ActualText);
if (actualText != null) {
formatExtractedText(buf, actualText.toUnicodeString());
formatExtractedText(buf, PdfObjectTreeNode.getCaption(actualText));
}
if (buf.length() > 0) {
return buf.toString();
}
userObj = buf.toString();
} else if (PdfName.OBJR.equals(dictType)){
userObj = "OBJR => " + node.getPdfObject().getIndirectReference();
} else {
userObj = node;
}
return userObj;
// Using just the node itself as fallback
return node;
}

protected static void formatExtractedText(StringBuilder base, String extractedText) {
Expand Down
65 changes: 65 additions & 0 deletions src/test/java/com/itextpdf/rups/view/itext/StructureTreeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,71 @@ void extractMcidContentInStructureTreeWithActualTextTest()
Assertions.assertEquals("0 [Olleh ]", nodeLabel);
}

@Test
void structElemsGeneratedByAcrobatTest()
        throws IOException, ExecutionException, InterruptedException {
    // Acrobat omits the /Type key on the structure elements it writes.
    // This test checks that such a document is still processed properly.
    final File source = new File(SOURCE_DIR + "AcrobatStructElemTest.pdf");
    final PdfFile acrobatFile = PdfFile.open(source);

    final StructureTreeNode root = getStructureTreeRootNode(acrobatFile);
    Assertions.assertEquals(2, root.getChildCount());

    // First child of the root and its single leaf.
    final StructureTreeNode paragraphOne = (StructureTreeNode) root.getChildAt(0);
    Assertions.assertEquals("/P", paragraphOne.toString());
    Assertions.assertEquals(1, paragraphOne.getChildCount());
    final StructureTreeNode textOne = (StructureTreeNode) paragraphOne.getChildAt(0);
    Assertions.assertEquals("0 [A paragraph created in Acrobat, tagged. ]", textOne.toString());
    Assertions.assertEquals(0, textOne.getChildCount());

    // Second child of the root and its single leaf.
    final StructureTreeNode paragraphTwo = (StructureTreeNode) root.getChildAt(1);
    Assertions.assertEquals("/P -> #2 [Second Paragraph]", paragraphTwo.toString());
    Assertions.assertEquals(1, paragraphTwo.getChildCount());
    final StructureTreeNode textTwo = (StructureTreeNode) paragraphTwo.getChildAt(0);
    Assertions.assertEquals("1 [A second paragraph. ]", textTwo.toString());
    Assertions.assertEquals(0, textTwo.getChildCount());
}

@Test
void structElemsUnexpectedPageRefTestTest()
        throws IOException, ExecutionException, InterruptedException {
    // Some real-world documents contain a structure tree whose elements
    // reference a page that is missing from the overall page tree.
    // Building the structure tree for such a file must not crash.
    final File source = new File(SOURCE_DIR + "UnexpectedPageRefTest.pdf");
    final PdfFile brokenFile = PdfFile.open(source);

    StructureTreeNode current = getStructureTreeRootNode(brokenFile);
    Assertions.assertEquals(1, current.getChildCount());

    // Expected single-child chain below the root, from top to leaf:
    // label of each node and the number of children it must have.
    final String[] expectedLabels = {"/Document", "/Float", "/Figure", "0"};
    final int[] expectedChildCounts = {1, 1, 1, 0};

    for (int depth = 0; depth < expectedLabels.length; ++depth) {
        current = (StructureTreeNode) current.getChildAt(0);
        Assertions.assertEquals(expectedLabels[depth], current.toString());
        Assertions.assertEquals(expectedChildCounts[depth], current.getChildCount());
    }
}

private static StructureTreeNode getStructureTreeRootNode(IPdfFile pdfFile)
throws ExecutionException, InterruptedException {

Expand Down
Binary file not shown.
Binary file not shown.