Skip to content

Commit b799a41

Browse files
committed
PDFBOX-5974: check that all MCIDs of a page content stream have an entry in the ParentTree.
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1924338 13f79535-47bb-0310-9956-ffa450edef68
1 parent 3a9a14c commit b799a41

File tree

1 file changed

+70
-2
lines changed

1 file changed

+70
-2
lines changed

pdfbox/src/test/java/org/apache/pdfbox/multipdf/PDFMergerUtilityTest.java

Lines changed: 70 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import java.util.List;
2828
import java.util.Map;
2929
import java.util.Set;
30+
import java.util.TreeSet;
3031

3132
import org.apache.pdfbox.Loader;
3233
import org.apache.pdfbox.cos.COSArray;
@@ -44,9 +45,12 @@
4445
import org.apache.pdfbox.pdmodel.common.COSObjectable;
4546
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
4647
import org.apache.pdfbox.pdmodel.common.PDNumberTreeNode;
48+
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDMarkedContentReference;
49+
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDParentTreeValue;
4750
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement;
4851
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode;
4952
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
53+
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
5054
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
5155
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
5256
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
@@ -62,6 +66,7 @@
6266
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
6367
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
6468
import org.apache.pdfbox.rendering.PDFRenderer;
69+
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
6570

6671
import static org.junit.jupiter.api.Assertions.assertEquals;
6772
import static org.junit.jupiter.api.Assertions.assertNotEquals;
@@ -648,6 +653,9 @@ private void checkStructTreeRootCount(File file) throws IOException
648653
/**
649654
* PDFBOX-4408: Check that /StructParents values from pages and /StructParent values from
650655
* annotations are found in the /ParentTree.
656+
* <p>
657+
* Expanded in 2025 to check that all MCIDs of a page content stream have an entry in the
658+
* ParentTree.
651659
*
652660
* @param document the document whose /ParentTree entries are checked
653661
*/
@@ -673,11 +681,71 @@ void checkWithNumberTree(PDDocument document) throws IOException
673681
}
674682
}
675683
}
676-
for (PDPage page : document.getPages())
684+
PDPageTree pageTree = document.getPages();
685+
for (PDPage page : pageTree)
677686
{
687+
int pageNum = pageTree.indexOf(page) + 1;
678688
if (page.getStructParents() >= 0)
679689
{
680-
assertTrue(keySet.contains(page.getStructParents()));
690+
assertTrue(keySet.contains(page.getStructParents()), "/StructParents " + page.getStructParents() + " from page " +
691+
pageNum + " not found in /ParentTree");
692+
PDParentTreeValue obj = (PDParentTreeValue) numberTreeAsMap.get(page.getStructParents());
693+
assertTrue(obj.getCOSObject() instanceof COSArray, "Expected array in page " + pageNum + ", got " + obj.getClass());
694+
COSArray array = (COSArray) obj.getCOSObject();
695+
696+
PDFMarkedContentExtractor markedContentExtractor = new PDFMarkedContentExtractor();
697+
markedContentExtractor.processPage(page);
698+
List<PDMarkedContent> markedContents = markedContentExtractor.getMarkedContents();
699+
TreeSet<Integer> set = new TreeSet<>();
700+
for (PDMarkedContent pdMarkedContent : markedContents)
701+
{
702+
COSDictionary pdmcProperties = pdMarkedContent.getProperties();
703+
if (pdmcProperties == null)
704+
{
705+
continue;
706+
}
707+
int mcid = pdMarkedContent.getMCID();
708+
if (mcid >= 0)
709+
{
710+
// "For a page object (...), the value shall be an array of references
711+
// to the parent elements of those marked-content sequences."
712+
// this means that the /Pg entry doesn't have to match the page
713+
COSDictionary dict = (COSDictionary) array.getObject(mcid);
714+
assertNotNull(dict);
715+
set.add(mcid);
716+
PDStructureElement structureElemen = (PDStructureElement) PDStructureNode.create(dict);
717+
List<Object> kids = structureElemen.getKids();
718+
boolean found = false;
719+
for (Object kid : kids)
720+
{
721+
if (kid instanceof Integer && ((Integer) kid) == mcid)
722+
{
723+
found = true;
724+
break;
725+
}
726+
if (kid instanceof PDMarkedContentReference)
727+
{
728+
PDMarkedContentReference mcr = (PDMarkedContentReference) kid;
729+
if (mcid == mcr.getMCID())
730+
{
731+
found = true;
732+
if (mcr.getPage() != null)
733+
{
734+
assertEquals(page, mcr.getPage());
735+
}
736+
else
737+
{
738+
assertEquals(page, structureElemen.getPage());
739+
}
740+
break;
741+
}
742+
}
743+
}
744+
assertTrue(found, "page: " + pageNum + ", mcid: " + mcid + " not found");
745+
}
746+
}
747+
// actual count may be larger if last element is null, e.g. PDFBOX-4408
748+
assertTrue(set.last() <= array.size() - 1);
681749
}
682750
for (PDAnnotation ann : page.getAnnotations())
683751
{

0 commit comments

Comments
 (0)