2727import java .util .List ;
2828import java .util .Map ;
2929import java .util .Set ;
30+ import java .util .TreeSet ;
3031
3132import org .apache .pdfbox .Loader ;
3233import org .apache .pdfbox .cos .COSArray ;
4445import org .apache .pdfbox .pdmodel .common .COSObjectable ;
4546import org .apache .pdfbox .pdmodel .common .PDNameTreeNode ;
4647import org .apache .pdfbox .pdmodel .common .PDNumberTreeNode ;
48+ import org .apache .pdfbox .pdmodel .documentinterchange .logicalstructure .PDMarkedContentReference ;
49+ import org .apache .pdfbox .pdmodel .documentinterchange .logicalstructure .PDParentTreeValue ;
4750import org .apache .pdfbox .pdmodel .documentinterchange .logicalstructure .PDStructureElement ;
4851import org .apache .pdfbox .pdmodel .documentinterchange .logicalstructure .PDStructureNode ;
4952import org .apache .pdfbox .pdmodel .documentinterchange .logicalstructure .PDStructureTreeRoot ;
53+ import org .apache .pdfbox .pdmodel .documentinterchange .markedcontent .PDMarkedContent ;
5054import org .apache .pdfbox .pdmodel .graphics .image .PDImageXObject ;
5155import org .apache .pdfbox .pdmodel .interactive .action .PDAction ;
5256import org .apache .pdfbox .pdmodel .interactive .action .PDActionGoTo ;
6266import org .apache .pdfbox .pdmodel .interactive .form .PDAcroForm ;
6367import org .apache .pdfbox .pdmodel .interactive .form .PDField ;
6468import org .apache .pdfbox .rendering .PDFRenderer ;
69+ import org .apache .pdfbox .text .PDFMarkedContentExtractor ;
6570
6671import static org .junit .jupiter .api .Assertions .assertEquals ;
6772import static org .junit .jupiter .api .Assertions .assertNotEquals ;
@@ -648,6 +653,9 @@ private void checkStructTreeRootCount(File file) throws IOException
648653 /**
649654 * PDFBOX-4408: Check that /StructParents values from pages and /StructParent values from
650655 * annotations are found in the /ParentTree.
656+ * <p>
657+ * Expanded in 2025 to check that all MCIDs of a page content stream have an entry in the
658+ * /ParentTree.
651659 *
652660 * @param document the document whose /ParentTree entries are checked
653661 */
@@ -673,11 +681,71 @@ void checkWithNumberTree(PDDocument document) throws IOException
673681 }
674682 }
675683 }
676- for (PDPage page : document .getPages ())
684+ PDPageTree pageTree = document .getPages ();
685+ for (PDPage page : pageTree )
677686 {
687+ int pageNum = pageTree .indexOf (page ) + 1 ;
678688 if (page .getStructParents () >= 0 )
679689 {
680- assertTrue (keySet .contains (page .getStructParents ()));
690+ assertTrue (keySet .contains (page .getStructParents ()), "/StructParents " + page .getStructParents () + " from page " +
691+ pageNum + " not found in /ParentTree" );
692+ PDParentTreeValue obj = (PDParentTreeValue ) numberTreeAsMap .get (page .getStructParents ());
693+ assertTrue (obj .getCOSObject () instanceof COSArray , "Expected array in page " + pageNum + ", got " + obj .getClass ());
694+ COSArray array = (COSArray ) obj .getCOSObject ();
695+
696+ PDFMarkedContentExtractor markedContentExtractor = new PDFMarkedContentExtractor ();
697+ markedContentExtractor .processPage (page );
698+ List <PDMarkedContent > markedContents = markedContentExtractor .getMarkedContents ();
699+ TreeSet <Integer > set = new TreeSet <>();
700+ for (PDMarkedContent pdMarkedContent : markedContents )
701+ {
702+ COSDictionary pdmcProperties = pdMarkedContent .getProperties ();
703+ if (pdmcProperties == null )
704+ {
705+ continue ;
706+ }
707+ int mcid = pdMarkedContent .getMCID ();
708+ if (mcid >= 0 )
709+ {
710+ // "For a page object (...), the value shall be an array of references
711+ // to the parent elements of those marked-content sequences."
712+ // this means that the /Pg entry doesn't have to match the page
713+ COSDictionary dict = (COSDictionary ) array .getObject (mcid );
714+ assertNotNull (dict );
715+ set .add (mcid );
716+ PDStructureElement structureElemen = (PDStructureElement ) PDStructureNode .create (dict );
717+ List <Object > kids = structureElemen .getKids ();
718+ boolean found = false ;
719+ for (Object kid : kids )
720+ {
721+ if (kid instanceof Integer && ((Integer ) kid ) == mcid )
722+ {
723+ found = true ;
724+ break ;
725+ }
726+ if (kid instanceof PDMarkedContentReference )
727+ {
728+ PDMarkedContentReference mcr = (PDMarkedContentReference ) kid ;
729+ if (mcid == mcr .getMCID ())
730+ {
731+ found = true ;
732+ if (mcr .getPage () != null )
733+ {
734+ assertEquals (page , mcr .getPage ());
735+ }
736+ else
737+ {
738+ assertEquals (page , structureElemen .getPage ());
739+ }
740+ break ;
741+ }
742+ }
743+ }
744+ assertTrue (found , "page: " + pageNum + ", mcid: " + mcid + " not found" );
745+ }
746+ }
747+ // the array size may be larger than the highest MCID + 1 if the last element is null, e.g. PDFBOX-4408
748+ assertTrue (set .last () <= array .size () - 1 );
681749 }
682750 for (PDAnnotation ann : page .getAnnotations ())
683751 {
0 commit comments