From f53ab295e2db64881160cee8279a896b07d230dd Mon Sep 17 00:00:00 2001 From: Edoardo Cavazza Date: Thu, 2 Oct 2025 10:53:19 +0200 Subject: [PATCH 1/3] Collect all child nodes of lists and tables --- src/core/struct_tree.js | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/core/struct_tree.js b/src/core/struct_tree.js index f8eafbc17b481..e31b06a17e6fe 100644 --- a/src/core/struct_tree.js +++ b/src/core/struct_tree.js @@ -754,6 +754,23 @@ class StructTreePage { const element = new StructElementNode(this, dict); map.set(dict, element); + switch (element.role) { + case "L": + case "LBody": + case "LI": + case "Table": + case "THead": + case "TBody": + case "TFoot": + case "TR": { + // Always collect all child nodes of lists and tables, even empty ones + for (const kid of element.kids) { + if (kid.type === StructElementType.ELEMENT) { + this.addNode(kid.dict, map, level - 1); + } + } + } + } const parent = dict.get("P"); From 38cc9ae38cbae7a227265f49eba962f1ee999f04 Mon Sep 17 00:00:00 2001 From: Edoardo Cavazza Date: Mon, 6 Oct 2025 09:57:02 +0200 Subject: [PATCH 2/3] Add test case for empty cells --- test/pdfs/.gitignore | 1 + test/pdfs/issue20324.pdf | Bin 0 -> 11151 bytes test/test_manifest.json | 8 +++++++- test/unit/api_spec.js | 35 +++++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 test/pdfs/issue20324.pdf diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6b1ede9aefd21..0a79b24a4c4cd 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -620,6 +620,7 @@ !autoprint.pdf !bug1811694.pdf !bug1811510.pdf +!issue20324.pdf !bug1815476.pdf !issue16021.pdf !bug1770750.pdf diff --git a/test/pdfs/issue20324.pdf b/test/pdfs/issue20324.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c5f2a1d4ffc8d94defd614bbfce93627b0287046 GIT binary patch literal 11151 zcmeG?c|26@_fjZJQQA{CDNAPV>{~;PT?|4+h%s}ADYG;)wss_yR)t8G(xz2Op{OLS z)LW5~3U84%ul7{G=g!zd@B99~pWi>f&*yvR!@c);&NSkky!DC4} zQ|lWl@9Joh=nxJP2QJcKGGS+d2;oQtOA#J~hn*oj9#6)@t`PWV3sHzP0_+0XJs~^} zN2b6IkPk#AQXo8;;%8~8gNS%26{w#}Cqjzw6(ivZI-0nS3r~R*PmrjqjYg*RZo}j7 z#NKTL0uC^ZUfs9^8om!Mk$~%iOC;jSy>W>oLLXkp6biLB9GOn)xhaYooef9*R@_c6qKC~J+OwTWY|s+B1fdKT?j{x*dSan z4}l#aIu&+CL_zXkh)RS#AR^7uQYM!o9HEY8a9vTQjvS;sFD>oj#MZE615hGgbGD6h=|UaC*aD(QV0)b+AsGa9unb0hDap8&Pfn( z$~7U;Vb4ejqL3Cm-vtr-o&X6)q+r&3M2d*G2r3O5ffV7&1!58GOmMdZSEp5y@!gBdlFF^b5k5F9 zU>E$6>0~T{N`Yu3JeEfHgRMdkg!gv<)?$$y5y@o`S;?xkR4n;Bcs5uU&ffumwFLA! zChm^N#Gz8>aAu=#T2uu;S1KfaaPrN_qSI7}Gx=F=c z4@B+*gS`&+L?Yy1ErHRYq#&^33fU@0m4SCpsX+L5{3*naE)MJ_K)6W9QaXxk5SbuI zgo<6v4HbeZINC7nZ0+oDI4TW?qu78u*aQf+;F$#e=*l~hg~JhnZBob`?*NDh`oLzT zxKltM0pOzV3K$d?JS*BzIuy{?qYZ^8qHXAypkD!p(nH5XX`pn`v4~0-1zi*#@TTY^ zTEfm8nVc==3HSnp*D(bWAf;dxMF*e?_5c}lXvI6))`sc-CBx;A@5x|{D0sk;W8VO4$k%3av6{d(G63JMwnM3#vP}EM*X9Asy1%4P{ zP(TwE?+4?3faB}z1@{huI5L(*Bz6w)f=R8t81@9 zx0Mb@)Q=D9a6e=cu6uuWSf97!hUgJa*Y;EU25)z*ITMQ2#;Ddmctdp1njUXxqFL$? zlb3(T5c{waZcfg2*nBYdS*-a6*{~N26DFMfIe9r$vx^>A9`x6vlGo+g1Io$UvEe6t zx8u%EGugIe`?sId3C0=Y@204?Y!%$gH+u>hZJ&4g`QF@x*Z$nCTOl`Y^dzYmNpEb- z6X`aA$LuxS=x*;=*P`|z!;;|elvjX(A6k9$d3mso-@*4MimHOSHH)#lo^ zX4>I$^U=H6t)&P516Ve9%P5N85I?* zq_#eZ4`ddo6R_cltdBuwGjO%8-s_)Sa`>`4^TDc-nGp|;76e>rnsA+*B|b2J#D(m3 z*8Zs}brGj!v?I^VZ1W-{NA{TodfFbyX^c3Lx6h30i5$pDsDB#iG`H+vPU@oTW0T#m zoS*GlFq>leWJ375WegYhTTyk>4$aEk4m+BCKm6(Y;xEOc4=*#{F!`EELUS%=ZT23A z{M_)EP3&Q%1-7e>FH+et9RJk;)DtRzxn;S9>+c^wl2i zcg416E5c4M%6!}W(sbLbj1f9=E!XuKR%h0?y0=_aX?vrxBHovi^~J<6xuw@rOjyvS-h`Qu?W#r5VY5KRmYA(Zyx zr%^Z2pRRMv=Q(%wKKsg#?pzZ`j(c-xx0UA_6 zXv_;>6<$Wcio!}{J_G_vB-*ckzk{dHfzza+I8-9oX(&o;Dg^|pJ$_VD ze`8XJsN?PLM}l*#{K zJTy#$|AXOBnf~JjMgZSUzv_f_&kb$4r@(WB_o!orMlkWNvGa(A;8+)1jV(*v1|-@t z8PQsX(A7Pw2j3bFnVUMOMv1(g2e%A1(w(Iryhc}7zihvrFnP|MiM363K5D|-OI~fe zlb*k&0<)js6x%$hZL!Plw%uRftax(%Nzw6z&vu?SSljTq{&n%nWn3%2=sz{-`v%W- zKU~T?RFE=cwvYSl_$5oXS-gH9weCxFE~9nak8P!~0*!}1N?FD~wCryVEnoHZ(>$}H zS*EUR52we}5Qz_B{VCFS#+H^BcNR}PKi{#su%O^p!^22l_yc?YaO977s)-Ad_9dvB zB$yzSea>z-5~>|m5Q+QR_RV$Q=RDQpc}?`s(`Jc>vk3(Z_Wm8yPF(Y(AD(|bVc#Oe z>ZJK}R?6gkAt^tzOn)9o^gOop@EkE%CJYiqUj=*`)%v z#bq*05$EaemL$pq;iPG0?}PT`-%?+n_w=?;bObBz$?c_Mp^uBGP32~S7H ze`RdRcr|q{Q7lBw8&PkAnQxi)aB$t zw`&H+j$j_yd)WprPN_KRnXqZl{ZE3&eiMpQp5GQhBh`?RiS`>(bSIj}Cf2@vl6Jl| zE#gdTRiQ;Lz1cefmoZc){5JD_+_HL&QvAY+tntw+t~FUa<2=(_PX6w6PhLT)WnLbw z@4x?>h;b>gd0^$52BQ3s15H-g^6K{7*2Juh0scRE^VI~Nl9tV3;}~U^T3h@NoISR# zt_Wj(N6L)5H#T%G<9vSJ!Gk#mPoB+hbkHj~`=zZ`uXWHG3zf=mb5{9=Z)9#&?g?voU2K@6RmH};1b&$r z-!voo`B9Th-;#6#Rg!$lByQwmzd%(X6As)H8NOP!>|>SI4^Fzu^BfHHEqavTU6_}; zhwg=xHm_h-k+!YQ*h~V|mt;*E_thkRiuum9pWl|Q z)UEhqc?3MXxVVZyr#Wp5Uh6SqszFLC+voDe^~P!0w|5lJd&~Apo&uk*vxc)E4R)R2 zq_3g)eMYf7S-0Yc@b0?}qlTO+b{sV4U`c-NMF_u>v8d?srRq~bR_*79UM7a|LJEV| z7^kUStX@)4^1s~}puzE5%M8s= zs*wziDjnvx_esJQJNtzfG%g>``-UHFM2()mYi!oeT-hj{DpKmk;d`(7QjLmdEI2~5 zcPUHvZgIx7Sd1|pQJ-A*(amu6o<&-r!`0QZn(ws_=0nHqciK~94jeQw_5ZPcZN_$| z=9R{qUB0dGu=Kut3P88gGV8eB5Gqgu{Ghs%e2edR$_B#A(}>im>r(BK3F2jdRH>A-r!>dr$fJ z6xNQq7X2vgTG;=w8ZsZ#cQA8 zoYJ06863Oy%qq2I?ke+hFz=nT>Qra!=dT`!O{3q;kDU;vRXA=pBlV#{0b%OX1~E5% zn)9EZ=2KSOy>JG{(0qK4nw0jTq>vv_eJJ2Z?vVvY*9MH5;(B!D(OchFs}+osUlFLL zZ7tZJTVr1E?!flFFWwAWc|UmE@fTVPM?HyIS9$RnY30)4f$2+v<31Ti=DfOV)cTQK zec{lCHU9T6#qH;k+p8WHehsT{t+IjZl>9 z>BtJ#n3)-ktHZL`dx*7@o5CFyPH4YoX)%55bfV|y5mn~;ncuywMxVf*7*8HoerCHa zdB~kDv98ZuPSxXLj(x-q*w)0pd7tg+vV@av6X~|?_|-rCtCrUm`KM)>eIdo^uS`kI z@Yy>5>Z<5^@*LY%#z%XLoTH{%Mmvr2oDL5Q*flF>Y}tuN zns9Sem4o*QA-iy_?U=l^c-4k^@3d8J;=954OGYOj1X z2G&`Z#}^5*zHW6t;w+A|if(>;@S4}%EOP5u?DEUgndhdjZV6XGrvDg7SGn?YOO)E} zZ)en*$GF#b?Ao#;ZQ{`7sh=HpC8d0tx-%v9c#`_J#59eGnRaph+C$>CwGL}O)cP#f z|5QIxRa@oqY=aB^1juL**cmU$xM*^QPNT!t9Ek%W2nv=%6p#@{<64koF@a?g4i|wX zh*Tiv!Dv(_4;RDoV34|k`5=`8vtTRO2DXJAU{}~1MqmLP0t;aoEH{PGh#94he$S$b z#vqW$BPn8OD>Na4A<)sUW00A|Q)ncV2R}up9Y-h#i8N8h>JSLlO_j|c%?iGG(df%v z(cr=nBEJvp(O4U#f)F`3xHIrXo8~L02+|+0BgnJ~xGYgnh%#-22yr8w=iZ@=po`ogn^hTa|J9FUD^4Ae+^ zJ;2?kSv8)U_TErDv&PKe=GLZDA8yQ9Id$%eBf^>1f4#AP{8Zadys2DpE^?FP;$pFF zQcUxQ`E z^Sn(PQ!#Cncj=LnKHqfKIsAGuK`l$OYBbCLU8G*+^yg;pwtaCp*H9Ohb;e*-vK8*x zk24pCrjBEttD?TW_MG;xNi4}f^Lc0Scmub&5tomv3{-iEO^NBN%02R){p1o9pK##w zjnbJ9rcx=O#z7(}*Vr|15lEePN9DnFJ3Mc=jA!Y#Ct6O^IM**tTq_?8}7??pEA|0*}#-FVEAnJ6VW^N=<3cIshP== zuY(!}np#!_@0_I_kVqTt8B5#|Gbw6tF@K6(!aPZ|5iME$3i-Xc4e9-n``KAR7Jp?1 z&BlW-|fskrh5A75-I5OZR-9t9@8_ilw@yEXX(K=ZMDjgVMZCw-o(J ztj%(OMtpYq>HjWnWyvZ0Qy)%7=_2~ic`&?kOUdYAydrT!{r(hT-Gyf9TdMf2$Hnb~ z-r?unIp!Y}pUPeGBu{)_)!@)`;pwGgkDpPQ_%vH^rtEAwi?^X>KyvwUEr$Atm`xYU zo^nmNO{v!!s$&oC*|TATS&m0!+Kk#RePODl>%C|r zwKaZWmRjEW!o0x@x2(ihW|S5soMZUbSqjhPBnbvNRk$r1TiWtGvgoirtcn%@8+bncK+7Se68De4Y`pO8|zAWDfrWsH_E0L#eW|epmLsX2K6lhEbSr`V(2w%JfC&T)fWXKiMRHCAMkJe}B+^Nz)7gOZ6ok^{1d2oD7KVoCKwVeI#DtQr zfkZN_Vv<;{SO`aOBruv;`3(}A{r6}>VZWF%xosHWq+@3N4k%Oj%+A`5Z5VL+Q~F~< zCI{up{|zYs-Y+2@w~LA-R2rfXG>;1-A&3x+Edx@(_d6F!k2U&bCZGc#5#g>y0WxO- zePURH>L*C~d5Xm$mP)PZtPBAK!IB{pHis(^$;Gl@QzqOSp?mSVX(6CofLNl+g~8)6 zIEp8Z&cM+aMCx=LP%&H&B1*_!G&;@Y-%$Alwl^KL9u)bnsB}aAN`uX2IEp~Y8B`}7 zZ7f4W1w00wOQ%zKWC{ks({PwTJc*B?(*kiA8iz~+2~QH2kH;%!_$zK7x_t{qmO!>x z9vA|T@ThbGor57DoInhTj}OFfC^R01P7e$ub7=%VE|AzcL_cKu4AoteQVan`Kxjk? ziOS^z{sCPQpU=n8cmyJb1kMZu2I4t91nHVY59~ga-9dR(g^BF+V zI=SoB-v{3TRAj``NX1^$wFM|P9S>y5udS%IqPs?jK(Tp9I8qsc+N6bHr&0D)>~3NT z+h%YSMP*B_SQLgx<)B6g|I*&gSZ_oy4v4|QF#?$-Un&+t9Ek)()QX}K9Dxylf;?J6 zh2-L2dK4m1j?ja^??;h|`SNg3L}mp6+x}-GR8syG>G#CB!Jw3n@GRlZNq08<9wj^& z5&fM3D;ZEg^(MlEm8M2`)UDQq`>8k|jRszh&27Hl)Av`-&?kLfJUUQ~m`(+5a zZ5FjD!N0btKIp&9u7f(St62WOv{)te9_am17xyV7|3{}f6vqv-DHK@IdUz(+7Il#56^v>%*%fSd(^ zQjRnd!h?b{6aEFL7c!{!N+dwV?a(@m6120a15&AgJwgx#zsgn+;DF!??uxF!OILgb zYI1{1}39JxFcg(6h$ObBI^sX~&;M@bqhmb*04IFU;FW^Ft z%5|UrQnpdi!kFT*`^*{@l*M|t(Pgr!T>IfR6K*{{yBH5()qS literal 0 HcmV?d00001 diff --git a/test/test_manifest.json b/test/test_manifest.json index 6a810a98ceaee..7c350b69b9ea8 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1676,6 +1676,12 @@ "lastPage": 1, "type": "eq" }, + { "id": "issue20324", + "file": "pdfs/issue20324.pdf", + "md5": "13250232aa91444f983279581d9c02d6", + "rounds": 1, + "type": "eq" + }, { "id": "issue13561_reduced", "file": "pdfs/issue13561_reduced.pdf", @@ -13026,6 +13032,6 @@ "file": "pdfs/issue20232.pdf", "md5": "cc53e96a8fd9eafbfbb74de564f37047", "rounds": 1, - "type": "eq" + "type": "other" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 648fe309e7952..2ef29bec46878 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5043,6 +5043,41 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) canvasFactory.destroy(canvasAndCtx); await loadingTask.destroy(); }); + + it("should collect all list and table items in StructTree", async function() { + const findNodes = (node, check) => { + const results = []; + if (check(node)) { + results.push(node); + } + if (node.children) { + for (const child of node.children) { + results.push(...findNodes(child, check)); + } + } + return results; + }; + const loadingTask = getDocument(buildGetDocumentParams("issue20324.pdf")); + + const pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const tree = await page.getStructTree({ + includeMarkedContent: true, + }); + const cells = findNodes( + tree, + node => node.role === "TD" + ); + expect(cells.length).toEqual(4); + + const listItems = findNodes( + tree, + node => node.role === "LI" + ); + expect(listItems.length).toEqual(4); + + await loadingTask.destroy(); + }); }); describe("Multiple `getDocument` instances", function () { From 2146cb26f11939ddabf6f9fc648d8d9cd3001a30 Mon Sep 17 00:00:00 2001 From: Edoardo Cavazza Date: Mon, 20 Oct 2025 09:47:21 +0200 Subject: [PATCH 3/3] Move tables test to specific struct tree spec file --- test/test_manifest.json | 8 +------- test/unit/api_spec.js | 35 ----------------------------------- test/unit/struct_tree_spec.js | 29 +++++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 42 deletions(-) diff --git a/test/test_manifest.json b/test/test_manifest.json index 7c350b69b9ea8..6a810a98ceaee 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1676,12 +1676,6 @@ "lastPage": 1, "type": "eq" }, - { "id": "issue20324", - "file": "pdfs/issue20324.pdf", - "md5": "13250232aa91444f983279581d9c02d6", - "rounds": 1, - "type": "eq" - }, { "id": "issue13561_reduced", "file": "pdfs/issue13561_reduced.pdf", @@ -13032,6 +13026,6 @@ "file": "pdfs/issue20232.pdf", "md5": "cc53e96a8fd9eafbfbb74de564f37047", "rounds": 1, - "type": "other" + "type": "eq" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 2ef29bec46878..648fe309e7952 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -5043,41 +5043,6 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) canvasFactory.destroy(canvasAndCtx); await loadingTask.destroy(); }); - - it("should collect all list and table items in StructTree", async function() { - const findNodes = (node, check) => { - const results = []; - if (check(node)) { - results.push(node); - } - if (node.children) { - for (const child of node.children) { - results.push(...findNodes(child, check)); - } - } - return results; - }; - const loadingTask = getDocument(buildGetDocumentParams("issue20324.pdf")); - - const pdfDoc = await loadingTask.promise; - const page = await pdfDoc.getPage(1); - const tree = await page.getStructTree({ - includeMarkedContent: true, - }); - const cells = findNodes( - tree, - node => node.role === "TD" - ); - expect(cells.length).toEqual(4); - - const listItems = findNodes( - tree, - node => node.role === "LI" - ); - expect(listItems.length).toEqual(4); - - await loadingTask.destroy(); - }); }); describe("Multiple `getDocument` instances", function () { diff --git a/test/unit/struct_tree_spec.js b/test/unit/struct_tree_spec.js index 0551565597c79..8c9c1c2343bab 100644 --- a/test/unit/struct_tree_spec.js +++ b/test/unit/struct_tree_spec.js @@ -151,4 +151,33 @@ describe("struct tree", function () { ); await loadingTask.destroy(); }); + + it("should collect all list and table items in StructTree", async function () { + const findNodes = (node, check) => { + const results = []; + if (check(node)) { + results.push(node); + } + if (node.children) { + for (const child of node.children) { + results.push(...findNodes(child, check)); + } + } + return results; + }; + const loadingTask = getDocument(buildGetDocumentParams("issue20324.pdf")); + + const pdfDoc = await loadingTask.promise; + const page = await pdfDoc.getPage(1); + const tree = await page.getStructTree({ + includeMarkedContent: true, + }); + const cells = findNodes(tree, node => node.role === "TD"); + expect(cells.length).toEqual(4); + + const listItems = findNodes(tree, node => node.role === "LI"); + expect(listItems.length).toEqual(4); + + await loadingTask.destroy(); + }); });