Skip to content

Commit 520363b

Browse files
authored
Merge pull request #20384 from calixteman/bug1937438
Make MathML elements visible in the struct tree (bug 1937438)
2 parents f6317dd + e5a62c8 commit 520363b

File tree

9 files changed

+395
-1
lines changed

9 files changed

+395
-1
lines changed

src/core/struct_tree.js

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js";
1717
import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js";
1818
import { lookupNormalRect, stringToAsciiOrUTF16BE } from "./core_utils.js";
19+
import { BaseStream } from "./base_stream.js";
1920
import { NumberTree } from "./name_number_tree.js";
2021

2122
const MAX_DEPTH = 40;
@@ -579,6 +580,50 @@ class StructElementNode {
579580
return root.roleMap.get(name) ?? name;
580581
}
581582

583+
get mathML() {
584+
let AFs = this.dict.get("AF") || [];
585+
if (!Array.isArray(AFs)) {
586+
AFs = [AFs];
587+
}
588+
for (let af of AFs) {
589+
af = this.xref.fetchIfRef(af);
590+
if (!(af instanceof Dict)) {
591+
continue;
592+
}
593+
if (!isName(af.get("Type"), "Filespec")) {
594+
continue;
595+
}
596+
if (!isName(af.get("AFRelationship"), "Supplement")) {
597+
continue;
598+
}
599+
const ef = af.get("EF");
600+
if (!(ef instanceof Dict)) {
601+
continue;
602+
}
603+
const fileStream = ef.get("UF") || ef.get("F");
604+
if (!(fileStream instanceof BaseStream)) {
605+
continue;
606+
}
607+
if (!isName(fileStream.dict.get("Type"), "EmbeddedFile")) {
608+
continue;
609+
}
610+
if (!isName(fileStream.dict.get("Subtype"), "application/mathml+xml")) {
611+
continue;
612+
}
613+
return fileStream.getString();
614+
}
615+
const A = this.dict.get("A");
616+
if (A instanceof Dict) {
617+
// This stuff isn't in the spec, but MS Office seems to use it.
618+
const O = A.get("O");
619+
if (isName(O, "MSFT_Office")) {
620+
const mathml = A.get("MSFT_MathML");
621+
return mathml ? stringToPDFString(mathml) : null;
622+
}
623+
}
624+
return null;
625+
}
626+
582627
parseKids() {
583628
let pageObjId = null;
584629
const objRef = this.dict.getRaw("Pg");
@@ -842,6 +887,12 @@ class StructTreePage {
842887
if (typeof alt === "string") {
843888
obj.alt = stringToPDFString(alt);
844889
}
890+
if (obj.role === "Formula") {
891+
const { mathML } = node;
892+
if (mathML) {
893+
obj.mathML = mathML;
894+
}
895+
}
845896

846897
const a = node.dict.get("A");
847898
if (a instanceof Dict) {

src/shared/util.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,15 @@ class FeatureTest {
658658
);
659659
}
660660

661+
static get isSanitizerSupported() {
662+
return shadow(
663+
this,
664+
"isSanitizerSupported",
665+
// eslint-disable-next-line no-undef
666+
typeof Sanitizer !== "undefined"
667+
);
668+
}
669+
661670
static get platform() {
662671
const { platform, userAgent } = navigator;
663672

test/integration/accessibility_spec.mjs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,4 +305,72 @@ describe("accessibility", () => {
305305
);
306306
});
307307
});
308+
309+
describe("MathML in AF entry from LaTeX", () => {
  let pages;

  beforeEach(async () => {
    pages = await loadAndWait("bug1937438_af_from_latex.pdf", ".textLayer");
  });

  afterEach(async () => {
    await closePages(pages);
  });

  it("must check that the MathML is correctly inserted", async () => {
    // Per-browser check; the MathML is only looked up when the page
    // reports that the Sanitizer API is available.
    const checkPage = async ([browserName, page]) => {
      const isSanitizerSupported = await page.evaluate(() => {
        try {
          // eslint-disable-next-line no-undef
          return typeof Sanitizer !== "undefined";
        } catch {
          return false;
        }
      });
      if (!isSanitizerSupported) {
        pending(`Sanitizer API (in ${browserName}) is not supported`);
        return;
      }

      const selector = "span.structTree span[aria-owns='p58R_mc13'] > math";
      const expectedMathML =
        " <msqrt><msup><mi>x</mi><mn>2</mn></msup></msqrt> <mo>=</mo> <mrow><mo>|</mo><mi>x</mi><mo>|</mo></mrow> ";
      const mathML = await page.$eval(selector, el => el?.innerHTML ?? "");
      expect(mathML).withContext(`In ${browserName}`).toEqual(expectedMathML);
    };

    await Promise.all(pages.map(entry => checkPage(entry)));
  });
});
348+
349+
describe("MathML tags in the struct tree", () => {
  let pages;

  beforeEach(async () => {
    pages = await loadAndWait("bug1937438_mml_from_latex.pdf", ".textLayer");
  });

  afterEach(async () => {
    await closePages(pages);
  });

  it("must check that the MathML is correctly inserted", async () => {
    // The <math> element rendered for the last nested group of the
    // structure tree, and the markup it is expected to contain.
    const selector =
      "span.structTree span[role='group'] span[role='group']:last-child > span math";
    const expectedMathML = `<mi aria-owns="p76R_mc16"></mi><mo aria-owns="p76R_mc17"></mo><msqrt><mrow><msup><mi aria-owns="p76R_mc18"></mi><mn aria-owns="p76R_mc19"></mn></msup><mo aria-owns="p76R_mc20"></mo><msup><mi aria-owns="p76R_mc21"></mi><mn aria-owns="p76R_mc22"></mn></msup></mrow></msqrt>`;

    const checkPage = async ([browserName, page]) => {
      const mathML = await page.$eval(selector, el => el?.innerHTML ?? "");
      expect(mathML).withContext(`In ${browserName}`).toEqual(expectedMathML);
    };

    await Promise.all(pages.map(entry => checkPage(entry)));
  });
});
308376
});

test/pdfs/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,3 +749,6 @@
749749
!issue20319_1.pdf
750750
!issue20319_2.pdf
751751
!bug1992868.pdf
752+
!bug1937438_af_from_latex.pdf
753+
!bug1937438_from_word.pdf
754+
!bug1937438_mml_from_latex.pdf
39 KB
Binary file not shown.

test/pdfs/bug1937438_from_word.pdf

45.2 KB
Binary file not shown.
42.6 KB
Binary file not shown.

test/unit/struct_tree_spec.js

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ function equalTrees(rootA, rootB) {
2121
expect(a.role).toEqual(b.role);
2222
expect(a.lang).toEqual(b.lang);
2323
expect(a.type).toEqual(b.type);
24+
expect(a.mathML).toEqual(b.mathML);
2425
expect("children" in a).toEqual("children" in b);
2526
if (!a.children) {
2627
return;
@@ -151,4 +152,154 @@ describe("struct tree", function () {
151152
);
152153
await loadingTask.destroy();
153154
});
155+
156+
it("parses structure with some MathML in AF dictionary", async function () {
  // Builders for the expected tree: a marked-content leaf, a structure
  // element (role + children + optional extra properties), and a Formula
  // element wrapping a single leaf and carrying its MathML source.
  const leaf = n => ({ type: "content", id: `p58R_mc${n}` });
  const elem = (role, children, extra = {}) => ({ role, children, ...extra });
  const formula = (n, mathML) => elem("Formula", [leaf(n)], { mathML });

  const expected = {
    children: [
      elem("Document", [
        elem("Part", [
          elem("P", [elem("P", [leaf(0)])]),
          elem("P", [leaf(1)]),
          elem("P", [leaf(2)]),
        ]),
        elem("Sect", [
          elem("H1", [elem("Lbl", [leaf(3)]), leaf(4)]),
          elem("Part", [
            elem("P", [
              leaf(5),
              formula(6, "<math> <mi>x</mi> </math>"),
              leaf(7),
              formula(8, "<math> <mi>y</mi> </math>"),
              leaf(9),
              formula(
                10,
                "<math> <mi>x</mi> <mo>&gt;</mo> <mi>y</mi> </math>"
              ),
              leaf(11),
            ]),
          ]),
          elem("Part", [
            elem("P", [leaf(12)]),
            formula(
              13,
              '<math> <msqrt><msup><mi>x</mi><mn>2</mn></msup></msqrt> <mo>=</mo> <mrow intent="absolute-value($x)"><mo>|</mo><mi arg="x">x</mi><mo>|</mo></mrow> </math>'
            ),
          ]),
        ]),
      ]),
    ],
    role: "Root",
  };

  const loadingTask = getDocument(
    buildGetDocumentParams("bug1937438_af_from_latex.pdf")
  );
  const doc = await loadingTask.promise;
  const page = await doc.getPage(1);
  const struct = await page.getStructTree();

  equalTrees(expected, struct);
  await loadingTask.destroy();
});
260+
261+
it("parses structure with some MathML in MS Office specific entry", async function () {
  // Builders for the expected tree: a marked-content leaf, a structure
  // element (role + children + optional extra properties), and a Formula
  // element wrapping a single leaf with its alternate text and MathML.
  const leaf = n => ({ type: "content", id: `p3R_mc${n}` });
  const elem = (role, children, extra = {}) => ({ role, children, ...extra });
  const formula = (n, alt, mathML) =>
    elem("Formula", [leaf(n)], { alt, mathML });

  const expected = {
    children: [
      elem("Document", [
        elem("P", [
          leaf(0),
          formula(1, "pi", '<math display="block"><mi>&#x1D70B;</mi></math>'),
          leaf(2),
        ]),
        formula(
          3,
          "6 sum from n equals 1 to infinity of 1 over n squared , equals pi squared",
          '<math display="block"><mn>6</mn><mrow><munderover><mo stretchy="false">&#x2211;</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mo>&#x221E;</mo></munderover><mfrac><mn>1</mn><msup><mrow><mi>n</mi></mrow><mn>2</mn></msup></mfrac></mrow><mo>=</mo><msup><mrow><mi>&#x1D70B;</mi></mrow><mn>2</mn></msup></math>'
        ),
        elem("P", [leaf(4)]),
        elem("P", [leaf(5)]),
      ]),
    ],
    role: "Root",
  };

  const loadingTask = getDocument(
    buildGetDocumentParams("bug1937438_from_word.pdf")
  );
  const doc = await loadingTask.promise;
  const page = await doc.getPage(1);
  const struct = await page.getStructTree();

  equalTrees(expected, struct);
  await loadingTask.destroy();
});
154305
});

0 commit comments

Comments
 (0)