Skip to content

Commit e5a62c8

Browse files
committed
Make MathML elements visible in the struct tree (bug 1937438)
It'll help to make math equations "visible" for screen readers. MS Office has a specific way to add some MathML code to struct tree leaves, and this patch handles it.
1 parent 9f397a6 commit e5a62c8

File tree

9 files changed

+395
-1
lines changed

9 files changed

+395
-1
lines changed

src/core/struct_tree.js

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import { AnnotationPrefix, stringToPDFString, warn } from "../shared/util.js";
1717
import { Dict, isName, Name, Ref, RefSetCache } from "./primitives.js";
1818
import { lookupNormalRect, stringToAsciiOrUTF16BE } from "./core_utils.js";
19+
import { BaseStream } from "./base_stream.js";
1920
import { NumberTree } from "./name_number_tree.js";
2021

2122
const MAX_DEPTH = 40;
@@ -579,6 +580,50 @@ class StructElementNode {
579580
return root.roleMap.get(name) ?? name;
580581
}
581582

583+
/**
 * Look up MathML markup attached to this structure element.
 *
 * Two sources are tried, in order:
 *  1. The element's `AF` entry (a single value or an array). The first item
 *     that is a `Filespec` dictionary with `AFRelationship` = `Supplement`,
 *     and whose `EF` dictionary holds (under `UF`, else `F`) an
 *     `EmbeddedFile` stream with subtype `application/mathml+xml`, wins: the
 *     result of that stream's `getString()` is returned.
 *  2. The attribute dictionary `A`: when its `O` entry is the name
 *     `MSFT_Office`, the `MSFT_MathML` entry is decoded with
 *     `stringToPDFString`.
 *
 * @returns {string|null} The MathML source, or `null` when neither source
 *   provides any.
 */
get mathML() {
584+
let AFs = this.dict.get("AF") || [];
585+
// Wrap a lone entry so the loop below handles both the single-value and
// the array form.
if (!Array.isArray(AFs)) {
586+
AFs = [AFs];
587+
}
588+
for (let af of AFs) {
589+
// Array items may be indirect references; resolve before inspecting.
af = this.xref.fetchIfRef(af);
590+
if (!(af instanceof Dict)) {
591+
continue;
592+
}
593+
if (!isName(af.get("Type"), "Filespec")) {
594+
continue;
595+
}
596+
if (!isName(af.get("AFRelationship"), "Supplement")) {
597+
continue;
598+
}
599+
const ef = af.get("EF");
600+
if (!(ef instanceof Dict)) {
601+
continue;
602+
}
603+
// Prefer the `UF` entry and fall back to `F`.
const fileStream = ef.get("UF") || ef.get("F");
604+
if (!(fileStream instanceof BaseStream)) {
605+
continue;
606+
}
607+
if (!isName(fileStream.dict.get("Type"), "EmbeddedFile")) {
608+
continue;
609+
}
610+
if (!isName(fileStream.dict.get("Subtype"), "application/mathml+xml")) {
611+
continue;
612+
}
613+
// First matching attachment wins; later AF entries are not examined.
// NOTE(review): presumably `getString()` yields the raw stream bytes with
// no charset decoding - confirm non-ASCII (e.g. UTF-8) MathML survives.
return fileStream.getString();
614+
}
615+
const A = this.dict.get("A");
616+
if (A instanceof Dict) {
617+
// This stuff isn't in the spec, but MS Office seems to use it.
618+
const O = A.get("O");
619+
if (isName(O, "MSFT_Office")) {
620+
const mathml = A.get("MSFT_MathML");
621+
// NOTE(review): only truthiness is checked here, so any truthy
// non-string value would reach `stringToPDFString`; the `alt` handling
// in StructTreePage guards with `typeof alt === "string"` - confirm
// whether the same guard is wanted.
return mathml ? stringToPDFString(mathml) : null;
622+
}
623+
}
624+
return null;
625+
}
626+
582627
parseKids() {
583628
let pageObjId = null;
584629
const objRef = this.dict.getRaw("Pg");
@@ -842,6 +887,12 @@ class StructTreePage {
842887
if (typeof alt === "string") {
843888
obj.alt = stringToPDFString(alt);
844889
}
890+
if (obj.role === "Formula") {
891+
const { mathML } = node;
892+
if (mathML) {
893+
obj.mathML = mathML;
894+
}
895+
}
845896

846897
const a = node.dict.get("A");
847898
if (a instanceof Dict) {

src/shared/util.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,15 @@ class FeatureTest {
658658
);
659659
}
660660

661+
/**
 * Whether a global `Sanitizer` binding is defined in the current
 * environment. The `typeof` test is used so that an undeclared global does
 * not throw.
 *
 * The value is handed to `shadow(this, "isSanitizerSupported", ...)`;
 * presumably that caches the result on the class like the neighbouring
 * getters - confirm against `shadow`'s definition.
 *
 * @returns {boolean}
 */
static get isSanitizerSupported() {
662+
return shadow(
663+
this,
664+
"isSanitizerSupported",
665+
// eslint-disable-next-line no-undef
666+
typeof Sanitizer !== "undefined"
667+
);
668+
}
669+
661670
static get platform() {
662671
const { platform, userAgent } = navigator;
663672

test/integration/accessibility_spec.mjs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,4 +305,72 @@ describe("accessibility", () => {
305305
);
306306
});
307307
});
308+
309+
// Integration spec: after the text layer of bug1937438_af_from_latex.pdf is
// available, the <math> element directly under the struct-tree span that
// owns `p58R_mc13` must contain the expected MathML. The assertion only runs
// when the page exposes a global `Sanitizer`; otherwise the spec is marked
// pending.
describe("MathML in AF entry from LaTeX", () => {
310+
let pages;
311+
312+
beforeEach(async () => {
313+
pages = await loadAndWait("bug1937438_af_from_latex.pdf", ".textLayer");
314+
});
315+
316+
afterEach(async () => {
317+
await closePages(pages);
318+
});
319+
320+
it("must check that the MathML is correctly inserted", async () => {
321+
await Promise.all(
322+
pages.map(async ([browserName, page]) => {
323+
// Probe inside the page so the answer reflects the browser under test
// rather than the test runner.
const isSanitizerSupported = await page.evaluate(() => {
324+
try {
325+
// eslint-disable-next-line no-undef
326+
return typeof Sanitizer !== "undefined";
327+
} catch {
328+
return false;
329+
}
330+
});
331+
if (isSanitizerSupported) {
332+
const mathML = await page.$eval(
333+
"span.structTree span[aria-owns='p58R_mc13'] > math",
334+
el => el?.innerHTML ?? ""
335+
);
336+
// The expected markup has no `intent`/`arg` attributes, although the
// unit test's raw MathML for the same id does - presumably they are
// dropped during sanitization; confirm.
expect(mathML)
337+
.withContext(`In ${browserName}`)
338+
.toEqual(
339+
" <msqrt><msup><mi>x</mi><mn>2</mn></msup></msqrt> <mo>=</mo> <mrow><mo>|</mo><mi>x</mi><mo>|</mo></mrow> "
340+
);
341+
} else {
342+
// NOTE(review): this runs once per browser inside a shared
// Promise.all; presumably it marks the whole spec as pending even if
// another browser already ran its assertion - confirm that is
// intended.
pending(`Sanitizer API (in ${browserName}) is not supported`);
343+
}
344+
})
345+
);
346+
});
347+
});
348+
349+
// Integration spec for bug1937438_mml_from_latex.pdf: the <math> element
// found under the last nested `role='group'` span of the struct tree must
// consist of empty MathML elements whose leaves carry `aria-owns` attributes
// pointing at marked-content ids. This spec is not gated on Sanitizer
// support.
describe("MathML tags in the struct tree", () => {
350+
let pages;
351+
352+
beforeEach(async () => {
353+
pages = await loadAndWait("bug1937438_mml_from_latex.pdf", ".textLayer");
354+
});
355+
356+
afterEach(async () => {
357+
await closePages(pages);
358+
});
359+
360+
it("must check that the MathML is correctly inserted", async () => {
361+
await Promise.all(
362+
pages.map(async ([browserName, page]) => {
363+
const mathML = await page.$eval(
364+
"span.structTree span[role='group'] span[role='group']:last-child > span math",
365+
el => el?.innerHTML ?? ""
366+
);
367+
// Exact-string comparison: element order, nesting and every
// `aria-owns` id must match.
expect(mathML)
368+
.withContext(`In ${browserName}`)
369+
.toEqual(
370+
`<mi aria-owns="p76R_mc16"></mi><mo aria-owns="p76R_mc17"></mo><msqrt><mrow><msup><mi aria-owns="p76R_mc18"></mi><mn aria-owns="p76R_mc19"></mn></msup><mo aria-owns="p76R_mc20"></mo><msup><mi aria-owns="p76R_mc21"></mi><mn aria-owns="p76R_mc22"></mn></msup></mrow></msqrt>`
371+
);
372+
})
373+
);
374+
});
375+
});
308376
});

test/pdfs/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,3 +749,6 @@
749749
!issue20319_1.pdf
750750
!issue20319_2.pdf
751751
!bug1992868.pdf
752+
!bug1937438_af_from_latex.pdf
753+
!bug1937438_from_word.pdf
754+
!bug1937438_mml_from_latex.pdf
39 KB
Binary file not shown.

test/pdfs/bug1937438_from_word.pdf

45.2 KB
Binary file not shown.
42.6 KB
Binary file not shown.

test/unit/struct_tree_spec.js

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ function equalTrees(rootA, rootB) {
2121
expect(a.role).toEqual(b.role);
2222
expect(a.lang).toEqual(b.lang);
2323
expect(a.type).toEqual(b.type);
24+
expect(a.mathML).toEqual(b.mathML);
2425
expect("children" in a).toEqual("children" in b);
2526
if (!a.children) {
2627
return;
@@ -151,4 +152,154 @@ describe("struct tree", function () {
151152
);
152153
await loadingTask.destroy();
153154
});
155+
156+
// Loads bug1937438_af_from_latex.pdf, fetches the struct tree of page 1 and
// compares it with the literal tree below via `equalTrees`. Each `Formula`
// node is expected to expose its MathML source in a `mathML` property.
it("parses structure with some MathML in AF dictionary", async function () {
157+
const filename = "bug1937438_af_from_latex.pdf";
158+
const params = buildGetDocumentParams(filename);
159+
const loadingTask = getDocument(params);
160+
const doc = await loadingTask.promise;
161+
const page = await doc.getPage(1);
162+
const struct = await page.getStructTree();
163+
// First argument is the expected tree, second the parsed one.
equalTrees(
164+
{
165+
children: [
166+
{
167+
role: "Document",
168+
children: [
169+
{
170+
role: "Part",
171+
children: [
172+
{
173+
role: "P",
174+
children: [
175+
{
176+
role: "P",
177+
children: [{ type: "content", id: "p58R_mc0" }],
178+
},
179+
],
180+
},
181+
{
182+
role: "P",
183+
children: [{ type: "content", id: "p58R_mc1" }],
184+
},
185+
{
186+
role: "P",
187+
children: [{ type: "content", id: "p58R_mc2" }],
188+
},
189+
],
190+
},
191+
{
192+
role: "Sect",
193+
children: [
194+
{
195+
role: "H1",
196+
children: [
197+
{
198+
role: "Lbl",
199+
children: [{ type: "content", id: "p58R_mc3" }],
200+
},
201+
{ type: "content", id: "p58R_mc4" },
202+
],
203+
},
204+
{
205+
role: "Part",
206+
children: [
207+
{
208+
role: "P",
209+
children: [
210+
{ type: "content", id: "p58R_mc5" },
211+
{
212+
role: "Formula",
213+
children: [{ type: "content", id: "p58R_mc6" }],
214+
mathML: "<math> <mi>x</mi> </math>",
215+
},
216+
{ type: "content", id: "p58R_mc7" },
217+
{
218+
role: "Formula",
219+
children: [{ type: "content", id: "p58R_mc8" }],
220+
mathML: "<math> <mi>y</mi> </math>",
221+
},
222+
{ type: "content", id: "p58R_mc9" },
223+
{
224+
role: "Formula",
225+
children: [{ type: "content", id: "p58R_mc10" }],
226+
mathML:
227+
"<math> <mi>x</mi> <mo>&gt;</mo> <mi>y</mi> </math>",
228+
},
229+
{ type: "content", id: "p58R_mc11" },
230+
],
231+
},
232+
],
233+
},
234+
{
235+
role: "Part",
236+
children: [
237+
{
238+
role: "P",
239+
children: [{ type: "content", id: "p58R_mc12" }],
240+
},
241+
{
242+
role: "Formula",
243+
children: [{ type: "content", id: "p58R_mc13" }],
244+
mathML:
245+
'<math> <msqrt><msup><mi>x</mi><mn>2</mn></msup></msqrt> <mo>=</mo> <mrow intent="absolute-value($x)"><mo>|</mo><mi arg="x">x</mi><mo>|</mo></mrow> </math>',
246+
},
247+
],
248+
},
249+
],
250+
},
251+
],
252+
},
253+
],
254+
role: "Root",
255+
},
256+
struct
257+
);
258+
await loadingTask.destroy();
259+
});
260+
261+
// Loads bug1937438_from_word.pdf, fetches the struct tree of page 1 and
// compares it with the literal tree below via `equalTrees`. The `Formula`
// nodes are expected to carry both an `alt` text and a `mathML` property.
it("parses structure with some MathML in MS Office specific entry", async function () {
262+
const filename = "bug1937438_from_word.pdf";
263+
const params = buildGetDocumentParams(filename);
264+
const loadingTask = getDocument(params);
265+
const doc = await loadingTask.promise;
266+
const page = await doc.getPage(1);
267+
const struct = await page.getStructTree();
268+
// First argument is the expected tree, second the parsed one.
equalTrees(
269+
{
270+
children: [
271+
{
272+
role: "Document",
273+
children: [
274+
{
275+
role: "P",
276+
children: [
277+
{ type: "content", id: "p3R_mc0" },
278+
{
279+
role: "Formula",
280+
children: [{ type: "content", id: "p3R_mc1" }],
281+
alt: "pi",
282+
mathML: '<math display="block"><mi>&#x1D70B;</mi></math>',
283+
},
284+
{ type: "content", id: "p3R_mc2" },
285+
],
286+
},
287+
{
288+
role: "Formula",
289+
children: [{ type: "content", id: "p3R_mc3" }],
290+
alt: "6 sum from n equals 1 to infinity of 1 over n squared , equals pi squared",
291+
mathML:
292+
'<math display="block"><mn>6</mn><mrow><munderover><mo stretchy="false">&#x2211;</mo><mrow><mi>n</mi><mo>=</mo><mn>1</mn></mrow><mo>&#x221E;</mo></munderover><mfrac><mn>1</mn><msup><mrow><mi>n</mi></mrow><mn>2</mn></msup></mfrac></mrow><mo>=</mo><msup><mrow><mi>&#x1D70B;</mi></mrow><mn>2</mn></msup></math>',
293+
},
294+
{ role: "P", children: [{ type: "content", id: "p3R_mc4" }] },
295+
{ role: "P", children: [{ type: "content", id: "p3R_mc5" }] },
296+
],
297+
},
298+
],
299+
role: "Root",
300+
},
301+
struct
302+
);
303+
await loadingTask.destroy();
304+
});
154305
});

0 commit comments

Comments
 (0)