Skip to content

Commit 2fd6ed1

Browse files
authored
Extract headings from www.rfc-editor.org RFCs (#1883)
RFCs published at www.rfc-editor.org use spans to define headings. This update adds custom logic to handle them, while keeping the matching pattern as restrictive as practical, so that headings are not extracted by mistake if another spec starts attaching class names such as `h2` to spans.
1 parent 79b36dd commit 2fd6ed1

File tree

3 files changed

+40
-1
lines changed

3 files changed

+40
-1
lines changed

src/browserlib/extract-dfns.mjs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import {parse} from "../../node_modules/webidl2/index.js";
2424
* can be one of "dt", "pre", "table", "heading", "note", "example", or
2525
* "prose" (last one indicates that definition appears in the main body of
2626
* the spec)
27+
* - links: A list of interesting links with IDs that complete the definitions,
28+
* notably non-normative descriptions that target web developers.
2729
*
2830
* The extraction ignores definitions with an unknown type. A warning is issued
2931
* to the console when that happens.

src/browserlib/extract-headings.mjs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,29 @@ export default function (spec, idToHeading) {
2020
};
2121
});
2222

23+
// Headings using spans in www.rfc-editor.org RFCs
24+
const rfcSelector = 'pre > span:is(.h2,.h3,.h4,.h5,.h6) > a.selflink[id]';
25+
const rfcHeadings = [...document.querySelectorAll(rfcSelector)].map(n => {
26+
const headingNumber = n.textContent;
27+
const headingLevel = headingNumber ? headingNumber.split(".").length : undefined;
28+
return {
29+
id: n.id,
30+
href: getAbsoluteUrl(n, { singlePage }),
31+
title: n.parentNode.textContent
32+
.replace(headingNumber, '')
33+
.replace(/^\s*\./, '')
34+
.trim(),
35+
level: headingLevel,
36+
number: headingNumber
37+
};
38+
});
39+
2340
const headingsSelector = [
2441
':is(h1,h2,h3,h4,h5,h6)[id]', // Regular headings
2542
':is(h1,h2,h3,h4,h5,h6):not([id]) > a[name]' // CSS 2.1 headings
2643
].join(',');
2744

28-
return esHeadings.concat([...document.querySelectorAll(headingsSelector)].map(n => {
45+
return esHeadings.concat(rfcHeadings).concat([...document.querySelectorAll(headingsSelector)].map(n => {
2946
// Note: In theory, all <hX> heading elements that have an ID are associated
3047
// with a heading in idToHeading. One exception to the rule: when the
3148
// heading element appears in a <hgroup> element, the mapping is not

test/extract-headings.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,26 @@ const testHeadings = [
6363
html: "<section id=title-0><h1 id=title>Heading in a section with its own id</h1>",
6464
res: [{id: "title-0", "href": "about:blank#title-0", title: "Heading in a section with its own id", level: 1, alternateIds: ["title"]}]
6565
},
66+
{
67+
title: "deals with headings in www.rfc-editor.org RFCs",
68+
html: `<pre>
69+
<span class="h2">
70+
<a class="selflink" id="title" href="#title">2</a>.
71+
Title
72+
</span>
73+
</pre>`,
74+
res: [{id: "title", href: "about:blank#title", title: "Title", number: "2", level: 1}]
75+
},
76+
{
77+
title: "deals with sub-headings in www.rfc-editor.org RFCs",
78+
html: `<pre>
79+
<span class="h3">
80+
<a class="selflink" id="title" href="#title">3.1</a>.
81+
Title
82+
</span>
83+
</pre>`,
84+
res: [{id: "title", href: "about:blank#title", title: "Title", number: "3.1", level: 2}]
85+
},
6686
];
6787

6888
describe("Test headings extraction", function () {

0 commit comments

Comments
 (0)