Skip to content

Commit a6d0fb9

Browse files
committed
Implement pure markdown-it title extraction
1 parent debc2ae commit a6d0fb9

File tree

4 files changed

+215
-5
lines changed

4 files changed

+215
-5
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import markdownit from 'markdown-it'
2+
3+
const md = markdownit()
4+
5+
/**
 * Find the first level-one heading in a markdown document and return its text.
 *
 * The source is tokenized with the module-level markdown-it instance and the
 * resulting token list is walked in document order. The returned value is the
 * `content` of the inline token that directly follows the heading opener, so
 * it is the heading's source text rather than rendered output.
 *
 * @param {string} markdown - Markdown source to scan.
 * @returns {string | null} The trimmed heading text, or `null` when no `h1`
 *   opener followed by an inline token is found.
 */
export function extractFirstH1 (markdown) {
  const tokenStream = md.parse(markdown, {})

  for (const [position, candidate] of tokenStream.entries()) {
    const opensH1 = candidate?.type === 'heading_open' && candidate.tag === 'h1'
    if (!opensH1) continue

    // The heading text lives in the token right after the opener. An opener
    // without an inline follower is skipped and the scan keeps going.
    const headingBody = tokenStream[position + 1]
    if (headingBody?.type === 'inline') {
      return headingBody.content.trim()
    }
  }

  return null
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
import { test } from 'node:test'
2+
import assert from 'node:assert'
3+
4+
import { extractFirstH1 } from './extract-title-from-md.js'
5+
6+
test.describe('extractFirstH1', () => {
7+
test('extracts ATX style H1 headings', () => {
  // Each case pairs a markdown input with the title extractFirstH1 should
  // return; `note` is used as the assertion failure message.
  const cases = [
    {
      input: '# Simple Heading',
      expect: 'Simple Heading',
      note: 'basic ATX H1'
    },
    {
      input: '# Extra Spaces ',
      expect: 'Extra Spaces',
      note: 'ATX H1 with extra spaces'
    },
    {
      input: '# Heading with **bold** and *italic*',
      expect: 'Heading with **bold** and *italic*',
      note: 'ATX H1 with inline formatting'
    },
    {
      input: 'Some text\n# First Heading\n## Second Heading',
      expect: 'First Heading',
      note: 'ATX H1 after other content'
    },
    {
      input: '## Not H1\n# Real H1\n### Not H1',
      expect: 'Real H1',
      note: 'ATX H1 between other headings'
    }
  ]

  for (const { input, expect, note } of cases) {
    // strictEqual avoids the coercing `==` comparison of legacy assert.equal.
    assert.strictEqual(extractFirstH1(input), expect, note)
  }
})
41+
42+
test('extracts Setext style H1 headings', () => {
  // Setext H1s are a text line underlined with `=` characters. Each case pairs
  // an input with the expected title; `note` is the failure message.
  const cases = [
    {
      input: 'Simple Heading\n==============',
      expect: 'Simple Heading',
      note: 'basic Setext H1'
    },
    {
      input: 'Simple Heading\n===',
      expect: 'Simple Heading',
      note: 'Setext H1 with minimum underline'
    },
    {
      input: ' Trimmed Heading \n==============',
      expect: 'Trimmed Heading',
      note: 'Setext H1 with spaces to trim'
    },
    {
      input: 'Heading with **bold** and *italic*\n==================',
      expect: 'Heading with **bold** and *italic*',
      note: 'Setext H1 with inline formatting'
    },
    {
      input: 'Some text\n\nFirst Heading\n=============\n\nMore text',
      expect: 'First Heading',
      note: 'Setext H1 with surrounding content'
    }
  ]

  for (const { input, expect, note } of cases) {
    // strictEqual avoids the coercing `==` comparison of legacy assert.equal.
    assert.strictEqual(extractFirstH1(input), expect, note)
  }
})
76+
77+
test('handles edge cases correctly', () => {
  // Inputs that either contain no H1 at all or only look like one. Each case
  // pairs an input with the expected result; `note` is the failure message.
  const cases = [
    {
      input: '',
      expect: null,
      note: 'empty string'
    },
    {
      input: '## Only H2\n### Only H3',
      expect: null,
      note: 'no H1 present'
    },
    {
      input: 'Not a heading\n---',
      expect: null,
      note: 'Setext H2 (dashes) should not match'
    },
    {
      input: 'Not a heading\n--',
      expect: null,
      note: 'Setext H2 (dashes) should not match'
    },
    {
      input: 'Not a heading\n==',
      expect: 'Not a heading',
      note: 'markdown-it accepts any number of equals for Setext H1'
    },
    {
      input: '\n========',
      expect: null,
      note: 'empty line before Setext underline'
    },
    {
      // Four leading spaces are required for an indented code block; with
      // one to three spaces the line is still parsed as an ATX heading.
      input: '    # Code block heading',
      expect: null,
      note: 'indented code block should not match'
    },
    {
      input: '```\n# Code fence heading\n```',
      expect: null,
      note: 'fenced code block should not match (simple case)'
    }
  ]

  for (const { input, expect, note } of cases) {
    // strictEqual matters here: legacy assert.equal compares with `==`, so a
    // result of `undefined` would wrongly satisfy the `expect: null` cases.
    assert.strictEqual(extractFirstH1(input), expect, note)
  }
})
126+
127+
test('prefers first H1 when multiple exist', () => {
  // Documents with more than one H1, mixing ATX and Setext forms; the H1 that
  // appears first in the document is the expected result.
  const cases = [
    {
      input: '# First ATX\n# Second ATX',
      expect: 'First ATX',
      note: 'multiple ATX H1s'
    },
    {
      input: 'First Setext\n============\n\nSecond Setext\n============',
      expect: 'First Setext',
      note: 'multiple Setext H1s'
    },
    {
      input: '# ATX First\n\nSetext Second\n=============',
      expect: 'ATX First',
      note: 'ATX before Setext'
    },
    {
      input: 'Setext First\n============\n\n# ATX Second',
      expect: 'Setext First',
      note: 'Setext before ATX'
    }
  ]

  for (const { input, expect, note } of cases) {
    // strictEqual avoids the coercing `==` comparison of legacy assert.equal.
    assert.strictEqual(extractFirstH1(input), expect, note)
  }
})
156+
157+
test('handles multiline documents correctly', () => {
  // A document where the H1 comes after an introductory paragraph and before
  // a subsection. The template content stays at column 0 so no line is
  // indented in the markdown source.
  const markdown = `
Some introductory text here
that spans multiple lines

# The Real Title

## A subsection

More content here
`
  const result = extractFirstH1(markdown)
  // strictEqual avoids the coercing `==` comparison of legacy assert.equal.
  assert.strictEqual(result, 'The Real Title', 'finds H1 in realistic document')
})
171+
172+
test('handles frontmatter-like content', () => {
  // The leading `---` fenced section looks like YAML frontmatter; the title
  // must come from the H1 below it, not from the `title:` line.
  const markdown = `---
title: Frontmatter Title
---

# Actual H1 Title

Content here`
  const result = extractFirstH1(markdown)
  // strictEqual avoids the coercing `==` comparison of legacy assert.equal.
  assert.strictEqual(result, 'Actual H1 Title', 'ignores frontmatter')
})
183+
})

lib/build-pages/page-builders/md/index.js

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55
import assert from 'node:assert'
66
import { readFile } from 'fs/promises'
77
import yaml from 'js-yaml'
8-
import * as cheerio from 'cheerio'
9-
108
import { getMd, renderMd } from './get-md.js'
9+
import { extractFirstH1 } from './extract-title-from-md.js'
1110

1211
/** @type {markdownIt | null} */
1312
let md = null
@@ -39,8 +38,8 @@ export async function mdBuilder ({ pageInfo, options }) {
3938
mdUnparsed = fileContents
4039
}
4140

42-
const body = await renderMd(mdUnparsed, { handlebars: false, ...frontMatter }, md, markdownItSettingsPath)
43-
const title = cheerio.load(body)('h1').first().text().trim()
41+
// Extract title from first H1 using markdown-it's token API
42+
const title = extractFirstH1(mdUnparsed)
4443

4544
return {
4645
vars: Object.assign({ title }, frontMatter),

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
"argsclopts": "^1.0.4",
2121
"async-folder-walker": "^3.0.5",
2222
"browser-sync": "^3.0.2",
23-
"cheerio": "^1.0.0-rc.10",
2423
"chokidar": "^4.0.0",
2524
"clean-deep": "^3.4.0",
2625
"cpx2": "^8.0.0",
@@ -58,6 +57,7 @@
5857
},
5958
"devDependencies": {
6059
"c8": "^10.0.0",
60+
"cheerio": "^1.0.0-rc.10",
6161
"@types/browser-sync": "^2.29.0",
6262
"@types/js-yaml": "^4.0.9",
6363
"@types/markdown-it": "^14.1.1",

0 commit comments

Comments
 (0)