Implement pure markdown-it title extraction

bcomnes · bcomnes · commit a6d0fb92df69 · 2025-07-05T16:14:20.000-07:00
diff --git a/lib/build-pages/page-builders/md/extract-title-from-md.js b/lib/build-pages/page-builders/md/extract-title-from-md.js
@@ -0,0 +1,28 @@
+import markdownit from 'markdown-it'
+
+const md = markdownit()
+
+/**
+ * Extract the first H1 heading from markdown using markdown-it's token API
+ * @param {string} markdown
+ * @returns {string | null}
+ */
+export function extractFirstH1 (markdown) {
+  const tokens = md.parse(markdown, {})
+
+  for (let i = 0; i < tokens.length; i++) {
+    const token = tokens[i]
+
+    // Look for heading_open token with tag 'h1'
+    if (token && token.type === 'heading_open' && token.tag === 'h1') {
+      // The next token should be inline with the heading content
+      const nextToken = tokens[i + 1]
+      if (nextToken && nextToken.type === 'inline') {
+        // The inline token's content is the raw text of the heading
+        return nextToken.content.trim()
+      }
+    }
+  }
+
+  return null
+}
diff --git a/lib/build-pages/page-builders/md/extract-title-from-md.test.js b/lib/build-pages/page-builders/md/extract-title-from-md.test.js
@@ -0,0 +1,183 @@
+import { test } from 'node:test'
+import assert from 'node:assert'
+
+import { extractFirstH1 } from './extract-title-from-md.js'
+
+test.describe('extractFirstH1', () => {
+  test('extracts ATX style H1 headings', async () => {
+    const tests = [
+      {
+        input: '# Simple Heading',
+        expect: 'Simple Heading',
+        note: 'basic ATX H1'
+      },
+      {
+        input: '#    Extra Spaces   ',
+        expect: 'Extra Spaces',
+        note: 'ATX H1 with extra spaces'
+      },
+      {
+        input: '# Heading with **bold** and *italic*',
+        expect: 'Heading with **bold** and *italic*',
+        note: 'ATX H1 with inline formatting'
+      },
+      {
+        input: 'Some text\n# First Heading\n## Second Heading',
+        expect: 'First Heading',
+        note: 'ATX H1 after other content'
+      },
+      {
+        input: '## Not H1\n# Real H1\n### Not H1',
+        expect: 'Real H1',
+        note: 'ATX H1 between other headings'
+      }
+    ]
+
+    for (const testCase of tests) {
+      const result = extractFirstH1(testCase.input)
+      assert.equal(result, testCase.expect, testCase.note)
+    }
+  })
+
+  test('extracts Setext style H1 headings', async () => {
+    const tests = [
+      {
+        input: 'Simple Heading\n==============',
+        expect: 'Simple Heading',
+        note: 'basic Setext H1'
+      },
+      {
+        input: 'Simple Heading\n===',
+        expect: 'Simple Heading',
+        note: 'Setext H1 with minimum underline'
+      },
+      {
+        input: '  Trimmed Heading  \n==============',
+        expect: 'Trimmed Heading',
+        note: 'Setext H1 with spaces to trim'
+      },
+      {
+        input: 'Heading with **bold** and *italic*\n==================',
+        expect: 'Heading with **bold** and *italic*',
+        note: 'Setext H1 with inline formatting'
+      },
+      {
+        input: 'Some text\n\nFirst Heading\n=============\n\nMore text',
+        expect: 'First Heading',
+        note: 'Setext H1 with surrounding content'
+      }
+    ]
+
+    for (const testCase of tests) {
+      const result = extractFirstH1(testCase.input)
+      assert.equal(result, testCase.expect, testCase.note)
+    }
+  })
+
+  test('handles edge cases correctly', async () => {
+    const tests = [
+      {
+        input: '',
+        expect: null,
+        note: 'empty string'
+      },
+      {
+        input: '## Only H2\n### Only H3',
+        expect: null,
+        note: 'no H1 present'
+      },
+      {
+        input: 'Not a heading\n---',
+        expect: null,
+        note: 'Setext H2 (dashes) should not match'
+      },
+      {
+        input: 'Not a heading\n--',
+        expect: null,
+        note: 'Setext H2 (dashes) should not match'
+      },
+      {
+        input: 'Not a heading\n==',
+        expect: 'Not a heading',
+        note: 'markdown-it accepts any number of equals for Setext H1'
+      },
+      {
+        input: '\n========',
+        expect: null,
+        note: 'empty line before Setext underline'
+      },
+      {
+        input: '    # Code block heading',
+        expect: null,
+        note: 'indented code block should not match'
+      },
+      {
+        input: '```\n# Code fence heading\n```',
+        expect: null,
+        note: 'fenced code block should not match (simple case)'
+      }
+    ]
+
+    for (const testCase of tests) {
+      const result = extractFirstH1(testCase.input)
+      assert.equal(result, testCase.expect, testCase.note)
+    }
+  })
+
+  test('prefers first H1 when multiple exist', async () => {
+    const tests = [
+      {
+        input: '# First ATX\n# Second ATX',
+        expect: 'First ATX',
+        note: 'multiple ATX H1s'
+      },
+      {
+        input: 'First Setext\n============\n\nSecond Setext\n============',
+        expect: 'First Setext',
+        note: 'multiple Setext H1s'
+      },
+      {
+        input: '# ATX First\n\nSetext Second\n=============',
+        expect: 'ATX First',
+        note: 'ATX before Setext'
+      },
+      {
+        input: 'Setext First\n============\n\n# ATX Second',
+        expect: 'Setext First',
+        note: 'Setext before ATX'
+      }
+    ]
+
+    for (const testCase of tests) {
+      const result = extractFirstH1(testCase.input)
+      assert.equal(result, testCase.expect, testCase.note)
+    }
+  })
+
+  test('handles multiline documents correctly', async () => {
+    const markdown = `
+Some introductory text here
+that spans multiple lines
+
+# The Real Title
+
+## A subsection
+
+More content here
+`
+    const result = extractFirstH1(markdown)
+    assert.equal(result, 'The Real Title', 'finds H1 in realistic document')
+  })
+
+  test('handles frontmatter-like content', async () => {
+    const markdown = `---
+title: Frontmatter Title
+---
+
+# Actual H1 Title
+
+Content here`
+    const result = extractFirstH1(markdown)
+    assert.equal(result, 'Actual H1 Title', 'ignores frontmatter')
+  })
+})
diff --git a/lib/build-pages/page-builders/md/index.js b/lib/build-pages/page-builders/md/index.js
@@ -5,9 +5,8 @@
 import assert from 'node:assert'
 import { readFile } from 'fs/promises'
 import yaml from 'js-yaml'
-import * as cheerio from 'cheerio'
-
 import { getMd, renderMd } from './get-md.js'
+import { extractFirstH1 } from './extract-title-from-md.js'
 
 /** @type {markdownIt | null} */
 let md = null
@@ -39,8 +38,8 @@ export async function mdBuilder ({ pageInfo, options }) {
     mdUnparsed = fileContents
   }
 
-  const body = await renderMd(mdUnparsed, { handlebars: false, ...frontMatter }, md, markdownItSettingsPath)
-  const title = cheerio.load(body)('h1').first().text().trim()
+  // Extract title from first H1 using markdown-it's token API
+  const title = extractFirstH1(mdUnparsed)
 
   return {
     vars: Object.assign({ title }, frontMatter),
diff --git a/package.json b/package.json
@@ -20,7 +20,6 @@
     "argsclopts": "^1.0.4",
     "async-folder-walker": "^3.0.5",
     "browser-sync": "^3.0.2",
-    "cheerio": "^1.0.0-rc.10",
     "chokidar": "^4.0.0",
     "clean-deep": "^3.4.0",
     "cpx2": "^8.0.0",
@@ -58,6 +57,7 @@
   },
   "devDependencies": {
     "c8": "^10.0.0",
+    "cheerio": "^1.0.0-rc.10",
     "@types/browser-sync": "^2.29.0",
     "@types/js-yaml": "^4.0.9",
     "@types/markdown-it": "^14.1.1",