Skip to content

Commit 5928f59

Browse files
committed
fixes
1 parent bbe9713 commit 5928f59

File tree

2 files changed

+99
-7
lines changed

2 files changed

+99
-7
lines changed

lib/markdown-for-llm.js

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
'use strict'
22

33
const { NodeHtmlMarkdown } = require('node-html-markdown')
4+
const YAML = require('yaml')
45

56
let nhm
67
const customTranslators = {
@@ -12,8 +13,8 @@ const customTranslators = {
1213

1314
const type = (node
1415
.classList.values().
15-
find(v => v != 'admonitionblock')
16-
.toUpperCase())
16+
find(v => v != 'admonitionblock') || 'NOTE'
17+
).toUpperCase()
1718

1819
const content = (
1920
nhm.translate(
@@ -30,16 +31,28 @@ nhm = new NodeHtmlMarkdown({}, customTranslators)
3031

3132
const File = require('vinyl')
3233

33-
function markdownify(page) {
34+
function markdownify(page, pubDate) {
35+
const title = page.title
3436
const html = page.contents.toString()
37+
const markdown = nhm.translate(html)
3538

3639
// haha, we now have to translate the .md link back to .html just for this link
3740
const orig = page.pub.url.replace(/\.md$/, '.html')
38-
const link = `[View original HTML](${orig})\n\n`
3941

40-
const markdown = link + nhm.translate(html)
42+
let output =
43+
`[View original HTML](${orig})\n\n` +
44+
`# ${title}\n\n${markdown}`
4145

42-
page.contents = Buffer.from(markdown)
46+
const frontmatter = {
47+
title,
48+
description: page.asciidoc.attributes.description,
49+
editUrl: page.src.editUrl,
50+
pubDate,
51+
}
52+
53+
output = `---\n${YAML.stringify(frontmatter)}---\n\n${output}`
54+
55+
page.contents = Buffer.from(output)
4356
}
4457

4558
module.exports.register = function ({ playbook, config }) {
@@ -84,6 +97,8 @@ module.exports.register = function ({ playbook, config }) {
8497
this.once('documentsConverted', async ({ playbook, contentCatalog, siteCatalog }) => {
8598
const logger = this.getLogger('llm-summaries')
8699

100+
const pubDate = new Date().toISOString()
101+
87102
const pages = contentCatalog.getPages(
88103
(page) =>
89104
page.mediaType === 'text/html'
@@ -92,7 +107,7 @@ module.exports.register = function ({ playbook, config }) {
92107

93108
for (const page of pages) {
94109
page.mediaType = 'text/markdown'
95-
markdownify(page)
110+
markdownify(page, pubDate)
96111
}
97112
})
98113

scripts/update-snapshot

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/bin/bash
2+
3+
# An Antora build with the lib/markdown-for-llm.js extension
4+
# will output a directory of Markdown files, with YAML front-matter
5+
# including pubDate.
6+
# If we first clone the https://github.com/couchbaselabs/docs-markdown-snapshot/
7+
# repo into `public/` then the Antora build will naturally
8+
# update those files in place.
9+
#
10+
# This script attempts to handle these changes, including
11+
# * new files
12+
# * deletion of files that are no longer output
13+
# * preserving files which are unchanged (other than the pubDate)
14+
15+
cd public
16+
17+
# all markdown files
18+
find . -name *.md \
19+
| sed 's|^\./||' \
20+
| sort \
21+
> md-all
22+
23+
# all markdown that
24+
# were output by Antora
25+
# and caused a modification (M) flag
26+
# (i.e. files that have, at the very least, a new pubDate:)
27+
git status --porcelain=v1 \
28+
--untracked-files=no \
29+
| grep '\.md$' \
30+
| grep '^\s*M' \
31+
| awk '{ print $2 }' \
32+
| sort \
33+
> md-output
34+
35+
# all markdown files that have changed more than just pubDate
36+
# (the -I flag ignores changes whose changed lines *all* match that pattern)
37+
git diff --numstat -I^pubDate: \
38+
| grep '\.md$' \
39+
| awk '{ print $3 }' \
40+
| sort \
41+
> md-changed
42+
43+
# all markdown that
44+
# were output by Antora
45+
# and aren't already in git (??)
46+
git status --porcelain=v1 \
47+
--untracked-files=all \
48+
| grep '\.md$' \
49+
| grep ^\?\? \
50+
| awk '{ print $2 }' \
51+
| sort \
52+
> md-added
53+
54+
# So we want to categorise the following groups:
55+
# A. in (md-changed, md-added) -> ADD
56+
# B. in (md-output) but NOT (md-changed) -> CHECKOUT (revert)
57+
# C. in (md-all) but NOT (md-added, md-output) -> DELETE
58+
59+
# The useful-but-confusing `comm` utility prints:
60+
# (only 1) (only 2) (common)
61+
#
62+
# so `comm -23` suppresses columns 2 and 3, printing only lines unique to file 1
63+
64+
# A. in (md-changed, md-added) -> ADD
65+
git add --pathspec-from-file md-changed
66+
git add --pathspec-from-file md-added
67+
68+
# B. in (md-output) but NOT (md-changed) -> CHECKOUT (revert)
69+
comm -23 md-output md-changed > md-checkout
70+
git checkout --pathspec-from-file md-checkout
71+
72+
# C. in (md-all) but NOT (md-added, md-output) -> DELETE
73+
comm -23 md-all <(sort md-added md-output) > md-delete
74+
git rm --pathspec-from-file md-delete
75+
76+
git commit -m "Updating snapshot"
77+
git push

0 commit comments

Comments
 (0)