Skip to content

Commit f04da31

Browse files
authored
DOC-13760 produce markdown per page (#863)
* DOC-13760 produce markup file per published page * DOC-13760 test on staging chatbot * DOC-13760 remove unneeded async Copilot review suggests this may introduce unneeded overhead. As we're getting Node memory errors, we may as well try! * DOC-13760 save ONLY markdown summary Reduce memory usage by *only* saving the Markdown version in this run. We can do this as a separate build, and scp the output to the same bucket. E.g. this content would be overlaid on the *standard* build (which would only need to add the <link href="..."> URL feature). Rewrite relative hrefs to .md target. * DOC-13760 tidy up fix page.pub.url to ensure that the site nav is also updated * DOC-13760 create an initial llms.txt * tidy * tidy * reduce memory usage by explicitly freeing DOM * change markdown parsing dependency Use the one used by https://github.com/cerbos/antora-llm-generator * remove chunking, fix links * workaround for mangled noopener links in nav * Admonition handling * fixes
1 parent adb98aa commit f04da31

File tree

7 files changed

+288
-16
lines changed

7 files changed

+288
-16
lines changed

antora-playbook-staging-chatbot.diff.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
antora:
22
extensions:
33
- ./lib/report-tree.js
4+
- ./lib/markdown-for-llm.js
45
- ./lib/site-stats-extension.js
56
- ./lib/component-stats.js
67

antora-playbook-staging-chatbot.yml

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ antora:
2323
- ./lib/embargo.js
2424
- ./lib/antora-component-version-rank.js
2525
- ./lib/report-tree.js
26+
- ./lib/markdown-for-llm.js
2627
- ./lib/site-stats-extension.js
2728
- ./lib/component-stats.js
2829
site:
@@ -36,7 +37,6 @@ site:
3637
{ "title": "Server", "startPage": "home::server.adoc", "components": ["server", "enterprise-analytics"] },
3738
{ "title": "Mobile / Edge", "startPage": "home::mobile.adoc", "components": ["couchbase-lite", "couchbase-lite-javascript", "sync-gateway", "couchbase-edge-server"] },
3839
{ "title": "Capella", "startPage": "home::cloud.adoc", "components": ["cloud", "app-services", "ai", "analytics"] },
39-
{ "title": "Cloud-Native", "components": ["cloud-native-database"] },
4040
{ "title": "Kubernetes Operator", "startPage": "operator::overview.adoc", "components": ["operator"] },
4141
{ "title": "CMOS", "components": ["cmos"] },
4242
{ "title": "Develop", "startPage": "home::developer.adoc",
@@ -160,9 +160,6 @@ content:
160160
start_path: docs
161161
branches:
162162
- master
163-
- url: https://github.com/couchbase/docs-cloud-native
164-
branches:
165-
- cloud-native-2.2
166163
- url: https://github.com/couchbase/docs-elastic-search
167164
branches:
168165
- main
@@ -221,6 +218,7 @@ content:
221218
- release/7.6.2
222219
- release/7.6
223220
- release/7.2
221+
- release/8.0.1
224222
- url: https://github.com/couchbase/docs-sdk-c
225223
branches:
226224
- release/3.3
@@ -247,9 +245,9 @@ content:
247245
- release/1.0
248246
- url: https://github.com/couchbase/docs-sdk-java
249247
branches:
248+
- release/3.11
250249
- release/3.10
251250
- release/3.9
252-
- release/3.8
253251
- url: https://github.com/couchbase/docs-quarkus-extension
254252
branches:
255253
- release/1.2
@@ -264,9 +262,9 @@ content:
264262
- release/1.2
265263
- url: https://github.com/couchbase/docs-sdk-scala
266264
branches:
265+
- release/3.11
267266
- release/3.10
268267
- release/3.9
269-
- release/1.8
270268
- url: https://github.com/couchbase/docs-sdk-nodejs
271269
branches:
272270
- temp/4.6

antora-playbook.preview.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ antora:
2323
- ./lib/embargo.js
2424
- ./lib/antora-component-version-rank.js
2525
- ./lib/preview.js
26+
- ./lib/markdown-for-llm.js
2627
- ./lib/report-tree.js
2728
- ./lib/site-stats-extension.js
2829
- ./lib/component-stats.js

lib/markdown-for-llm.js

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
'use strict'
2+
3+
const { NodeHtmlMarkdown } = require('node-html-markdown')
4+
const YAML = require('yaml')
5+
6+
// Declared before the translators because the DIV translator re-enters the
// converter (nhm.translate) to render the body of an admonition.
let nhm

/**
 * Custom node-html-markdown translators.
 *
 * DIV: an Asciidoctor admonition (`<div class="admonitionblock note">`) is
 * rewritten as a GitHub-style alert:
 *
 *     > [!NOTE]
 *     > ...admonition body converted to Markdown...
 *
 * For any other <div>, postprocess returns undefined (same as the original
 * implementation), leaving the already-converted content untouched.
 */
const customTranslators = {
  DIV: () => ({
    surroundingNewlines: 2,
    postprocess: ({ node }) => {
      if (!node.classList.contains('admonitionblock')) return undefined

      // The admonition type is whichever class is not `admonitionblock`.
      // Materialise the iterator first: iterator helper methods such as
      // `.find()` are not available on older Node.js releases.
      const type = (
        Array.from(node.classList.values()).find((name) => name !== 'admonitionblock') || 'NOTE'
      ).toUpperCase()

      // The second table cell holds the admonition body. If the markup is not
      // shaped that way, fall back to the default conversion instead of
      // crashing on `null.innerHTML`.
      const contentCell = node.querySelector('td:nth-child(2)')
      if (!contentCell) return undefined

      // Prefix every line (including blank ones) so the whole body stays
      // inside the block quote.
      const content = nhm.translate(contentCell.innerHTML).replace(/^/mg, '> ')

      return `> [!${type}]\n${content}`
    }
  })
}
nhm = new NodeHtmlMarkdown({}, customTranslators)
31+
32+
const File = require('vinyl')
33+
34+
/**
 * Replace a page's HTML contents with a Markdown document, in place.
 *
 * The resulting document is: YAML front matter (title, description, editUrl,
 * pubDate), a link back to the HTML rendition, a level-1 heading with the
 * page title, then the page body converted to Markdown.
 *
 * @param {object} page - page file; reads title, contents, pub.url,
 *   asciidoc.attributes.description and src.editUrl, and overwrites contents.
 * @param {string} pubDate - value stored under `pubDate` in the front matter.
 */
function markdownify(page, pubDate) {
  const { title } = page
  const body = nhm.translate(page.contents.toString())

  // pub.url has a .md suffix by the time this runs, so map it back to .html
  // for the "view original" link only.
  const htmlUrl = page.pub.url.replace(/\.md$/, '.html')

  const header = YAML.stringify({
    title,
    description: page.asciidoc.attributes.description,
    editUrl: page.src.editUrl,
    pubDate,
  })

  const sections = [
    `---\n${header}---`,
    `[View original HTML](${htmlUrl})`,
    `# ${title}`,
    body,
  ]

  page.contents = Buffer.from(sections.join('\n\n'))
}
57+
58+
/**
 * Antora extension entry point: publish every page as Markdown and add an
 * llms.txt index built from the site navigation data.
 *
 * Listeners registered:
 *  - contextStarted:     swap the page composer for a pass-through, so pages
 *                        are not wrapped in a UI layout.
 *  - contentClassified:  rename each publishable page's pub.url / out.path
 *                        from .html to .md.
 *  - documentsConverted: convert each text/html page body to Markdown.
 *  - beforePublish:      generate llms.txt and add it to the site catalog.
 *
 * @param {object} context - registration variables; `playbook` and `config`
 *   are accepted but not used here.
 */
module.exports.register = function ({ playbook, config }) {
  this.once('contextStarted', () => {
    this.replaceFunctions({
      // see https://gitlab.com/antora/antora/-/blob/v3.1.x/packages/page-composer/lib/create-page-composer.js
      createPageComposer (playbook, contentCatalog, uiCatalog) {
        // instead of wrapping the file in a layout, just return the file as-is
        function composePage (file, _contentCatalog, _navigationCatalog) {
          return file
        }
        const create404Page = (siteAsciiDocConfig) =>
          composePage({
            asciidoc: siteAsciiDocConfig,
            mediaType: 'text/html',
            out: { path: '404.html' },
            pub: {},
            src: { stem: '404' },
            title: siteAsciiDocConfig?.attributes['404-page-title'] || 'Page Not Found',
          })

        return Object.assign(composePage, { composePage, create404Page })
      },
    })
  })

  this.once('contentClassified', async ({ contentCatalog }) => {
    const pages = contentCatalog.getPages((page) => page.pub && page.out)

    for (const page of pages) {
      page.pub.url = page.pub.url.replace(/\.html$/, '.md')
      page.out.path = page.out.path.replace(/\.html$/, '.md')
    }
  })

  this.once('documentsConverted', async ({ contentCatalog }) => {
    // One timestamp shared by every page produced in this run.
    const pubDate = new Date().toISOString()

    const pages = contentCatalog.getPages(
      (page) => page.mediaType === 'text/html' && page.pub && page.out
    )

    for (const page of pages) {
      page.mediaType = 'text/markdown'
      markdownify(page, pubDate)
    }
  })

  this.once('beforePublish', async ({ siteCatalog }) => {
    const navFile = siteCatalog.getFiles().find((file) => file.path === NAV_DATA_PATH)
    if (!navFile) {
      throw new Error(`markdown-for-llm: ${NAV_DATA_PATH} not found in the site catalog; cannot build llms.txt`)
    }

    const match = navFile.contents.toString().match(/^siteNavigationData=(.+)$/s)
    if (!match) {
      throw new Error(`markdown-for-llm: ${NAV_DATA_PATH} does not start with "siteNavigationData="; cannot build llms.txt`)
    }

    // This file is created by @antora/site-generator-ms
    // NOTE(review): eval executes whatever the build wrote into that file.
    // If the payload is strict JSON, JSON.parse would be the safer choice -
    // confirm the format before switching.
    // eslint-disable-next-line no-eval
    const navObj = eval(match[1])

    // Build and register llms.txt exactly once, after every component has
    // been rendered. (Previously this happened inside the per-component loop,
    // so the file was added repeatedly with partial content.)
    const file = new File({
      contents: Buffer.from(buildLlmsTxt(navObj)),
      mediaType: 'text/markdown',
      out: { path: 'llms.txt' },
      path: 'llms.txt',
      pub: { url: `/llms.txt`, rootPath: '' },
      src: { stem: 'llms' },
    })
    siteCatalog.addFile(file)
  })
}

// Site-catalog path of the navigation data script read in beforePublish.
const NAV_DATA_PATH = '_/js/site-navigation-data.js'

// Fixed preamble of llms.txt; component sections are appended below "## Docs".
const LLMS_TXT_PREAMBLE = `# Couchbase

> This is the official documentation for Couchbase, a leading NoSQL distributed database platform
> designed for high performance, scalability, and flexibility.
> The documentation covers comprehensive guides for developers and operations
> teams, including installation and configuration, development tutorials, API
> references, administration, security, cloud deployment, and integration with
> various programming languages and frameworks.
> Couchbase enables applications to handle massive data volumes and high
> concurrency with its memory-first architecture and flexible JSON document
> model.


## Docs\n`

/**
 * Render the llms.txt body: the preamble followed by one "###" section per
 * component, using the first entry of that component's `versions` array.
 * Components with no such entry, or whose entry has no nav sets, are skipped.
 *
 * @param {Array<object>} navObj - parsed siteNavigationData array.
 * @returns {string} full llms.txt contents.
 */
function buildLlmsTxt (navObj) {
  let output = LLMS_TXT_PREAMBLE

  for (const c of navObj) {
    const v = c.versions?.[0]
    if (!v?.sets?.length) continue

    // Only show the version label when it contains a digit.digit sequence.
    const version = /\d\.\d/.test(v.version) ? `(${v.version})` : ''
    output += `\n\n### ${c.title} ${version}\n`
    output += renderNavItem({ items: v.sets })
  }

  return output
}

/**
 * Render one navigation item and its descendants as a nested Markdown list.
 * Items without `content` emit no bullet and do not add an indent level.
 *
 * @param {object} item - nav item with optional content, url and items.
 * @param {number} [level=0] - nesting depth; each level indents 4 spaces.
 * @returns {string} Markdown list lines, each newline-terminated.
 */
function renderNavItem (item, level = 0) {
  let out = ''

  if (item.content) {
    // Quirk in the way the navigation data is generated: for a link with `^`
    // (rel="noopener" target="_blank") the content is the raw HTML anchor with
    // no url extracted. Markdown tolerates HTML snippets, but it is ugly for
    // readers of the raw markdown, so translate that HTML ourselves.
    const content = item.url
      ? `[${item.content}](${item.url})`
      : nhm.translate(item.content)

    out += `${' '.repeat(4 * level)}- ${content}\n`
  }

  const childLevel = level + (item.content ? 1 : 0)
  for (const child of item.items || []) {
    out += renderNavItem(child, childLevel)
  }

  return out
}

package-lock.json

Lines changed: 23 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"gulp-connect": "~5.7",
2020
"js-yaml": "~4.1",
2121
"markdown-it": "^13.0.1",
22+
"node-html-markdown": "^2.0.0",
2223
"ora": "^9.0.0",
2324
"picomatch": "^4.0.3",
2425
"striptags": "^3.2.0",

0 commit comments

Comments
 (0)