Skip to content

Commit 75ff0a0

Browse files
committed
feat: Expose llms.txt and markdown content that is easily accessible to LLMs
1 parent 5aebe3a commit 75ff0a0

File tree

10 files changed

+1158
-30
lines changed

10 files changed

+1158
-30
lines changed

gulp/helpers/html-index.js

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
const fs = require('fs');
2+
const { parse } = require('node-html-parser');
3+
const { createTurndownService } = require('./turndown-config');
4+
5+
/**
 * Rewrites links to rendered `.html` pages so they point at the `.md` files.
 *
 * Only links that belong to this site are touched: absolute
 * `https://camel.apache.org/...` URLs and relative markdown link targets.
 * Links to other hosts keep their `.html` suffix. A trailing `#fragment` or
 * `?query` is preserved.
 *
 * @param {string} markdown - Markdown text to post-process
 * @returns {string} The markdown with site-local `.html` links changed to `.md`
 */
function rewriteHtmlLinksToMd(markdown) {
  return markdown
    // Absolute URLs on camel.apache.org, wherever they occur in the text.
    .replace(/(https:\/\/camel\.apache\.org\/[^)\s#?]*)\.html(?![\w-])/g, '$1.md')
    // Relative link targets: skip anything with a URL scheme ("https:",
    // "mailto:", ...) or a protocol-relative "//host" prefix.
    .replace(/\]\((?![a-z][a-z0-9+.-]*:|\/\/)([^)\s#?]+)\.html(?=[#?)\s])/gi, ']($1.md');
}

/**
 * Converts a rendered HTML index page into a Markdown file written next to it.
 *
 * The output path is the input path with its `.html` suffix replaced by `.md`
 * (e.g. 'public/components/next/index.html' -> 'public/components/next/index.md').
 * Only the main article element is converted; navigation elements and
 * `a.anchor` links are removed first. A missing input file or a page without a
 * recognisable main element is skipped silently. Errors raised while
 * converting are logged with console.error and not rethrown.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.htmlPath - Path to the HTML file; must end in '.html'
 * @param {string} [config.title] - Title for the markdown file (e.g., 'Components Index')
 * @param {string} [config.description] - Description text (e.g., 'List of all Camel components')
 * @returns {Promise<void>} Resolves once the file has been written or skipped
 */
async function generateHtmlIndex(config) {
  const { htmlPath, title, description } = config;

  // Without a `.html` suffix the replace below would be a no-op and the
  // markdown would be written over the source file itself.
  if (typeof htmlPath !== 'string' || !/\.html$/.test(htmlPath)) {
    console.error(`Error generating markdown for ${htmlPath}:`, 'path must end in .html');
    return;
  }
  const mdPath = htmlPath.replace(/\.html$/, '.md');

  try {
    if (!fs.existsSync(htmlPath)) {
      return;
    }

    const root = parse(fs.readFileSync(htmlPath, 'utf8'));

    // Extract only the main article content, most specific selector first.
    const mainContent = root.querySelector('article.doc') ||
      root.querySelector('main') ||
      root.querySelector('.article') ||
      root.querySelector('article');
    if (!mainContent) {
      return;
    }

    // Remove navigation elements and heading anchor links.
    mainContent
      .querySelectorAll('nav, header, footer, .nav, .navbar, .toolbar')
      .forEach((el) => el.remove());
    mainContent.querySelectorAll('a.anchor').forEach((el) => el.remove());

    // Unwrap the block-level wrappers inside table cells so each cell is
    // reduced to its inline content before conversion.
    mainContent.querySelectorAll('td.tableblock, th.tableblock').forEach((cell) => {
      const html = cell.innerHTML
        // <div class="content"><div class="paragraph"><p>...</p></div></div>
        .replace(/<div class="content"><div class="paragraph">\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
        // <div class="content"><div id="..." class="paragraph"><p>...</p></div></div>
        .replace(/<div class="content"><div[^>]*class="paragraph"[^>]*>\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
        // <p class="tableblock">...</p>
        .replace(/<p class="tableblock">(.*?)<\/p>/gs, '$1');
      cell.set_content(html);
    });

    const turndownService = createTurndownService();
    const body = rewriteHtmlLinksToMd(turndownService.turndown(mainContent.innerHTML));

    // The header is only added when both title and description are given.
    const markdown = title && description
      ? `# ${title}\n\n${description}\n\n${body}`
      : body;

    fs.writeFileSync(mdPath, markdown, 'utf8');
  } catch (error) {
    console.error(`Error generating markdown for ${htmlPath}:`, error.message);
  }
}
81+
82+
/**
 * Generates markdown for every known HTML index page.
 *
 * Walks a fixed table of index pages and hands each one to generateHtmlIndex,
 * one after the other, logging when the run starts and when it finishes.
 *
 * @returns {Promise<void>} Resolves after the last page has been processed
 */
async function generateAllIndexes() {
  console.log('\nGenerating markdown for all index files...');

  // One row per index page: [htmlPath, title, description].
  const indexPages = [
    ['public/camel-k/next/index.html', 'Camel K Documentation Index', 'Index of Camel K documentation pages.'],
    ['public/camel-kafka-connector/next/index.html', 'Camel Kafka Connector Documentation Index', 'Index of Camel Kafka Connector documentation pages.'],
    ['public/camel-kamelets/next/index.html', 'Camel Kamelets Documentation Index', 'Index of Camel Kamelets documentation pages.'],
    ['public/camel-quarkus/next/index.html', 'Camel Quarkus Documentation Index', 'Index of Camel Quarkus documentation pages.'],
    ['public/camel-spring-boot/next/index.html', 'Camel Spring Boot Documentation Index', 'Index of Camel Spring Boot documentation pages.'],
    ['public/components/next/index.html', 'Components Index', 'Index of all Camel components.'],
    ['public/components/next/others/index.html', 'Other Components Index', 'Index of other Camel components.'],
    ['public/components/next/languages/index.html', 'Languages Index', 'Index of Camel expression and predicate languages.'],
    ['public/components/next/eips/index.html', 'Enterprise Integration Patterns Index', 'Index of Enterprise Integration Patterns (EIPs).'],
    ['public/components/next/dataformats/index.html', 'Data Formats Index', 'Index of Camel data formats.'],
    ['public/manual/index.html', 'User Manual Index', 'Index of Apache Camel user manual pages.'],
    ['public/manual/faq/index.html', 'FAQ Index', 'Frequently Asked Questions about Apache Camel.'],
  ];

  // Pages are converted strictly in table order, never concurrently.
  for (const [htmlPath, title, description] of indexPages) {
    await generateHtmlIndex({ htmlPath, title, description });
  }

  console.log('All index files generation complete');
}
160+
161+
module.exports = {
162+
generateHtmlIndex,
163+
generateAllIndexes
164+
};

gulp/helpers/rss-feed.js

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
const fs = require('fs');
2+
const xml2js = require('xml2js');
3+
4+
/**
 * Returns the text of the first element in an xml2js child list.
 *
 * xml2js yields a plain string for a text-only element and an object whose
 * `_` property holds the text when the element also carries attributes.
 *
 * @param {Array|undefined} nodes - Child list as produced by xml2js (e.g. item.title)
 * @returns {string} The element text, or '' when the element is absent or has no text
 */
function rssFieldText(nodes) {
  if (!Array.isArray(nodes) || nodes.length === 0) {
    return '';
  }
  const first = nodes[0];
  if (first === null || first === undefined) {
    return '';
  }
  if (typeof first === 'object') {
    return typeof first._ === 'string' ? first._ : '';
  }
  return String(first);
}

/**
 * Encodes a value as one cell of a pipe-delimited toon table row.
 *
 * Whitespace runs (including line breaks) are collapsed so a row always stays
 * on one line. The value is wrapped in double quotes, with `\` and `"`
 * escaped, whenever leaving it bare would be ambiguous: empty text, the `|`
 * delimiter, a colon, quote, backslash, bracket or brace, a leading hyphen,
 * or text that reads as true/false/null or a number.
 *
 * @param {string} value - Raw cell text
 * @returns {string} The cell, quoted when required
 */
function rssToonCell(value) {
  const text = value.replace(/\s+/g, ' ').trim();
  const mustQuote =
    text === '' ||
    /[|:"\\[\]{}]/.test(text) ||
    text.startsWith('-') ||
    /^(?:true|false|null)$/.test(text) ||
    /^\d+(?:\.\d+)?(?:e[+-]?\d+)?$/i.test(text);
  if (!mustQuote) {
    return text;
  }
  return `"${text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')}"`;
}

/**
 * Generic function to generate toon format markdown from RSS XML feeds.
 *
 * The output is written next to the feed, with the `.xml` suffix replaced by
 * `.md` (e.g. 'public/blog/index.xml' -> 'public/blog/index.md'). It consists
 * of a markdown title and description followed by one toon table
 * (https://github.com/toon-format/toon) with a row per feed item. Rows are
 * pipe-delimited and the table header declares that delimiter.
 *
 * A missing input file is skipped silently. A document without an
 * <rss><channel> element is reported and no file is written. Errors are logged
 * with console.error and not rethrown.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.xmlPath - Path to the XML file; must end in '.xml'
 * @param {string} config.title - Title for the markdown file (e.g., 'Apache Camel Blog')
 * @param {string} config.description - Description text (e.g., 'Blog posts about Apache Camel')
 * @param {string} config.itemsName - Name for the items collection (e.g., 'posts', 'releases')
 * @returns {Promise<void>} Resolves once the file has been written or skipped
 */
async function generateRssFeedIndex(config) {
  const { xmlPath, title, description, itemsName } = config;

  // Without a `.xml` suffix the replace below would be a no-op and the
  // output would be written over the feed itself.
  if (typeof xmlPath !== 'string' || !/\.xml$/.test(xmlPath)) {
    console.error(`Error generating toon format for ${xmlPath}:`, 'path must end in .xml');
    return;
  }
  const mdPath = xmlPath.replace(/\.xml$/, '.md');

  try {
    if (!fs.existsSync(xmlPath)) {
      return;
    }

    const parser = new xml2js.Parser();
    const result = await parser.parseStringPromise(fs.readFileSync(xmlPath, 'utf8'));

    const channel = result && result.rss && result.rss.channel && result.rss.channel[0];
    if (!channel) {
      // Writing an empty file would publish a blank page; report it instead.
      console.error(`Error generating toon format for ${xmlPath}:`, 'no <rss><channel> element found');
      return;
    }

    const items = channel.item || [];

    const rows = items.map((item) => {
      const link = rssFieldText(item.link).trim();
      // Directory-style links map to their index.md, page links swap the suffix.
      const mdLink = link.endsWith('/')
        ? `${link}index.md`
        : link.replace(/\.html$/, '.md');

      // Cap the description at 200 characters, counting code points so a
      // surrogate pair is never cut in half.
      const summary = Array.from(rssFieldText(item.description).replace(/\s+/g, ' ').trim())
        .slice(0, 200)
        .join('');

      const cells = [rssFieldText(item.title), mdLink, rssFieldText(item.pubDate), summary];
      return `  ${cells.map(rssToonCell).join('|')}\n`;
    });

    const toonContent =
      `# ${title}\n\n` +
      `${description}\n\n` +
      `${itemsName}[${items.length}|]{title|link|pubDate|description}:\n` +
      rows.join('');

    fs.writeFileSync(mdPath, toonContent, 'utf8');
  } catch (error) {
    console.error(`Error generating toon format for ${xmlPath}:`, error.message);
  }
}
69+
70+
/**
 * Generates the toon format markdown for the Releases category RSS feed.
 *
 * Passes public/categories/Releases/index.xml to generateRssFeedIndex, which
 * writes its output to the same path with `.xml` replaced by `.md`, and logs
 * before and after the conversion.
 *
 * @returns {Promise<void>} Resolves when generateRssFeedIndex has finished
 */
async function generateReleasesIndex() {
  const releasesFeed = {
    xmlPath: 'public/categories/Releases/index.xml',
    title: 'Apache Camel Releases',
    description: 'Release feed for Apache Camel and related projects.',
    itemsName: 'releases',
  };

  console.log('\nGenerating toon format for Releases index...');
  await generateRssFeedIndex(releasesFeed);
  console.log('Releases index toon format generation complete');
}
85+
86+
/**
 * Generates the toon format markdown for the Blog RSS feed.
 *
 * Passes public/blog/index.xml to generateRssFeedIndex, which writes its
 * output to the same path with `.xml` replaced by `.md`, and logs before and
 * after the conversion.
 *
 * @returns {Promise<void>} Resolves when generateRssFeedIndex has finished
 */
async function generateBlogIndex() {
  const blogFeed = {
    xmlPath: 'public/blog/index.xml',
    title: 'Apache Camel Blog',
    description: 'Blog posts about Apache Camel and related topics.',
    itemsName: 'posts',
  };

  console.log('\nGenerating toon format for Blog index...');
  await generateRssFeedIndex(blogFeed);
  console.log('Blog index toon format generation complete');
}
101+
102+
module.exports = {
103+
generateRssFeedIndex,
104+
generateReleasesIndex,
105+
generateBlogIndex
106+
};

gulp/helpers/toon-format.js

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
const fs = require('fs');
2+
const path = require('path');
3+
const xml2js = require('xml2js');
4+
5+
/**
6+
* Generates toon format sitemaps from XML sitemaps.
7+
* Converts all sitemap*.xml files to toon format (.md files)
8+
* as per https://github.com/toon-format/toon specification.
9+
*/
10+
async function generateToonSitemaps() {
11+
const parser = new xml2js.Parser();
12+
const glob = require('glob');
13+
14+
console.log('\nGenerating toon format sitemaps...');
15+
16+
// Find all sitemap*.xml files in the public directory
17+
const sitemapFiles = glob.sync('public/sitemap*.xml');
18+
19+
if (sitemapFiles.length === 0) {
20+
return;
21+
}
22+
23+
for (const xmlPath of sitemapFiles) {
24+
const sitemapFile = path.basename(xmlPath);
25+
const toonPath = xmlPath.replace(/\.xml$/, '.md');
26+
27+
try {
28+
// Read XML file
29+
const xmlContent = fs.readFileSync(xmlPath, 'utf8');
30+
31+
// Parse XML to JavaScript object
32+
const result = await parser.parseStringPromise(xmlContent);
33+
34+
let toonContent = '';
35+
36+
// Check if it's a sitemap index or a urlset
37+
if (result.sitemapindex) {
38+
// This is a sitemap index (sitemap.xml)
39+
const sitemaps = result.sitemapindex.sitemap || [];
40+
toonContent = `sitemaps[${sitemaps.length}]{loc}:\n`;
41+
for (const sitemap of sitemaps) {
42+
let loc = sitemap.loc ? sitemap.loc[0] : '';
43+
// Convert .xml URLs to .md
44+
loc = loc.replace(/\.xml$/, '.md');
45+
toonContent += ` ${loc}\n`;
46+
}
47+
} else if (result.urlset) {
48+
// This is a regular sitemap with URLs
49+
const urls = result.urlset.url || [];
50+
toonContent = `urls[${urls.length}]{loc,lastmod}:\n`;
51+
for (const url of urls) {
52+
let loc = url.loc ? url.loc[0] : '';
53+
const lastmod = url.lastmod ? url.lastmod[0] : '';
54+
// Convert .html URLs to .md
55+
loc = loc.replace(/\.html$/, '.md');
56+
toonContent += ` ${loc},${lastmod}\n`;
57+
}
58+
}
59+
60+
// Write toon format file
61+
fs.writeFileSync(toonPath, toonContent, 'utf8');
62+
console.log(`Generated ${sitemapFile.replace('.xml', '.md')}`);
63+
} catch (error) {
64+
console.error(`Error generating toon sitemap for ${sitemapFile}:`, error.message);
65+
}
66+
}
67+
68+
console.log(`Toon format sitemaps generation complete - ${sitemapFiles.length} files converted`);
69+
}
70+
71+
/**
 * Generates the /llms.txt file as per the https://llmstxt.org/ specification.
 *
 * The content of llms-txt-template.md, resolved two directories above this
 * module, is copied unchanged to public/llms.txt. A read or write failure
 * propagates to the caller as thrown by fs.
 *
 * @param {Array<string>} pages - Array of page URLs that were converted to
 *   markdown; accepted for callers but not read by this function
 */
function generateLlmsTxt(pages) {
  const templateFile = path.join(__dirname, '../../llms-txt-template.md');
  const outputFile = 'public/llms.txt';

  // The template is published verbatim; nothing is substituted into it.
  fs.writeFileSync(outputFile, fs.readFileSync(templateFile, 'utf8'), 'utf8');
  console.log('Generated /llms.txt');
}
86+
87+
module.exports = {
88+
generateToonSitemaps,
89+
generateLlmsTxt
90+
};

0 commit comments

Comments
 (0)