Skip to content

Commit e080a1a

Browse files
committed
feat: Expose llms.txt and markdown content that is easily accessible to LLMs
1 parent e208de9 commit e080a1a

File tree

10 files changed

+1175
-30
lines changed

10 files changed

+1175
-30
lines changed

gulp/helpers/html-index.js

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
const fs = require('fs');
2+
const { parse } = require('node-html-parser');
3+
const { createTurndownService } = require('./turndown-config');
4+
5+
/** Absolute Camel-site URLs ending in `.html`; group 1 is everything before the extension. */
const CAMEL_SITE_HTML_URL = /(https?:\/\/camel\.apache\.org\/[^)\s]*?)\.html\b/g;

/**
 * Markdown links whose target ends in `.html`, optionally followed by a
 * `#fragment` and/or a quoted link title.
 * Groups: 1 = link text, 2 = target without extension, 3 = fragment, 4 = title.
 */
const MARKDOWN_HTML_LINK = /\[([^\]]+)\]\(([^)]+?)\.html(#[^)\s]*)?(\s+"[^"]*")?\)/g;

/** Targets that carry a URL scheme or are protocol-relative, i.e. not site-relative. */
const EXTERNAL_LINK_TARGET = /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i;

/**
 * Points links at the generated `.md` files instead of the `.html` pages.
 * Links to other hosts are left untouched: no markdown copy is generated for
 * them, so rewriting their extension would break them.
 */
function rewriteHtmlLinksToMarkdown(markdown) {
  return markdown
    .replace(CAMEL_SITE_HTML_URL, '$1.md')
    .replace(MARKDOWN_HTML_LINK, (match, text, target, fragment = '', linkTitle = '') =>
      (EXTERNAL_LINK_TARGET.test(target)
        ? match
        : `[${text}](${target}.md${fragment}${linkTitle})`));
}

/** Strips the block-level wrappers around table cell text so only the text is left. */
function unwrapTableCellHtml(cellHtml) {
  return cellHtml
    // Unwrap <div class="content"><div class="paragraph"><p>...</p></div></div>
    .replace(/<div class="content"><div class="paragraph">\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
    // Unwrap <div class="content"><div id="..." class="paragraph"><p>...</p></div></div>
    .replace(/<div class="content"><div[^>]*class="paragraph"[^>]*>\s*<p>(.*?)<\/p>\s*<\/div><\/div>/gs, '$1')
    // Also handle simple <p class="tableblock">...</p> wrappers
    .replace(/<p class="tableblock">(.*?)<\/p>/gs, '$1');
}

/**
 * Generic function to generate markdown from HTML index pages.
 *
 * Reads the HTML file, keeps only the main article element, removes navigation
 * elements and `a.anchor` links, unwraps table cell content, converts the rest
 * to Markdown and writes it next to the source with a `.md` extension.
 *
 * A missing file, or a page without a main content element, is skipped without
 * output. Any other failure is logged and swallowed so that a single broken
 * page does not stop the remaining pages from being generated.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.htmlPath - Path to the HTML file (e.g., 'public/components/next/index.html')
 * @param {string} [config.title] - Title for the markdown file (e.g., 'Components Index')
 * @param {string} [config.description] - Description text (e.g., 'List of all Camel components')
 * @returns {Promise<void>} Resolves once the file has been written or skipped; never rejects.
 */
async function generateHtmlIndex(config) {
  const { htmlPath, title, description } = config;
  const mdPath = htmlPath.replace(/\.html$/, '.md');

  try {
    if (!fs.existsSync(htmlPath)) {
      return;
    }

    const root = parse(fs.readFileSync(htmlPath, 'utf8'));
    const turndownService = createTurndownService();

    // Extract only the main article content, most specific selector first.
    const mainContent = root.querySelector('article.doc') ||
      root.querySelector('main') ||
      root.querySelector('.article') ||
      root.querySelector('article');

    if (!mainContent) {
      return;
    }

    // Remove navigation elements and anchor links.
    mainContent
      .querySelectorAll('nav, header, footer, .nav, .navbar, .toolbar')
      .forEach((el) => el.remove());
    mainContent.querySelectorAll('a.anchor').forEach((el) => el.remove());

    // Clean up table cells by unwrapping div.content and div.paragraph wrappers.
    mainContent.querySelectorAll('td.tableblock, th.tableblock').forEach((cell) => {
      cell.set_content(unwrapTableCellHtml(cell.innerHTML));
    });

    let markdown = rewriteHtmlLinksToMarkdown(turndownService.turndown(mainContent.innerHTML));

    // Prepend whichever header parts were supplied; with both present the
    // result is "# title", blank line, description, blank line, content.
    const headerParts = [];
    if (title) {
      headerParts.push(`# ${title}`);
    }
    if (description) {
      headerParts.push(description);
    }
    if (headerParts.length > 0) {
      markdown = `${headerParts.join('\n\n')}\n\n${markdown}`;
    }

    fs.writeFileSync(mdPath, markdown, 'utf8');
  } catch (error) {
    console.error(`Error generating markdown for ${htmlPath}:`, error.message);
  }
}
80+
81+
/**
82+
* Generates markdown for all index files (HTML index pages).
83+
* This function processes all the index files specified in the configuration.
84+
*/
85+
async function generateAllIndexes() {
86+
console.log('\nGenerating markdown for all index files...');
87+
88+
const glob = require('glob');
89+
90+
// Define all HTML index files to process
91+
const htmlIndexes = [
92+
{
93+
htmlPath: 'public/camel-k/next/index.html',
94+
title: 'Camel K Documentation Index',
95+
description: 'Index of Camel K documentation pages.'
96+
},
97+
{
98+
htmlPath: 'public/camel-kafka-connector/next/index.html',
99+
title: 'Camel Kafka Connector Documentation Index',
100+
description: 'Index of Camel Kafka Connector documentation pages.'
101+
},
102+
{
103+
htmlPath: 'public/camel-kamelets/next/index.html',
104+
title: 'Camel Kamelets Documentation Index',
105+
description: 'Index of Camel Kamelets documentation pages.'
106+
},
107+
{
108+
htmlPath: 'public/camel-quarkus/next/index.html',
109+
title: 'Camel Quarkus Documentation Index',
110+
description: 'Index of Camel Quarkus documentation pages.'
111+
},
112+
{
113+
htmlPath: 'public/camel-spring-boot/next/index.html',
114+
title: 'Camel Spring Boot Documentation Index',
115+
description: 'Index of Camel Spring Boot documentation pages.'
116+
},
117+
{
118+
htmlPath: 'public/components/next/index.html',
119+
title: 'Components Index',
120+
description: 'Index of all Camel components.'
121+
},
122+
{
123+
htmlPath: 'public/components/next/others/index.html',
124+
title: 'Other Components Index',
125+
description: 'Index of other Camel components.'
126+
},
127+
{
128+
htmlPath: 'public/components/next/languages/index.html',
129+
title: 'Languages Index',
130+
description: 'Index of Camel expression and predicate languages.'
131+
},
132+
{
133+
htmlPath: 'public/components/next/eips/index.html',
134+
title: 'Enterprise Integration Patterns Index',
135+
description: 'Index of Enterprise Integration Patterns (EIPs).'
136+
},
137+
{
138+
htmlPath: 'public/components/next/dataformats/index.html',
139+
title: 'Data Formats Index',
140+
description: 'Index of Camel data formats.'
141+
},
142+
{
143+
htmlPath: 'public/manual/index.html',
144+
title: 'User Manual Index',
145+
description: 'Index of Apache Camel user manual pages.'
146+
},
147+
{
148+
htmlPath: 'public/manual/faq/index.html',
149+
title: 'FAQ Index',
150+
description: 'Frequently Asked Questions about Apache Camel.'
151+
},
152+
{
153+
htmlPath: 'public/releases/index.html',
154+
title: 'Releases Index',
155+
description: 'Apache Camel version releases Index.'
156+
}
157+
];
158+
159+
// Process all HTML indexes
160+
for (const config of htmlIndexes) {
161+
await generateHtmlIndex(config);
162+
}
163+
164+
// Find all index.html files under public/releases/**/
165+
console.log('\nGenerating markdown for all release index files...');
166+
const releaseIndexFiles = glob.sync('public/releases/**/index.html');
167+
console.log(`Found ${releaseIndexFiles.length} release index files to process`);
168+
169+
// Process each release index file without custom title/description
170+
for (const htmlPath of releaseIndexFiles) {
171+
await generateHtmlIndex({ htmlPath });
172+
}
173+
174+
console.log('All index files generation complete');
175+
}
176+
177+
// Public API of the HTML index helper.
module.exports = { generateHtmlIndex, generateAllIndexes };

gulp/helpers/llms-txt.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
const fs = require('fs');
2+
const path = require('path');
3+
4+
/**
 * Generates the /llms.txt file as per https://llmstxt.org/ specification.
 * This file helps LLMs discover and understand the structure of the documentation.
 *
 * The content is taken verbatim from `llms-txt-template.md`, resolved two
 * directories above this module, and written to `public/llms.txt` relative to
 * the current working directory. The output directory is created when missing.
 *
 * @param {Array<string>} [pages] - Array of page URLs that were converted to
 *   markdown. Currently not used: the template is emitted as-is. The parameter
 *   is kept so existing callers keep working.
 * @throws {Error} When the template cannot be read or the output cannot be written.
 */
function generateLlmsTxt(pages) {
  const templatePath = path.join(__dirname, '../../llms-txt-template.md');
  const outputPath = 'public/llms.txt';

  const llmsTxtContent = fs.readFileSync(templatePath, 'utf8');

  // Avoid an ENOENT on the write when the output directory does not exist yet.
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, llmsTxtContent, 'utf8');
  console.log('Generated /llms.txt');
}
19+
20+
// Public API of the llms.txt helper.
module.exports = { generateLlmsTxt };

gulp/helpers/rss-feed.js

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
const fs = require('fs');
2+
const xml2js = require('xml2js');
3+
4+
/** Delimiter used between the fields of every item row. */
const RSS_ROW_DELIMITER = '|';

/** Maximum number of characters kept from an item description. */
const RSS_DESCRIPTION_MAX_LENGTH = 200;

/**
 * Returns the text of the first occurrence of an xml2js element with every
 * whitespace run collapsed to a single space, or '' when the element is absent.
 * xml2js yields a plain string for simple elements and an object carrying the
 * text under `_` for elements that have attributes; both are handled.
 */
function rssFlatText(values) {
  const first = Array.isArray(values) ? values[0] : values;
  let text = '';
  if (typeof first === 'string') {
    text = first;
  } else if (first && typeof first._ === 'string') {
    text = first._;
  }
  return text.replace(/\s+/g, ' ').trim();
}

/** Quotes a value that contains the row delimiter so it cannot split the row. */
function rssRowField(value) {
  if (!value.includes(RSS_ROW_DELIMITER)) {
    return value;
  }
  return `"${value.replace(/[\\"]/g, '\\$&')}"`;
}

/** Maps a page link to its markdown counterpart. */
function rssMarkdownLink(link) {
  return link.endsWith('/') ? `${link}index.md` : link.replace(/\.html$/, '.md');
}

/**
 * Generic function to generate toon format markdown from RSS XML feeds.
 * Converts RSS feeds to plain text files (.md) as per https://github.com/toon-format/toon specification.
 *
 * The output is a markdown heading and description followed by one tabular
 * block: a header line declaring the item count, the pipe delimiter and the
 * field names, then one pipe-separated row per feed item. Whitespace inside a
 * field is collapsed so every item stays on one line, descriptions are cut to
 * 200 characters, and fields containing a pipe are double-quoted.
 *
 * A missing file is skipped silently; a document that is not an RSS feed is
 * skipped with a warning and no file is written. Any other failure is logged
 * and swallowed.
 *
 * @param {Object} config - Configuration object
 * @param {string} config.xmlPath - Path to the XML file (e.g., 'public/blog/index.xml')
 * @param {string} config.title - Title for the markdown file (e.g., 'Apache Camel Blog')
 * @param {string} config.description - Description text (e.g., 'Blog posts about Apache Camel')
 * @param {string} config.itemsName - Name for the items collection (e.g., 'posts', 'releases')
 * @returns {Promise<void>} Resolves once the file has been written or skipped; never rejects.
 */
async function generateRssFeedIndex(config) {
  const { xmlPath, title, description, itemsName } = config;
  const mdPath = xmlPath.replace(/\.xml$/, '.md');

  try {
    if (!fs.existsSync(xmlPath)) {
      return;
    }

    const parser = new xml2js.Parser();
    const result = await parser.parseStringPromise(fs.readFileSync(xmlPath, 'utf8'));

    const channel = result && result.rss && result.rss.channel && result.rss.channel[0];
    if (!channel) {
      console.warn(`Skipping ${xmlPath}: not an RSS feed`);
      return;
    }

    const items = channel.item || [];
    const columns = ['title', 'link', 'pubDate', 'description'].join(RSS_ROW_DELIMITER);

    // The header declares the delimiter the rows actually use.
    let toonContent = `# ${title}\n\n`;
    toonContent += `${description}\n\n`;
    toonContent += `${itemsName}[${items.length}${RSS_ROW_DELIMITER}]{${columns}}:\n`;

    for (const item of items) {
      // Truncate by code point so a surrogate pair is never cut in half.
      const shortDescription = Array.from(rssFlatText(item.description))
        .slice(0, RSS_DESCRIPTION_MAX_LENGTH)
        .join('');

      const row = [
        rssFlatText(item.title),
        rssMarkdownLink(rssFlatText(item.link)),
        rssFlatText(item.pubDate),
        shortDescription
      ].map(rssRowField).join(RSS_ROW_DELIMITER);

      toonContent += ` ${row}\n`;
    }

    fs.writeFileSync(mdPath, toonContent, 'utf8');
  } catch (error) {
    console.error(`Error generating toon format for ${xmlPath}:`, error.message);
  }
}
69+
70+
/**
 * Builds the toon format markdown for the Releases category RSS feed.
 * Reads public/categories/Releases/index.xml and delegates to
 * generateRssFeedIndex, which writes index.md next to it
 * as per https://github.com/toon-format/toon specification.
 *
 * @returns {Promise<void>}
 */
async function generateReleasesIndex() {
  const releasesFeed = {
    xmlPath: 'public/categories/Releases/index.xml',
    title: 'Apache Camel Releases',
    description: 'Release feed for Apache Camel and related projects.',
    itemsName: 'releases'
  };

  console.log('\nGenerating toon format for Releases index...');
  await generateRssFeedIndex(releasesFeed);
  console.log('Releases index toon format generation complete');
}
85+
86+
/**
 * Builds the toon format markdown for the Blog RSS feed.
 * Reads public/blog/index.xml and delegates to generateRssFeedIndex,
 * which writes index.md next to it
 * as per https://github.com/toon-format/toon specification.
 *
 * @returns {Promise<void>}
 */
async function generateBlogIndex() {
  const blogFeed = {
    xmlPath: 'public/blog/index.xml',
    title: 'Apache Camel Blog',
    description: 'Blog posts about Apache Camel and related topics.',
    itemsName: 'posts'
  };

  console.log('\nGenerating toon format for Blog index...');
  await generateRssFeedIndex(blogFeed);
  console.log('Blog index toon format generation complete');
}
101+
102+
// Public API of the RSS feed helper.
module.exports = { generateRssFeedIndex, generateReleasesIndex, generateBlogIndex };

gulp/helpers/toon-format.js

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
const fs = require('fs');
2+
const path = require('path');
3+
const xml2js = require('xml2js');
4+
5+
/**
6+
* Generates toon format sitemaps from XML sitemaps.
7+
* Converts all sitemap*.xml files to toon format (.md files)
8+
* as per https://github.com/toon-format/toon specification.
9+
*/
10+
async function generateToonSitemaps() {
11+
const parser = new xml2js.Parser();
12+
const glob = require('glob');
13+
14+
console.log('\nGenerating toon format sitemaps...');
15+
16+
// Find all sitemap*.xml files in the public directory
17+
const sitemapFiles = glob.sync('public/sitemap*.xml');
18+
19+
if (sitemapFiles.length === 0) {
20+
return;
21+
}
22+
23+
for (const xmlPath of sitemapFiles) {
24+
const sitemapFile = path.basename(xmlPath);
25+
const toonPath = xmlPath.replace(/\.xml$/, '.md');
26+
27+
try {
28+
// Read XML file
29+
const xmlContent = fs.readFileSync(xmlPath, 'utf8');
30+
31+
// Parse XML to JavaScript object
32+
const result = await parser.parseStringPromise(xmlContent);
33+
34+
let toonContent = '';
35+
36+
// Check if it's a sitemap index or a urlset
37+
if (result.sitemapindex) {
38+
// This is a sitemap index (sitemap.xml)
39+
const sitemaps = result.sitemapindex.sitemap || [];
40+
toonContent = `sitemaps[${sitemaps.length}]{loc}:\n`;
41+
for (const sitemap of sitemaps) {
42+
let loc = sitemap.loc ? sitemap.loc[0] : '';
43+
// Convert .xml URLs to .md
44+
loc = loc.replace(/\.xml$/, '.md');
45+
toonContent += ` ${loc}\n`;
46+
}
47+
} else if (result.urlset) {
48+
// This is a regular sitemap with URLs
49+
const urls = result.urlset.url || [];
50+
toonContent = `urls[${urls.length}]{loc,lastmod}:\n`;
51+
for (const url of urls) {
52+
let loc = url.loc ? url.loc[0] : '';
53+
const lastmod = url.lastmod ? url.lastmod[0] : '';
54+
// Convert .html URLs to .md
55+
loc = loc.replace(/\.html$/, '.md');
56+
toonContent += ` ${loc},${lastmod}\n`;
57+
}
58+
}
59+
60+
// Write toon format file
61+
fs.writeFileSync(toonPath, toonContent, 'utf8');
62+
console.log(`Generated ${sitemapFile.replace('.xml', '.md')}`);
63+
} catch (error) {
64+
console.error(`Error generating toon sitemap for ${sitemapFile}:`, error.message);
65+
}
66+
}
67+
68+
console.log(`Toon format sitemaps generation complete - ${sitemapFiles.length} files converted`);
69+
}
70+
71+
// Public API of the toon sitemap helper.
module.exports = { generateToonSitemaps };

0 commit comments

Comments
 (0)