|
4 | 4 | * |
5 | 5 | * 1. Markdown endpoints — serves a clean .md file alongside every HTML page |
6 | 6 | * 2. llms.txt — discovery index listing all pages with links to .md endpoints |
| 7 | + * 3. Agent signaling — injects a hidden llms.txt directive right after <body> |
| 8 | + * in every HTML page so agents discover it early (before nav/sidebar) |
7 | 9 | * |
8 | 10 | * Runs in the astro:build:done hook so it operates on the final build output. |
9 | 11 | */ |
@@ -62,8 +64,119 @@ function findSection(filePath) { |
62 | 64 | return best; |
63 | 65 | } |
64 | 66 |
|
| 67 | +// Path to the CLI reference page — split into per-command endpoints for agents. |
| 68 | +const CLI_REFERENCE = "reference/cli.md"; |
| 69 | + |
/**
 * Pick the one-line description of a CLI command section.
 *
 * The description is the first plain-text line between the `## ...` heading
 * and the `**Usage:**` line. The scan stops at the first `**Usage` line or
 * markdown heading so that a command without a description yields "" instead
 * of picking up unrelated text further down the section (for example an
 * indented "Default value: ..." continuation line of an option).
 *
 * @param {string} section - Markdown of one command, starting at its heading.
 * @returns {string} The trimmed description, or "" when there is none.
 */
function extractCliDescription(section) {
  const lines = section.split("\n");
  // Index 0 is the `## ...` heading itself.
  for (let i = 1; i < lines.length; i++) {
    const line = lines[i];
    if (line.startsWith("**Usage") || line.startsWith("#")) break;
    // Skip blank lines and list items / bold markers.
    if (line.trim() && !line.startsWith("*")) return line.trim();
  }
  return "";
}

/**
 * Split the CLI reference into per-command markdown files.
 *
 * Each `## \`icp ...\`` heading in CLI_REFERENCE becomes its own file under
 * `<outDir>/reference/cli/` (`icp` → index.md, `icp build` → build.md,
 * `icp canister call` → canister-call.md). Subcommand list items are
 * rewritten into links to the corresponding per-command files.
 *
 * Relies on `fs`, `path`, `CLI_REFERENCE` and `BOM` from the enclosing module
 * (BOM is defined outside this block; it is prepended verbatim to every
 * generated file).
 *
 * @param {string} outDir - Root directory of the final build output.
 * @returns {Array<{file: string, title: string, description: string, isTopLevel: boolean}>}
 *   Metadata for each generated sub-page (for llms.txt). Empty when the CLI
 *   reference file does not exist in `outDir`.
 * @throws {Error} On a slug collision between two commands, when no command
 *   headings are found, or when a generated file links to a missing .md file.
 */
function splitCliReference(outDir) {
  const cliMd = path.join(outDir, CLI_REFERENCE);
  if (!fs.existsSync(cliMd)) return [];

  const content = fs
    .readFileSync(cliMd, "utf-8")
    // Strip the clap-markdown generation footer that appears at the end.
    .replace(/\n*<hr\/>\s*\n*<small>[\s\S]*?<\/small>\s*$/, "\n");
  // Split on ## `icp ...` headings, keeping the heading with the section.
  const sections = content.split(/^(?=## `icp\b)/m).filter((s) => s.trim());

  const subDir = path.join(outDir, "reference", "cli");
  fs.mkdirSync(subDir, { recursive: true });

  const subPages = [];
  const seenSlugs = new Map(); // slug → command name, for collision detection
  for (const section of sections) {
    // Anything before the first command heading (title, intro) has no match.
    const match = section.match(/^## `(icp[\w\s-]*?)`/);
    if (!match) continue;

    const command = match[1].trim();
    // "" for the bare root command, "canister-call" for "icp canister call".
    const parentSlug = command.replace(/^icp ?/, "").replace(/ /g, "-");
    const slug = command === "icp" ? "index" : parentSlug;
    const fileName = `${slug}.md`;

    // Detect slug collisions (e.g., "icp foo-bar" vs "icp foo bar").
    if (seenSlugs.has(slug)) {
      throw new Error(
        `CLI reference split: slug collision for "${fileName}" ` +
          `between commands "${seenSlugs.get(slug)}" and "${command}"`
      );
    }
    seenSlugs.set(slug, command);

    const description = extractCliDescription(section);

    // Rewrite subcommand list items to link to their per-command endpoints.
    // e.g., `* \`call\` — ...` → `* [\`call\`](canister-call.md) — ...`
    // Options (`--flag`) and arguments (`<ARG>`) do not start with a word
    // character, so they are left untouched.
    const body = section
      .replace(/^## [^\n]+\n+/, "")
      .replace(/^\* `(\w[\w-]*)` —/gm, (_, sub) => {
        const subSlug = parentSlug ? `${parentSlug}-${sub}` : sub;
        return `* [\`${sub}\`](${subSlug}.md) —`;
      });

    fs.writeFileSync(
      path.join(subDir, fileName),
      BOM + `# ${command}\n\n` + body + "\n"
    );

    subPages.push({
      file: `reference/cli/${fileName}`,
      title: `\`${command}\``,
      description,
      // Top-level commands have exactly one space (e.g., "icp build").
      // The bare "icp" root and deep subcommands are excluded from llms.txt.
      isTopLevel: (command.match(/ /g) || []).length === 1,
    });
  }

  // Validate: the CLI reference should contain commands. If the format changed
  // and nothing was extracted, fail loudly rather than silently producing no output.
  if (subPages.length === 0) {
    throw new Error(
      "CLI reference split: no commands found. " +
        "Expected ## `icp ...` headings in " +
        CLI_REFERENCE
    );
  }

  // Validate: all relative .md links in generated files point to existing files.
  for (const { file } of subPages) {
    const filePath = path.join(outDir, file);
    const md = fs.readFileSync(filePath, "utf-8");
    // `[^\s)]+` keeps one match from spanning two adjacent links on a line.
    for (const [, target] of md.matchAll(/\]\(([^\s)]+\.md)\)/g)) {
      // Absolute URLs (https:, mailto:, ...) are not files in the build output.
      if (/^[a-z][a-z0-9+.-]*:/i.test(target)) continue;
      if (!fs.existsSync(path.join(path.dirname(filePath), target))) {
        throw new Error(
          `CLI reference split: broken link in ${file}: ` +
            `${target} does not exist`
        );
      }
    }
  }

  return subPages;
}
| 177 | + |
65 | 178 | /** Generate llms.txt content from collected page metadata. */ |
66 | | -function generateLlmsTxt(pages, siteUrl, basePath) { |
| 179 | +function generateLlmsTxt(pages, siteUrl, basePath, cliSubPages) { |
67 | 180 | const base = (siteUrl + basePath).replace(/\/$/, ""); |
68 | 181 |
|
69 | 182 | const skillsBase = |
@@ -149,6 +262,20 @@ function generateLlmsTxt(pages, siteUrl, basePath) { |
149 | 262 | ? `- [${page.title}](${url}): ${page.description}` |
150 | 263 | : `- [${page.title}](${url})`; |
151 | 264 | lines.push(entry); |
| 265 | + |
| 266 | + // Nest top-level command endpoints under the CLI Reference entry. |
| 267 | + // Subcommands (e.g., "icp canister call") are omitted from the index |
| 268 | + // but still available as .md endpoints for agents to fetch on demand. |
| 269 | + if (page.file === CLI_REFERENCE && cliSubPages.length > 0) { |
| 270 | + for (const sub of cliSubPages) { |
| 271 | + if (!sub.isTopLevel) continue; |
| 272 | + const subUrl = `${base}/${sub.file}`; |
| 273 | + const subEntry = sub.description |
| 274 | + ? ` - [${sub.title}](${subUrl}): ${sub.description}` |
| 275 | + : ` - [${sub.title}](${subUrl})`; |
| 276 | + lines.push(subEntry); |
| 277 | + } |
| 278 | + } |
152 | 279 | } |
153 | 280 | lines.push(""); |
154 | 281 | } |
@@ -200,12 +327,58 @@ export default function agentDocs() { |
200 | 327 |
|
201 | 328 | logger.info(`Generated ${pages.length} markdown endpoints`); |
202 | 329 |
|
| 330 | + // 1b. Split CLI reference into per-command endpoints for agents |
| 331 | + const cliSubPages = splitCliReference(outDir); |
| 332 | + if (cliSubPages.length > 0) { |
| 333 | + logger.info( |
| 334 | + `Split CLI reference into ${cliSubPages.length} per-command endpoints` |
| 335 | + ); |
| 336 | + } |
| 337 | + |
203 | 338 | // 2. Generate llms.txt |
204 | | - const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath); |
| 339 | + const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath, cliSubPages); |
205 | 340 | fs.writeFileSync(path.join(outDir, "llms.txt"), llmsTxt); |
206 | 341 | logger.info( |
207 | 342 | `Generated llms.txt (${llmsTxt.length} chars, ${pages.length} pages)` |
208 | 343 | ); |
| 344 | + |
| 345 | + // 3. Inject agent signaling directive into HTML pages |
| 346 | + // Places a visually-hidden blockquote right after <body> so it appears |
| 347 | + // early in the document (within the first ~15%), before nav/sidebar. |
| 348 | + // Uses CSS clip-rect (not display:none) so it survives HTML-to-markdown |
| 349 | + // conversion. See: https://agentdocsspec.com |
| 350 | + const llmsTxtUrl = `${basePath}llms.txt`; |
| 351 | + const directive = |
| 352 | + `<blockquote class="agent-signaling" data-pagefind-ignore>` + |
| 353 | + `<p>For AI agents: Documentation index at ` + |
| 354 | + `<a href="${llmsTxtUrl}">${llmsTxtUrl}</a></p></blockquote>`; |
| 355 | + const htmlFiles = fs.globSync("**/*.html", { cwd: outDir }); |
| 356 | + let injected = 0; |
| 357 | + for (const file of htmlFiles) { |
| 358 | + const filePath = path.join(outDir, file); |
| 359 | + const html = fs.readFileSync(filePath, "utf-8"); |
| 360 | + const bodyIdx = html.indexOf("<body"); |
| 361 | + if (bodyIdx === -1) continue; |
| 362 | + const closeIdx = html.indexOf(">", bodyIdx); |
| 363 | + if (closeIdx === -1) continue; |
| 364 | + const insertAt = closeIdx + 1; |
| 365 | + fs.writeFileSync( |
| 366 | + filePath, |
| 367 | + html.slice(0, insertAt) + directive + html.slice(insertAt) |
| 368 | + ); |
| 369 | + injected++; |
| 370 | + } |
| 371 | + logger.info(`Injected agent signaling into ${injected} HTML pages`); |
| 372 | + |
| 373 | + // 4. Alias sitemap-index.xml → sitemap.xml |
| 374 | + // Astro's sitemap integration outputs sitemap-index.xml, but crawlers |
| 375 | + // and the agentdocsspec checker expect /sitemap.xml by convention. |
| 376 | + const sitemapIndex = path.join(outDir, "sitemap-index.xml"); |
| 377 | + const sitemapAlias = path.join(outDir, "sitemap.xml"); |
| 378 | + if (fs.existsSync(sitemapIndex) && !fs.existsSync(sitemapAlias)) { |
| 379 | + fs.copyFileSync(sitemapIndex, sitemapAlias); |
| 380 | + logger.info("Copied sitemap-index.xml → sitemap.xml"); |
| 381 | + } |
209 | 382 | }, |
210 | 383 | }, |
211 | 384 | }; |
|
0 commit comments