/** Version stamp written into every captured-tree payload. */
export const CAPTURE_TREE_VERSION = 1;

/**
 * Last capture version whose payloads may be a bare tree (no
 * `{version, tree}` envelope). Typed as `number` so the comparison in
 * `unwrapCapturedTree` keeps compiling once CAPTURE_TREE_VERSION is bumped.
 */
const LAST_UNWRAPPED_VERSION: number = 1;

/** Hex-encoded SHA-1 of `value`; used only to derive stable file names. */
function hashString(value: string): string {
  return crypto.createHash('sha1').update(value).digest('hex');
}

/** Directory (inside `<siteDir>/.docusaurus`) that holds the captured tree files. */
export function captureDirForSite(siteDir: string): string {
  return path.join(siteDir, '.docusaurus', 'llms-export', 'trees');
}

/**
 * Map a source file to the JSON file that stores its captured tree.
 *
 * The file name is the SHA-1 of the source's real path, so symlinked and
 * direct references to the same file resolve to the same capture.
 *
 * @param sourceAbsPath Path of the Markdown/MDX source file.
 * @param siteDir Site root directory.
 * @returns `<capture dir>/<sha1>.json`.
 */
export function capturedTreePathForSource(
  sourceAbsPath: string,
  siteDir: string,
): string {
  let absolutePath: string;
  try {
    absolutePath = fs.realpathSync(sourceAbsPath);
  } catch {
    // Deliberate best effort: the file may not exist (yet); fall back to a
    // purely lexical resolution so a path is still produced.
    absolutePath = path.resolve(sourceAbsPath);
  }
  return path.join(
    captureDirForSite(siteDir),
    `${hashString(absolutePath)}.json`,
  );
}

/** Wrap a tree in the versioned `{version, tree}` envelope. */
export function wrapCapturedTree(tree: unknown): {
  version: number;
  tree: unknown;
} {
  return {version: CAPTURE_TREE_VERSION, tree};
}

/** True when `value` is a plain (non-array) object carrying a string `type`. */
function isNodeLike(value: unknown): value is MdastNode {
  return (
    typeof value === 'object' &&
    value !== null &&
    !Array.isArray(value) &&
    typeof (value as {type?: unknown}).type === 'string'
  );
}

/**
 * Extract the tree from a parsed capture payload.
 *
 * Accepts the versioned envelope produced by `wrapCapturedTree` and, while
 * the current version is still `LAST_UNWRAPPED_VERSION`, a bare tree object.
 *
 * @param payload Parsed JSON of unknown shape.
 * @returns The tree, or `null` when the payload is missing, has a different
 *   version, or does not look like a node (no string `type`).
 */
export function unwrapCapturedTree(payload: unknown): MdastNode | null {
  if (!payload || typeof payload !== 'object') return null;

  const record = payload as Record<string, unknown>;

  // Versioned envelope. `Number()` also accepts a version serialized as "1".
  if ('version' in record && 'tree' in record) {
    if (Number(record.version) !== CAPTURE_TREE_VERSION) return null;
    return isNodeLike(record.tree) ? record.tree : null;
  }

  // Bare-tree payloads are only trusted for the first capture version, and
  // only when they actually look like a node (arrays and stray objects are
  // rejected instead of being cast).
  if (CAPTURE_TREE_VERSION === LAST_UNWRAPPED_VERSION && isNodeLike(payload)) {
    return payload;
  }

  return null;
}
/**
 * Turn a doc's `source` field into a filesystem path.
 *
 * - no source        -> empty string
 * - `@site/...`      -> joined onto `siteDir` with the alias removed
 * - absolute path    -> returned unchanged
 * - anything else    -> treated as relative to `siteDir`
 */
function getSourcePath(doc: DocInfo, siteDir: string): string {
  const source = doc.source;
  if (!source) return '';

  const siteAlias = '@site/';
  if (source.startsWith(siteAlias)) {
    return path.join(siteDir, source.slice(siteAlias.length));
  }

  return path.isAbsolute(source) ? source : path.join(siteDir, source);
}
/** Drop a trailing colon (plus any whitespace after it), then trim the result. */
function sanitizeDescription(desc: string): string {
  const withoutTrailingColon = desc.replace(/:\s*$/, '');
  return withoutTrailingColon.trim();
}

/**
 * Report whether a description looks like leaked JSX/MDX: it contains `<` or
 * `{`, or it starts with an `import` statement (case-insensitive).
 */
function isNoisyDescription(desc: string): boolean {
  const noisePattern = /[<{]|^import\s/i;
  return noisePattern.test(desc);
}

/**
 * Join entries with newlines, skipping an empty entry whenever the entry
 * right before it in the input was also empty. The joined text is trimmed
 * and terminated with exactly one newline.
 */
function joinLines(lines: string[]): string {
  const kept: string[] = [];
  let previous: string | undefined;

  for (const line of lines) {
    // Compare against the previous *input* entry, not the previous kept one.
    if (line !== '' || previous !== '') {
      kept.push(line);
    }
    previous = line;
  }

  return `${kept.join('\n').trim()}\n`;
}
/** Backslash-escape `[` and `]` so arbitrary text is safe inside `[...]` link text. */
function escapeLinkText(text: string): string {
  return text.replace(/[\[\]]/g, '\\$&');
}

/**
 * Render one llms.txt bullet for a doc:
 * `- [Breadcrumb / Path: Title](url): description`.
 *
 * @param entry Doc plus its sidebar breadcrumb and optional label override.
 * @param siteBase Site base URL, forwarded to the URL helpers.
 * @param maxDescriptionLength Limit passed to `truncate` for the description.
 * @returns A single Markdown list item (no trailing newline).
 */
function buildFlatDocLine(
  entry: FlatDocEntry,
  siteBase: string | undefined,
  maxDescriptionLength: number,
): string {
  const {breadcrumb, doc, labelOverride} = entry;

  const url = urlForPermalink(doc.permalink, siteBase);

  // A blank sidebar label must not produce an empty link text, so fall back
  // to the doc title for '' / whitespace as well as for undefined.
  const baseTitle = labelOverride?.trim() || doc.title;

  // Prepend breadcrumb to title if present
  const title = breadcrumb.length
    ? `${breadcrumb.join(' / ')}: ${baseTitle}`
    : baseTitle;

  // Skip descriptions that look like JSX/MDX noise, and skip the ": " suffix
  // entirely when nothing is left after sanitizing (e.g. description ":").
  let desc = '';
  if (doc.description && !isNoisyDescription(doc.description)) {
    const cleaned = sanitizeDescription(
      truncate(doc.description, maxDescriptionLength),
    );
    if (cleaned) desc = `: ${cleaned}`;
  }

  // Brackets in titles/labels would otherwise terminate the link text early.
  return `- [${escapeLinkText(title)}](${toMarkdownUrl(url, siteBase)})${desc}`;
}
params.orderedSidebars.filter((name) => + optionalSet.has(name), + ); + + // Always include Optional section with llms-full.txt reference (like ElizaOS) + lines.push('## Optional', ''); + if (params.siteBase) { + const llmsFullUrl = joinUrl(params.siteBase, params.llmsFullTxtFilename); + lines.push(`- [Docs for LLMs](${llmsFullUrl})`); + } else { + lines.push(`- [Docs for LLMs](/${params.llmsFullTxtFilename})`); + } + + for (const sidebarName of optionalSidebars) { + const items = params.docsSidebars[sidebarName]; + if (!Array.isArray(items) || !items.length) continue; + + const flatEntries = collectFlatDocEntries(items, params.docsById); + for (const entry of flatEntries) { + lines.push( + buildFlatDocLine(entry, params.siteBase, params.maxDescriptionLength), + ); + } + } + lines.push(''); + + await fsp.writeFile( + path.join(params.outDir, params.llmsTxtFilename), + joinLines(lines), + 'utf8', + ); +} + +async function writeLlmsFullTxt(params: GeneratorParams): Promise { + const lines: string[] = []; + const processedDocIds = new Set(); + + // Helper to render a single doc in flat format (H1 title, natural heading levels) + function renderFlatDoc(doc: DocInfo): string[] { + const docLines: string[] = []; + const tree = getCachedTree(doc, params.siteDir); + if (!tree) return docLines; + + // H1 title for each doc (flat approach like ElizaOS/Coinbase) + docLines.push(`# ${doc.title}`); + + // Source URL + if (params.siteBase) { + docLines.push('', `Source: ${joinUrl(params.siteBase, doc.permalink)}`); + } + + // Render body with baseHeadingLevel: 1 (in-doc H1 stays H1, H2 stays H2, etc.) 
/**
 * Write one `<permalink>.md` file per visible doc that has a captured tree.
 *
 * Each file contains an H1 title, an optional `Source:` line, the rendered
 * body (internal links rewritten with `linkTarget: 'md'`), and an optional
 * footer pointing at the llms.txt file. The `Source:` line and footer are
 * only emitted when `params.siteBase` is set.
 *
 * NOTE(review): the output path is derived from the full permalink; if the
 * site uses a non-root `baseUrl` that is part of the permalink, confirm the
 * files land where the site expects them.
 *
 * @returns The number of files written.
 */
async function writePerPageMarkdown(params: GeneratorParams): Promise<number> {
  const docs = Array.from(params.docsById.values())
    .filter(isVisibleDoc)
    .sort((a, b) => a.permalink.localeCompare(b.permalink));

  // The footer does not depend on the doc, so build it once.
  const llmsTxtUrl = params.siteBase
    ? joinUrl(params.siteBase, params.llmsTxtFilename)
    : '';
  const footer = llmsTxtUrl
    ? `---\n\n*To find navigation and other pages in this documentation, fetch the llms.txt file at: ${llmsTxtUrl}*`
    : '';

  let count = 0;
  for (const doc of docs) {
    const tree = getCachedTree(doc, params.siteDir);
    if (!tree) continue; // no captured tree -> nothing to render

    const body = renderMarkdownFromCapturedTree(tree, {
      docsByPermalink: params.docsByPermalink,
      currentPermalink: doc.permalink,
      siteBase: params.siteBase,
      baseHeadingLevel: 1,
      titleToStrip: doc.title,
      linkTarget: 'md',
    });

    // Strip every leading/trailing slash (not just one) so a permalink such
    // as "/foo//" cannot turn into the hidden file "foo/.md".
    const slug =
      doc.permalink.replace(/^\/+/, '').replace(/\/+$/, '') || 'index';
    const filePath = path.join(params.outDir, `${slug}.md`);

    await fsp.mkdir(path.dirname(filePath), {recursive: true});

    const source = params.siteBase
      ? `Source: ${joinUrl(params.siteBase, doc.permalink)}`
      : '';

    // Empty entries are harmless: joinLines drops consecutive blank lines
    // and trims the result.
    const content = joinLines([
      `# ${doc.title}`,
      '',
      source,
      '',
      body.trim(),
      '',
      footer,
    ]);

    await fsp.writeFile(filePath, content, 'utf8');
    count++;
  }

  return count;
}
'', + ); + if (isVisibleDoc(doc) && activePage) { + const permalink = doc.permalink.replace(/\/+$/, '') || '/'; + if (!activePage.childPermalinks.includes(permalink)) { + activePage.childPermalinks.push(permalink); + } + } + } else if (normalized.type === 'category') { + const linkedDoc = getLinkedDoc(item, params.docsById); + if (isVisibleDoc(linkedDoc) && activePage) { + const permalink = linkedDoc.permalink.replace(/\/+$/, '') || '/'; + if (!activePage.childPermalinks.includes(permalink)) { + activePage.childPermalinks.push(permalink); + } + } + + const generatedSlug = getGeneratedIndexSlug(item); + let newPage: GeneratedIndexPage | undefined; + + if (generatedSlug) { + const normalizedSlug = generatedSlug.replace(/\/+$/, '') || '/'; + if ( + !existingPermalinks.has(normalizedSlug) && + !coveredSlugs.has(normalizedSlug) + ) { + coveredSlugs.add(normalizedSlug); + newPage = { + slug: normalizedSlug, + title: String(normalized.label ?? '').trim() || normalizedSlug, + description: + getCategoryDescription(item, linkedDoc).trim() || undefined, + childPermalinks: [], + }; + pages.push(newPage); + } + } + + collectGeneratedIndexes(normalized.items ?? [], newPage ?? activePage); + } + } + } + + for (const sidebarName of params.orderedSidebars) { + const items = params.docsSidebars[sidebarName]; + if (Array.isArray(items)) { + collectGeneratedIndexes(items); + } + } + + let count = 0; + for (const page of pages) { + const slug = page.slug === '/' ? 'index' : page.slug.replace(/^\//, ''); + const filePath = path.join(params.outDir, `${slug}.md`); + + await fsp.mkdir(path.dirname(filePath), {recursive: true}); + + const childLinks = page.childPermalinks.map((permalink) => { + const doc = + params.docsByPermalink.get(permalink) ?? + params.docsByPermalink.get(permalink + '/'); + const title = doc?.title ?? 
/**
 * Generate every llms export: llms.txt, llms-full.txt, per-page `.md` files
 * and generated-index `.md` files.
 *
 * Before writing, visible docs without a captured tree are reported as
 * skipped, and docs whose description did not come from front matter get a
 * description derived from their tree (this mutates the `DocInfo` objects
 * held in `params.docsById`).
 *
 * The module-level tree cache is always cleared on exit, including when one
 * of the writers throws.
 */
export async function generateLlmsExports(
  params: GeneratorParams,
): Promise<void> {
  try {
    // Get all visible docs
    const allDocs = Array.from(params.docsById.values())
      .filter(isVisibleDoc)
      .sort((a, b) => a.permalink.localeCompare(b.permalink));

    // Single pass: record docs without a captured tree, and derive a
    // description for the rest when front matter did not supply one.
    const skippedDocs: DocInfo[] = [];
    for (const doc of allDocs) {
      const tree = doc.source ? getCachedTree(doc, params.siteDir) : null;
      if (!tree) {
        skippedDocs.push(doc);
        continue;
      }
      if (doc.descriptionFromFrontMatter) continue;

      const derived = deriveDescriptionFromTree(tree);
      if (derived) doc.description = derived;
    }

    // Missing trees are not fatal: warn and carry on.
    if (skippedDocs.length) {
      console.warn(
        `llms-export: warning: skipping ${skippedDocs.length} docs without captured trees`,
      );
      for (const doc of skippedDocs) {
        console.warn(`  - ${doc.id} (${doc.permalink})`);
      }
    }

    // Generate all outputs
    await writeLlmsTxt(params);
    await writeLlmsFullTxt(params);

    const perPageCount = await writePerPageMarkdown(params);
    console.info(`llms-export: generated ${perPageCount} per-page .md files`);

    const generatedIndexCount = await writeGeneratedIndexPages(params);
    if (generatedIndexCount) {
      console.info(
        `llms-export: generated ${generatedIndexCount} generated-index .md files`,
      );
    }

    // Final summary
    if (skippedDocs.length) {
      console.warn(
        `llms-export: completed with ${skippedDocs.length} skipped docs (missing trees)`,
      );
    }
  } finally {
    // Clear cache to free memory, even when a writer failed part-way.
    treeCache.clear();
  }
}
/**
 * Docusaurus plugin that exports the docs in LLM-friendly form.
 *
 * - `allContentLoaded` indexes the docs plugin's metadata (by id and by
 *   permalink) and remembers the sidebars.
 * - `postBuild` resolves options against the defaults and calls
 *   `generateLlmsExports` to write the files into `outDir`.
 *
 * @param ctx Docusaurus context; `siteConfig` and `siteDir` are read.
 * @param opts Plugin options; anything that is not an object is ignored.
 */
export default function llmsPlugin(
  ctx: DocusaurusContext,
  opts: unknown,
): {
  name: string;
  allContentLoaded: (args: {allContent: unknown}) => Promise<void>;
  postBuild: (args: {outDir: string}) => Promise<void>;
} {
  const options = (isObject(opts) ? opts : {}) as Options;
  const docsById = new Map<string, DocInfo>();
  const docsByPermalink = new Map<string, DocInfo>();
  let sidebars: Record<string, SidebarItem[]> = {};

  // `||` (not `??`) on purpose: an env var that is set but empty must fall
  // through to the next candidate.
  const deployUrl =
    process.env.DEPLOY_PRIME_URL ||
    process.env.DEPLOY_URL ||
    ctx.siteConfig?.url ||
    '';
  const normalizedUrl = String(deployUrl).replace(/\/$/, '');
  const siteBase = joinUrl(
    normalizedUrl + '/',
    String(ctx.siteConfig?.baseUrl ?? '/'),
  );

  return {
    name: 'llms-export-plugin',

    async allContentLoaded({allContent}: {allContent: unknown}) {
      const version = getVersion(allContent);
      if (!version) {
        throw new Error('llms-export: failed to locate docs plugin content');
      }

      sidebars = (version.docsSidebars ?? version.sidebars ?? {}) as Record<
        string,
        SidebarItem[]
      >;
      if (Object.keys(sidebars).length === 0) {
        console.warn('llms-export: docs sidebars not found or empty');
      }

      // Start from a clean index so docs removed or renamed since a previous
      // run of this hook do not linger.
      docsById.clear();
      docsByPermalink.clear();

      const docs = extractDocs(version);
      for (const rawDoc of docs) {
        // Entries may be the metadata itself or an object wrapping it.
        const metadata: DocMetadata | undefined =
          isObject(rawDoc) && isObject(rawDoc.metadata)
            ? rawDoc.metadata
            : rawDoc;

        if (!isObject(metadata)) continue;

        const id = extractDocId(metadata);
        const title = typeof metadata.title === 'string' ? metadata.title : '';
        const permalink =
          typeof metadata.permalink === 'string' ? metadata.permalink : '';

        // A doc without id, title or permalink cannot be linked; skip it.
        if (!id || !title || !permalink) continue;

        const {description, fromFrontMatter} = extractDescription(metadata);
        const source =
          typeof metadata.source === 'string' ? metadata.source : undefined;

        const info: DocInfo = {
          id,
          title,
          description: description || undefined,
          descriptionFromFrontMatter: fromFrontMatter,
          permalink,
          source,
          unlisted: Boolean(metadata.unlisted),
          draft: Boolean(metadata.draft),
        };

        docsById.set(id, info);
        docsByPermalink.set(permalink, info);
      }
    },

    async postBuild({outDir}: {outDir: string}) {
      const title =
        String(ctx.siteConfig?.title ?? 'Documentation').trim() ||
        'Documentation';
      const tagline = String(ctx.siteConfig?.tagline ?? '').trim();
      const defaultSummary =
        '> Oasis Protocol developer documentation for Sapphire, ROFL, node operation, and Oasis Core.';
      const summary = tagline ? `> ${tagline}` : defaultSummary;

      // Use configured options with defaults
      const sidebarOrder = options.sidebarOrder ?? DEFAULT_SIDEBAR_ORDER;
      const sidebarTitles: Record<string, string> = {
        ...DEFAULT_SIDEBAR_TITLES,
        ...options.sidebarTitles,
      };
      const optionalSidebars =
        options.optionalSidebars ?? DEFAULT_OPTIONAL_SIDEBARS;

      // Configured sidebars first (in configured order, existing ones only),
      // then every remaining sidebar in its original order.
      const sidebarNames = Object.keys(sidebars);
      const orderedSidebars = [
        ...sidebarOrder.filter((name) => sidebarNames.includes(name)),
        ...sidebarNames.filter((name) => !sidebarOrder.includes(name)),
      ];

      await generateLlmsExports({
        siteDir: ctx.siteDir,
        outDir,
        siteBase,
        title,
        summary,
        docsById,
        docsByPermalink,
        docsSidebars: sidebars,
        orderedSidebars,
        optionalSidebars,
        sectionTitleForSidebar: (name) => sidebarTitles[name] ?? name,
        llmsTxtFilename: options.llmsTxtFilename ?? 'llms.txt',
        llmsFullTxtFilename: options.llmsFullTxtFilename ?? 'llms-full.txt',
        maxDescriptionLength: options.maxDescriptionLength ?? 200,
      });
    },
  };
}
expect(toMarkdownUrl('/?q=1#top')).toBe('/index.md?q=1#top'); +}); + +test('toMarkdownUrl: converts internal doc-like paths and preserves query/hash', () => { + expect(toMarkdownUrl('/build/rofl/?x=1#sec')).toBe('/build/rofl.md?x=1#sec'); + expect(toMarkdownUrl('/build/rofl/index.mdx?x=1#sec')).toBe( + '/build/rofl.md?x=1#sec', + ); + expect(toMarkdownUrl('/build/rofl/README.mdx?x=1#sec')).toBe( + '/build/rofl.md?x=1#sec', + ); +}); + +test('toMarkdownUrl: converts same-host absolute URLs when siteBase is provided', () => { + expect( + toMarkdownUrl( + 'https://docs.oasis.io/build/rofl/?x=1#sec', + 'https://docs.oasis.io/', + ), + ).toBe('https://docs.oasis.io/build/rofl.md?x=1#sec'); + + expect( + toMarkdownUrl('https://docs.oasis.io/?q=1#top', 'https://docs.oasis.io/'), + ).toBe('https://docs.oasis.io/index.md?q=1#top'); +}); + +test('toMarkdownUrl: leaves absolute URLs on other hosts unchanged', () => { + expect( + toMarkdownUrl('https://example.com/build/rofl/', 'https://docs.oasis.io/'), + ).toBe('https://example.com/build/rofl/'); +}); + +test('toMarkdownUrl: does not append .md to asset URLs with file extensions', () => { + expect(toMarkdownUrl('/static/files/whitepaper.pdf?download=1#page=2')).toBe( + '/static/files/whitepaper.pdf?download=1#page=2', + ); + expect(toMarkdownUrl('/img/logo.png#x')).toBe('/img/logo.png#x'); + expect( + toMarkdownUrl( + 'https://docs.oasis.io/static/files/whitepaper.pdf?download=1#page=2', + 'https://docs.oasis.io/', + ), + ).toBe('https://docs.oasis.io/static/files/whitepaper.pdf?download=1#page=2'); +}); + +test('truncate: never exceeds maxLength', () => { + const text = 'a'.repeat(10_000); + for (const maxLength of [1, 2, 3, 4, 10, 50, 200]) { + expect(truncate(text, maxLength)).toHaveLength(maxLength); + } +}); + +test('truncate: returns empty when maxLength is non-positive', () => { + expect(truncate('hello', 0)).toBe(''); + expect(truncate('hello', -1)).toBe(''); +}); + +test('truncate: returns input when already short 
enough', () => { + expect(truncate('hello', 10)).toBe('hello'); +}); + +test('truncate: uses word boundary + ellipsis when possible', () => { + const text = 'hello world from oasis'; + const value = truncate(text, 14); + expect(value.length).toBeLessThanOrEqual(14); + expect(value.endsWith('...')).toBe(true); +}); + +test('deriveDescriptionFromTree: skips 1-word paragraphs', () => { + const tree = root( + h(1, 'Title'), + p('Run'), + p('Run this command to get node status.'), + ); + + expect(deriveDescriptionFromTree(tree)).toBe( + 'Run this command to get node status.', + ); +}); + +test('deriveDescriptionFromTree: skips ADR metadata sections', () => { + const tree = root( + h(1, 'ADR 0001'), + h(2, 'Component'), + p('Oasis Core'), + h(2, 'Changelog'), + list(li(p('2020-01-01: Initial version'))), + h(2, 'Status'), + p('Accepted'), + h(2, 'Context'), + p('This ADR explains the decision.'), + ); + + expect(deriveDescriptionFromTree(tree)).toBe( + 'This ADR explains the decision.', + ); +}); + +test('deriveDescriptionFromTree: skips image-only paragraphs', () => { + const tree = root( + h(1, 'Title'), + p('Image: Diagram'), + p('This page explains how to deploy ROFL apps.'), + ); + + expect(deriveDescriptionFromTree(tree)).toBe( + 'This page explains how to deploy ROFL apps.', + ); +}); + +test('renderMarkdownFromCapturedTree: rewrites .mdx and index links to permalinks', () => { + const tree = root({ + type: 'paragraph', + children: [ + link('/build/rofl/quickstart.mdx#prereq', 'Quickstart'), + t(' '), + link('/build/rofl/index.mdx#top', 'ROFL'), + t(' '), + link('/build/rofl/README.mdx#top', 'ROFL README'), + ], + }); + const docsByPermalink = new Map([ + ['/build/rofl/quickstart', {permalink: '/build/rofl/quickstart'}], + ['/build/rofl/', {permalink: '/build/rofl/'}], + ]); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, {docsByPermalink}); + + 
expect(output).toContain('](/build/rofl/quickstart#prereq)'); + expect(output).toContain('](/build/rofl/#top)'); + expect(output).not.toContain('/build/rofl/README.mdx'); +}); + +test('renderMarkdownFromCapturedTree: linkTarget md rewrites internal links to .md mirrors', () => { + const tree = root({ + type: 'paragraph', + children: [ + link('/build/rofl/quickstart.mdx#prereq', 'Quickstart'), + t(' '), + link('/build/rofl/index.mdx#top', 'ROFL'), + t(' '), + link('/build/rofl/README.mdx#top', 'ROFL README'), + t(' '), + link('../features/?x=1#sec', 'Features'), + ], + }); + + const docsByPermalink = new Map([ + ['/build/rofl/quickstart', {permalink: '/build/rofl/quickstart'}], + ['/build/rofl/', {permalink: '/build/rofl/'}], + ]); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink, + currentPermalink: '/build/rofl/workflow/create', + linkTarget: 'md', + }); + + expect(output).toContain('](/build/rofl/quickstart.md#prereq)'); + expect(output).toContain('](/build/rofl.md#top)'); + expect(output).not.toContain('/build/rofl/README.mdx'); + expect(output).toContain('](/build/rofl/features.md?x=1#sec)'); +}); + +test('renderMarkdownFromCapturedTree: linkTarget md preserves non-markdown asset links', () => { + const tree = root({ + type: 'paragraph', + children: [ + link('/static/files/whitepaper.pdf?download=1#page=2', 'Whitepaper'), + ], + }); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink: new Map(), + linkTarget: 'md', + }); + + expect(output).toContain('](/static/files/whitepaper.pdf?download=1#page=2)'); +}); + +test('renderMarkdownFromCapturedTree: rewrites relative reference definitions', () => { + const tree = root( + { + type: 'paragraph', + children: [ + { + type: 'linkReference', + identifier: 'qs', + label: 'qs', + referenceType: 'full', + children: [t('Quickstart')], + }, + ], + }, + { 
+ type: 'definition', + identifier: 'qs', + label: 'qs', + url: '../rofl/quickstart#prereq', + }, + ); + + const docsByPermalink = new Map([ + ['/build/rofl/quickstart', {permalink: '/build/rofl/quickstart'}], + ]); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink, + currentPermalink: '/build/use-cases/key-generation', + }); + + expect(output).toContain('[qs]: /build/rofl/quickstart#prereq'); +}); + +test('renderMarkdownFromCapturedTree: rewrites relative inline links', () => { + const tree = root({ + type: 'paragraph', + children: [link('../rofl/quickstart#prereq', 'Quickstart')], + }); + + const docsByPermalink = new Map([ + ['/build/rofl/quickstart', {permalink: '/build/rofl/quickstart'}], + ]); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink, + currentPermalink: '/build/use-cases/key-generation', + }); + + expect(output).toContain('](/build/rofl/quickstart#prereq)'); +}); + +test('renderMarkdownFromCapturedTree: rewrites relative links without canonical docs', () => { + const tree = root({ + type: 'paragraph', + children: [link('../features/', 'Other features')], + }); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink: new Map(), + currentPermalink: '/build/rofl/workflow/create', + }); + + expect(output).toContain('](/build/rofl/features/)'); +}); + +test('stripNoisyNodes: preserves TabItem labels and HTML text', () => { + const tree = root( + { + type: 'mdxJsxFlowElement', + name: 'TabItem', + attributes: [{type: 'mdxJsxAttribute', name: 'label', value: 'macOS'}], + children: [p('Run the command.')], + }, + {type: 'html', value: 'Use x1
Next'}, + ); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink: new Map(), + }); + + expect(output).toContain('**Tab**: macOS'); + expect(output).toContain('Run the command.'); + expect(output).toContain('Use x1'); + expect(output).toContain('Next'); + expect(output).not.toContain(''); + expect(output).not.toContain('
'); +}); + +test('stripNoisyNodes: converts mdx img to alt text', () => { + const tree = root({ + type: 'paragraph', + children: [ + { + type: 'mdxJsxTextElement', + name: 'img', + attributes: [ + {type: 'mdxJsxAttribute', name: 'alt', value: 'Diagram'}, + {type: 'mdxJsxAttribute', name: 'src', value: 'diagram.svg'}, + ], + children: [], + }, + ], + }); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, { + docsByPermalink: new Map(), + }); + + expect(output).toContain('Image: Diagram'); +}); + +test('stripNoisyNodes: converts DocCard to a link with proper title', () => { + const tree = root({ + type: 'mdxJsxFlowElement', + name: 'DocCard', + attributes: [ + { + type: 'mdxJsxAttribute', + name: 'item', + value: { + type: 'mdxJsxAttributeValueExpression', + value: "findSidebarItem('/build/rofl/workflow/deploy')", + }, + }, + ], + children: [], + }); + + const docsByPermalink = new Map([ + [ + '/build/rofl/workflow/deploy', + {permalink: '/build/rofl/workflow/deploy', title: 'Deploy'}, + ], + ]); + + const cleaned = stripNoisyNodes(structuredClone(tree)); + const output = renderMarkdownFromCapturedTree(cleaned, {docsByPermalink}); + + expect(output).toContain('* [Deploy](/build/rofl/workflow/deploy)'); +}); diff --git a/src/plugins/llms/markdown.ts b/src/plugins/llms/markdown.ts new file mode 100644 index 0000000000..97d8ad71dc --- /dev/null +++ b/src/plugins/llms/markdown.ts @@ -0,0 +1,931 @@ +import * as fs from 'fs'; +import * as path from 'path'; +import {toMarkdown as mdastToMarkdown} from 'mdast-util-to-markdown'; +import {gfmToMarkdown} from 'mdast-util-gfm'; +import remarkFrontmatter from 'remark-frontmatter'; +import remarkGfm from 'remark-gfm'; +import remarkMdx from 'remark-mdx'; +import remarkParse from 'remark-parse'; +import remarkDirective from 'remark-directive'; +import {unified} from 'unified'; +import {visit} from 'unist-util-visit'; +import { + capturedTreePathForSource, + 
unwrapCapturedTree, + wrapCapturedTree, +} from './capture'; +import type {MdastNode} from './types'; +import {isMdastNode, PHRASING_NODE_TYPES} from './types'; + +// Types +export interface PermalinkDoc { + permalink: string; + title?: string; + unlisted?: boolean; + draft?: boolean; +} + +type LinkTarget = 'html' | 'md'; + +// URL Utilities +export function normalizeTitle(value: string): string { + return value.trim().replace(/\s+/g, ' ').toLowerCase(); +} + +export function isExternalUrl(href: string): boolean { + return href.startsWith('//') || /^[a-z][a-z0-9+.-]*:/i.test(href); +} + +export function joinUrl(base: string, href: string): string { + try { + return new URL(href, base).toString(); + } catch { + return href; + } +} + +export function urlForPermalink(permalink: string, siteBase?: string): string { + const normalized = permalink.startsWith('/') ? permalink : `/${permalink}`; + return siteBase ? joinUrl(siteBase, normalized) : normalized; +} + +/** + * Determine if a pathname should be converted to .md extension. + * Returns true for doc-like paths, false for assets with file extensions. + */ +function shouldConvertToMarkdown(pathname: string): boolean { + const normalized = pathname.replace(/\/+$/, ''); + if (!normalized) return true; + if (/\.(md|mdx)$/i.test(normalized)) return true; + + const lastSegment = normalized.split('/').pop() ?? ''; + // If last segment has a file extension, it's an asset - don't convert + return !/\.[a-z0-9]+$/i.test(lastSegment); +} + +/** + * Convert a pathname to its .md equivalent. + */ +function toMarkdownPathname(pathname: string): string { + const normalized = pathname.replace(/\/+$/, ''); + if (!normalized) return '/index.md'; + + const withoutExtension = normalized.replace(/\.(md|mdx)$/i, ''); + + // Handle /foo/index -> /foo.md + if (withoutExtension.endsWith('/index')) { + const parent = withoutExtension.slice(0, -'/index'.length) || '/'; + return parent === '/' ? 
'/index.md' : `${parent}.md`; + } + + // Handle /foo/README -> /foo.md (common Docusaurus pattern) + if (/\/readme$/i.test(withoutExtension)) { + const parent = withoutExtension.slice(0, -'/README'.length) || '/'; + return parent === '/' ? '/index.md' : `${parent}.md`; + } + + return `${withoutExtension}.md`; +} + +/** + * Convert a URL to its markdown mirror equivalent. + * Handles both relative URLs and absolute URLs with siteBase. + */ +export function toMarkdownUrl(url: string, siteBase?: string): string { + if (!url) return url; + + if (!siteBase) { + if (isExternalUrl(url)) return url; + try { + const parsed = new URL(url, 'https://_/'); + if (!shouldConvertToMarkdown(parsed.pathname)) { + return parsed.pathname + parsed.search + parsed.hash; + } + parsed.pathname = toMarkdownPathname(parsed.pathname); + return parsed.pathname + parsed.search + parsed.hash; + } catch { + return url; + } + } + + try { + const baseUrl = new URL(siteBase); + const targetUrl = new URL(url, baseUrl); + + // Don't convert URLs on different hosts + if (isExternalUrl(url) && targetUrl.host !== baseUrl.host) { + return url; + } + + if (!shouldConvertToMarkdown(targetUrl.pathname)) { + return targetUrl.toString(); + } + + targetUrl.pathname = toMarkdownPathname(targetUrl.pathname); + return targetUrl.toString(); + } catch { + return url; + } +} + +/** + * Truncate text to maxLength, preferring sentence boundaries. + */ +export function truncate(text: string, maxLength: number): string { + const cleaned = text.trim().replace(/\s+/g, ' '); + if (!cleaned || maxLength <= 0) return ''; + if (cleaned.length <= maxLength) return cleaned; + + const clipped = cleaned.slice(0, maxLength); + + // Try to break at sentence boundary + const sentenceEnds = [ + clipped.lastIndexOf('. '), + clipped.lastIndexOf('! '), + clipped.lastIndexOf('? 
'), + ]; + const lastSentenceEnd = Math.max(...sentenceEnds); + + if (lastSentenceEnd > maxLength * 0.4) { + return cleaned.slice(0, lastSentenceEnd + 1).trim(); + } + + // Fall back to word boundary with ellipsis + if (maxLength <= 3) return clipped.trim(); + + const forEllipsis = cleaned.slice(0, maxLength - 3); + const lastSpace = forEllipsis.lastIndexOf(' '); + + // Prefer breaking at word boundary if we're past halfway point + const breakAtWord = lastSpace > forEllipsis.length * 0.5; + const truncated = breakAtWord ? forEllipsis.slice(0, lastSpace) : forEllipsis; + + return truncated.trimEnd() + '...'; +} + +// Tree Loading +let hasLoggedFallbackParse = false; + +function atomicWriteJson(filename: string, payload: unknown): void { + const dir = path.dirname(filename); + const base = path.basename(filename); + const tempFile = path.join(dir, `.${base}.${process.pid}.${Date.now()}.tmp`); + + fs.mkdirSync(dir, {recursive: true}); + fs.writeFileSync(tempFile, JSON.stringify(payload), 'utf8'); + + try { + fs.renameSync(tempFile, filename); + } catch (error) { + // Retry with force removal on Windows or other systems with file locking + try { + fs.rmSync(filename, {force: true}); + fs.renameSync(tempFile, filename); + } catch { + try { + fs.rmSync(tempFile, {force: true}); + } catch { + // Ignore cleanup failures + } + throw error; + } + } +} + +function parseSourceTree(sourceAbsPath: string): MdastNode | null { + let content: string; + try { + content = fs.readFileSync(sourceAbsPath, 'utf8'); + } catch { + return null; + } + + const parse = (input: string, {mdx}: {mdx: boolean}): MdastNode | null => { + try { + const processor = unified() + .use(remarkParse) + .use(remarkGfm) + .use(remarkDirective) + .use(remarkFrontmatter, ['yaml']); + + if (mdx) processor.use(remarkMdx); + + const tree = processor.parse(input) as unknown as MdastNode; + return tree && typeof tree === 'object' ? tree : null; + } catch { + return null; + } + }; + + // Try full MDX parsing first. 
+ const mdxTree = parse(content, {mdx: true}); + if (mdxTree) return mdxTree; + + // Some docs include HTML comments like `` which + // are not valid JSX and can make MDX parsing fail. Strip them and retry. + const withoutHtmlComments = content.replace(//g, ''); + const strippedMdxTree = parse(withoutHtmlComments, {mdx: true}); + if (strippedMdxTree) return strippedMdxTree; + + // Final fallback: parse as plain Markdown. + return parse(content, {mdx: false}); +} + +export function loadCapturedTree( + sourceAbsPath: string, + siteDir: string, +): MdastNode | null { + const treePath = capturedTreePathForSource(sourceAbsPath, siteDir); + + try { + if (fs.existsSync(treePath)) { + const content = fs.readFileSync(treePath, 'utf8'); + return unwrapCapturedTree(JSON.parse(content)) ?? null; + } + } catch { + // Fall through to source parsing + } + + // Fallback: parse the markdown source directly. This guards against builds + // where Webpack persistent cache skips running remark plugins, so capture + // files aren't produced. + const parsedTree = parseSourceTree(sourceAbsPath); + if (!parsedTree) return null; + + if (!hasLoggedFallbackParse) { + console.warn( + 'llms-export: warning: captured trees missing; falling back to parsing source files (webpack cache may have skipped remark plugins)', + ); + hasLoggedFallbackParse = true; + } + + // Best-effort write-through so future cached builds can reuse the parsed tree. + try { + atomicWriteJson(treePath, wrapCapturedTree(parsedTree)); + } catch (error) { + const message = error instanceof Error ? 
error.message : 'unknown error'; + console.warn( + `llms-export: warning: failed to write fallback tree: ${message}`, + ); + } + + return parsedTree; +} + +// AST Text Extraction +function extractNodeText(node: unknown): string { + if (!isMdastNode(node)) return ''; + if (typeof node.value === 'string') return node.value; + if (!Array.isArray(node.children)) return ''; + + return node.children + .map(extractNodeText) + .filter(Boolean) + .join(' ') + .replace(/\s+/g, ' ') + .trim(); +} + +function stripHeadingId(text: string): string { + return text.replace(/\s*\{#[^}]+\}\s*$/, '').trim(); +} + +// Description Derivation +const IGNORED_SECTIONS = new Set(['component', 'status', 'changelog']); + +/** + * Extract a description from a document tree. + * Skips metadata sections, single-word paragraphs, and image placeholders. + */ +export function deriveDescriptionFromTree(tree: MdastNode | unknown): string { + if (!isMdastNode(tree) || !Array.isArray(tree.children)) return ''; + + let currentSection = ''; + + for (const child of tree.children) { + if (!child || typeof child !== 'object') continue; + + // Track current section for filtering + if (child.type === 'heading') { + currentSection = stripHeadingId(extractNodeText(child)); + continue; + } + + // Look for paragraphs (directly or in lists) + if (child.type === 'paragraph' || child.type === 'list') { + const paragraphs = + child.type === 'paragraph' + ? [child] + : (child.children ?? []).flatMap((listItem: MdastNode) => + (listItem?.children ?? 
[]).filter( + (c: MdastNode) => c?.type === 'paragraph', + ), + ); + + for (const paragraph of paragraphs) { + // Skip ignored sections (like ADR metadata) + if (IGNORED_SECTIONS.has(normalizeTitle(currentSection))) continue; + + const text = extractNodeText(paragraph) + .replace(/\s+([,.;:!?\)\]\}])/g, '$1') + .replace(/([\(\[\{])\s+/g, '$1') + .replace(/\s+/g, ' ') + .trim(); + + // Skip empty, image placeholders, or admonition labels + if (!text) continue; + if (/^image:/i.test(text)) continue; + if (/^(Tip|Note|Info|Warning|Caution|Danger):$/i.test(text)) continue; + + // Require at least 2 words + const wordCount = text.split(/\s+/).filter(Boolean).length; + if (wordCount > 1) return text; + } + } + } + + return ''; +} + +/** + * Find the canonical permalink for a given pathname. + * Handles trailing slashes, .md/.mdx extensions, and /index paths. + */ +function findCanonicalPermalink( + docs: ReadonlyMap, + pathname: string, +): string | null { + function checkPermalink(path: string): string | null { + const doc = docs.get(path); + if (!doc || doc.unlisted || doc.draft) return null; + return doc.permalink; + } + + // Try direct match first + let result = checkPermalink(pathname); + if (result) return result; + + // Try with trailing slash + result = checkPermalink(pathname.endsWith('/') ? pathname : pathname + '/'); + if (result) return result; + + // Try without trailing slash (if applicable) + if (pathname.endsWith('/') && pathname !== '/') { + result = checkPermalink(pathname.slice(0, -1)); + if (result) return result; + } + + // Try without .md/.mdx extension + const withoutExtension = pathname.replace(/\.(md|mdx)$/i, ''); + if (withoutExtension !== pathname) { + const stripped = + checkPermalink(withoutExtension) ?? 
+ checkPermalink(withoutExtension + '/'); + if (stripped) return stripped; + + // Handle /foo/index.md -> /foo + if (withoutExtension.endsWith('/index')) { + const parent = withoutExtension.slice(0, -'/index'.length) || '/'; + const indexed = checkPermalink(parent) ?? checkPermalink(parent + '/'); + if (indexed) return indexed; + } + + // Handle /foo/README.md -> /foo (common Docusaurus pattern) + if (/\/readme$/i.test(withoutExtension)) { + const parent = withoutExtension.slice(0, -'/README'.length) || '/'; + const readmed = checkPermalink(parent) ?? checkPermalink(parent + '/'); + if (readmed) return readmed; + } + } + + return null; +} + +// Link Rewriting +interface LinkRewriteOptions { + docsByPermalink: ReadonlyMap; + currentPermalink?: string; + siteBase?: string; + linkTarget?: LinkTarget; +} + +function rewriteLinks(tree: MdastNode, options: LinkRewriteOptions): void { + const { + docsByPermalink, + currentPermalink, + siteBase, + linkTarget = 'html', + } = options; + + function ensureLeadingSlash(path?: string): string { + if (!path) return ''; + return path.startsWith('/') ? path : `/${path}`; + } + + function applyLinkTarget(url: string): string { + return linkTarget === 'md' ? toMarkdownUrl(url, siteBase) : url; + } + + function rewriteUrl(url: string): string | null { + if (!url || url.startsWith('#') || isExternalUrl(url)) return null; + + // Parse URL components + const hashIndex = url.indexOf('#'); + const beforeHash = hashIndex === -1 ? url : url.slice(0, hashIndex); + const hash = hashIndex === -1 ? '' : url.slice(hashIndex); + + const queryIndex = beforeHash.indexOf('?'); + const path = + queryIndex === -1 ? beforeHash : beforeHash.slice(0, queryIndex); + const query = queryIndex === -1 ? 
'' : beforeHash.slice(queryIndex); + + // Handle absolute paths + if (path.startsWith('/')) { + const canonical = findCanonicalPermalink(docsByPermalink, path); + if (canonical) { + return applyLinkTarget( + urlForPermalink(canonical, siteBase) + query + hash, + ); + } + const absoluteUrl = siteBase + ? joinUrl(siteBase, path) + query + hash + : path + query + hash; + return applyLinkTarget(absoluteUrl); + } + + // Handle relative paths + const basePath = ensureLeadingSlash(currentPermalink); + if (!basePath) return null; + + try { + const resolved = new URL( + url, + 'https://_/' + basePath.replace(/^\/+/, ''), + ); + const canonical = findCanonicalPermalink( + docsByPermalink, + resolved.pathname, + ); + if (canonical) { + return applyLinkTarget( + urlForPermalink(canonical, siteBase) + + resolved.search + + resolved.hash, + ); + } + const relativeUrl = siteBase + ? joinUrl(siteBase, resolved.pathname) + resolved.search + resolved.hash + : resolved.pathname + resolved.search + resolved.hash; + return applyLinkTarget(relativeUrl); + } catch { + return null; + } + } + + /** + * Resolve a link text that looks like a path to its doc title. + */ + function resolveLinkTitle(text: string): string | null { + if (!text) return null; + + const hashIndex = text.indexOf('#'); + const beforeHash = hashIndex === -1 ? text : text.slice(0, hashIndex); + const queryIndex = beforeHash.indexOf('?'); + const path = ( + queryIndex === -1 ? 
beforeHash : beforeHash.slice(0, queryIndex) + ).trim(); + + if (!path) return null; + + let resolvedPath = path; + const isRelativePath = + !path.startsWith('/') && + currentPermalink && + (path.startsWith('./') || path.startsWith('../')); + + if (isRelativePath) { + try { + resolvedPath = new URL( + path, + 'https://_/' + + ensureLeadingSlash(currentPermalink).replace(/^\/+/, ''), + ).pathname; + } catch { + return null; + } + } else if (!path.startsWith('/')) { + return null; + } + + const canonical = findCanonicalPermalink(docsByPermalink, resolvedPath); + if (!canonical) return null; + return docsByPermalink.get(canonical)?.title ?? null; + } + + visit(tree, ['link', 'definition'], (node) => { + const mdastNode = node as MdastNode; + const url = typeof mdastNode.url === 'string' ? mdastNode.url : ''; + + if (url) { + const rewritten = rewriteUrl(url); + if (rewritten) mdastNode.url = rewritten; + } + + // Try to resolve link text that looks like a path + const children = Array.isArray(mdastNode.children) + ? mdastNode.children + : []; + if (children.length === 1 && children[0]?.type === 'text') { + const linkText = String(children[0].value ?? 
'').trim(); + if (linkText && !linkText.includes(' ')) { + const title = resolveLinkTitle(linkText); + if (title) children[0].value = title; + } + } + }); +} + +// Markdown Rendering +interface RenderOptions { + docsByPermalink: ReadonlyMap; + currentPermalink?: string; + siteBase?: string; + titleToStrip?: string; + baseHeadingLevel?: number; + linkTarget?: LinkTarget; +} + +export function renderMarkdownFromCapturedTree( + tree: MdastNode, + options: RenderOptions, +): string { + const cloned: MdastNode = structuredClone(tree); + + // Strip leading title if it matches titleToStrip + if ( + options.titleToStrip && + Array.isArray(cloned.children) && + cloned.children[0]?.type === 'heading' + ) { + const headingText = stripHeadingId(extractNodeText(cloned.children[0])); + if (normalizeTitle(headingText) === normalizeTitle(options.titleToStrip)) { + cloned.children.shift(); + } + } + + // Adjust heading levels + if (typeof options.baseHeadingLevel === 'number') { + let minDepth = Infinity; + + visit(cloned, ['heading'], (node) => { + const depth = Number((node as MdastNode).depth); + if (Number.isFinite(depth)) { + minDepth = Math.min(minDepth, depth); + } + }); + + if (Number.isFinite(minDepth)) { + const offset = Math.min(6, options.baseHeadingLevel + 1) - minDepth; + if (offset) { + visit(cloned, ['heading'], (node) => { + const mdastNode = node as MdastNode; + mdastNode.depth = Math.max( + 1, + Math.min(6, Number(mdastNode.depth) + offset), + ); + }); + } + } + } + + // Rewrite links + rewriteLinks(cloned, options); + + // Normalize block children (wrap stray inline nodes in paragraphs) + normalizeBlockChildren(cloned); + + try { + return mdastToMarkdown(cloned as any, { + extensions: [gfmToMarkdown()], + }).trim(); + } catch (error) { + const message = error instanceof Error ? 
error.message : 'unknown'; + console.warn(`llms-export: markdown serialize error: ${message}`); + return extractNodeText(cloned).trim(); + } +} + +function normalizeBlockChildren(node: unknown): void { + if (!isMdastNode(node) || !Array.isArray(node.children)) return; + + if ( + node.type === 'root' || + node.type === 'blockquote' || + node.type === 'listItem' + ) { + const newChildren: MdastNode[] = []; + const pendingInline: MdastNode[] = []; + + const flushPending = () => { + if (pendingInline.length) { + newChildren.push({type: 'paragraph', children: [...pendingInline]}); + pendingInline.length = 0; + } + }; + + for (const child of node.children) { + if (isMdastNode(child) && PHRASING_NODE_TYPES.has(child.type ?? '')) { + pendingInline.push(child); + } else { + flushPending(); + if (isMdastNode(child)) newChildren.push(child); + } + } + + flushPending(); + node.children = newChildren; + } + + for (const child of node.children) { + normalizeBlockChildren(child); + } +} + +// MDX/HTML Processing +function getMdxAttribute(node: MdastNode, name: string): string { + const attributes = (node.attributes ?? 
[]) as Array<{ + type?: string; + name?: string; + value?: string | {value?: string} | null; + }>; + + for (const attr of attributes) { + if (attr?.type !== 'mdxJsxAttribute' || attr?.name !== name) continue; + const value = attr.value; + if (typeof value === 'string') return value.trim(); + if (typeof value?.value === 'string') return String(value.value).trim(); + } + + return ''; +} + +// HTML entity decoding +const HTML_ENTITIES: Record = { + amp: '&', + lt: '<', + gt: '>', + quot: '"', + apos: "'", + nbsp: ' ', + copy: '©', + reg: '®', + hellip: '…', + mdash: '—', + ndash: '–', +}; + +function decodeHtmlEntities(text: string): string { + if (!text) return text; + + return text.replace( + /&(#x[0-9a-fA-F]+|#[0-9]+|[a-zA-Z][a-zA-Z0-9]+);/g, + (match, entity: string) => { + // Hex numeric entity: &#x...; + if (entity.startsWith('#x') || entity.startsWith('#X')) { + try { + const codePoint = parseInt(entity.slice(2), 16); + return codePoint === 160 ? ' ' : String.fromCodePoint(codePoint); + } catch { + return match; + } + } + + // Decimal numeric entity: &#...; + if (entity.startsWith('#')) { + try { + const codePoint = parseInt(entity.slice(1), 10); + return codePoint === 160 ? ' ' : String.fromCodePoint(codePoint); + } catch { + return match; + } + } + + // Named entity + return HTML_ENTITIES[entity.toLowerCase()] ?? match; + }, + ); +} + +function stripHtmlTags(html: string): string { + return html + .replace(//gi, '') + .replace(//gi, '') + .replace(/<[^>]+>/g, ''); +} + +function parseHtmlToNodes(html: string): MdastNode[] | null { + if (!html) return null; + + // Convert tags to markdown link syntax + const withMarkdownLinks = html.replace( + /]*href=(["'])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, + (_, _quote, href, content) => { + const cleanHref = String(href ?? '').trim(); + const cleanText = decodeHtmlEntities(stripHtmlTags(content ?? 
'')) + .replace(/\s+/g, ' ') + .trim(); + + if (cleanHref && cleanText) { + return `[${cleanText}](${cleanHref})`; + } + if (cleanHref) { + return cleanHref; + } + return cleanText; + }, + ); + + // Split on
tags + const parts = withMarkdownLinks.split(//gi); + const nodes: MdastNode[] = []; + + for (let i = 0; i < parts.length; i++) { + const text = decodeHtmlEntities(stripHtmlTags(parts[i])) + .replace(/\s+/g, ' ') + .trim(); + + if (text) { + nodes.push({type: 'text', value: text}); + } + if (i < parts.length - 1) { + nodes.push({type: 'break'}); + } + } + + // Remove trailing breaks + while (nodes.length && nodes[nodes.length - 1].type === 'break') { + nodes.pop(); + } + + return nodes.length ? nodes : null; +} + +// Node types to completely drop +const DROP_NODE_TYPES = new Set([ + 'mdxjsEsm', + 'mdxFlowExpression', + 'mdxTextExpression', + 'yaml', + 'toml', + 'jsx', +]); + +// Helper functions for building AST nodes +const createTextNode = (value: string): MdastNode => ({type: 'text', value}); +const createParagraph = (children: MdastNode[]): MdastNode => ({ + type: 'paragraph', + children, +}); +const createStrong = (text: string): MdastNode => ({ + type: 'strong', + children: [createTextNode(text)], +}); +const createLink = (url: string, text: string): MdastNode => ({ + type: 'link', + url, + children: [createTextNode(text)], +}); +const createListItem = (children: MdastNode[]): MdastNode => ({ + type: 'listItem', + children, +}); +const createList = (items: MdastNode[]): MdastNode => ({ + type: 'list', + children: items, +}); + +type MdxHandler = ( + node: MdastNode, + isFlowElement: boolean, +) => MdastNode[] | null; + +const MDX_HANDLERS: Record = { + img: (node) => { + const alt = getMdxAttribute(node, 'alt'); + return alt ? 
[createTextNode(`Image: ${alt}`)] : []; + }, + + DocCard: (node) => { + const itemAttr = getMdxAttribute(node, 'item'); + const match = itemAttr.match(/findSidebarItem\(\s*['"]([^'"]+)['"]\s*\)/); + if (!match) return []; + + const path = match[1]; + return [ + createList([createListItem([createParagraph([createLink(path, path)])])]), + ]; + }, + + TabItem: (node, isFlowElement) => { + if (!isFlowElement) return null; + const label = + getMdxAttribute(node, 'label') || getMdxAttribute(node, 'value'); + return label + ? [createParagraph([createStrong('Tab'), createTextNode(`: ${label}`)])] + : []; + }, +}; + +export function stripNoisyNodes(tree: MdastNode): MdastNode { + processNode(tree); + return tree; +} + +function processNode(node: MdastNode): void { + if (!node || typeof node !== 'object') return; + + // Remove position and data for cleaner output + delete node.position; + delete node.data; + + if (!Array.isArray(node.children)) return; + + const newChildren: MdastNode[] = []; + for (const child of node.children) { + if (child && typeof child === 'object') { + newChildren.push(...transformChild(child)); + } + } + node.children = newChildren; +} + +function transformChild(child: MdastNode): MdastNode[] { + const nodeType = String(child.type ?? ''); + + // Handle standard image nodes + if (nodeType === 'image') { + const alt = typeof child.alt === 'string' ? child.alt.trim() : ''; + return alt ? [createTextNode(`Image: ${alt}`)] : []; + } + + // Handle raw HTML + if (nodeType === 'html') { + return parseHtmlToNodes(child.value ?? '') ?? []; + } + + // Handle MDX JSX elements + if (nodeType === 'mdxJsxFlowElement' || nodeType === 'mdxJsxTextElement') { + const elementName = typeof child.name === 'string' ? child.name.trim() : ''; + const handler = elementName ? 
MDX_HANDLERS[elementName] : undefined; + const isFlowElement = nodeType === 'mdxJsxFlowElement'; + + if (handler) { + const result = handler(child, isFlowElement); + if (result?.length) { + processNode(child); + return [...result, ...(child.children ?? [])]; + } + return result ?? []; + } + + // Unknown MDX element - unwrap children + processNode(child); + return child.children ?? []; + } + + // Handle directives (admonitions, etc.) + if ( + nodeType === 'containerDirective' || + nodeType === 'leafDirective' || + nodeType === 'textDirective' + ) { + const directiveName = + typeof (child as any).name === 'string' ? (child as any).name.trim() : ''; + const title = + (child as any).attributes?.title ?? (child as any).label ?? ''; + + processNode(child); + + const prefix: MdastNode[] = directiveName + ? [ + createParagraph([ + createStrong( + directiveName.charAt(0).toUpperCase() + directiveName.slice(1), + ), + createTextNode(title ? `: ${title}` : ':'), + ]), + ] + : []; + + return [...prefix, ...(child.children ?? 
[])]; + } + + // Drop known noisy node types + if (DROP_NODE_TYPES.has(nodeType) || nodeType.startsWith('mdx')) { + return []; + } + + // Keep other nodes, processing their children + processNode(child); + return [child]; +} diff --git a/src/plugins/llms/types.ts b/src/plugins/llms/types.ts new file mode 100644 index 0000000000..c5597c2eea --- /dev/null +++ b/src/plugins/llms/types.ts @@ -0,0 +1,105 @@ +import type {Node} from 'unist'; + +export interface MdastNode extends Node { + type: string; + children?: MdastNode[]; + value?: string; + url?: string; + alt?: string; + title?: string; + depth?: number; + name?: string; + attributes?: { + type: string; + name: string; + value: string | {value: string} | null; + }[]; + label?: string; + data?: Record; + position?: { + start: {line: number; column: number; offset?: number}; + end: {line: number; column: number; offset?: number}; + }; +} + +export interface SidebarCategoryItem { + type: 'category'; + label?: string; + items?: SidebarItemType[]; + link?: { + type: 'doc' | 'generated-index'; + id?: string; + docId?: string; + slug?: string; + description?: string; + }; + description?: string; +} + +export type SidebarItemType = + | string + | { + type: string; + id?: string; + docId?: string; + label?: string; + href?: string; + items?: SidebarItemType[]; + link?: any; + description?: string; + value?: string; + }; + +export interface DocusaurusContext { + siteDir: string; + siteConfig: { + url?: string; + baseUrl?: string; + title?: string; + tagline?: string; + }; +} + +export interface DocsVersion { + docs?: DocMetadata[]; + loadedContent?: {docs?: DocMetadata[]}; + docsSidebars?: Record; + sidebars?: Record; + loadedVersions?: DocsVersion[]; +} + +export interface DocMetadata { + id?: string; + unversionedId?: string; + title?: string; + description?: string; + permalink?: string; + source?: string; + unlisted?: boolean; + draft?: boolean; + frontMatter?: {description?: string}; + metadata?: DocMetadata; +} + 
+export const PHRASING_NODE_TYPES = new Set([ + 'text', + 'inlineCode', + 'emphasis', + 'strong', + 'delete', + 'break', + 'image', + 'imageReference', + 'link', + 'linkReference', + 'footnoteReference', +]); + +export function isMdastNode(value: unknown): value is MdastNode { + return ( + typeof value === 'object' && + value !== null && + 'type' in value && + typeof (value as MdastNode).type === 'string' + ); +} diff --git a/src/remark/llms-capture.ts b/src/remark/llms-capture.ts new file mode 100644 index 0000000000..352b4eb9c3 --- /dev/null +++ b/src/remark/llms-capture.ts @@ -0,0 +1,81 @@ +import * as fs from 'fs'; +import * as path from 'path'; + +import type {Transformer} from 'unified'; + +import { + captureDirForSite, + capturedTreePathForSource, + wrapCapturedTree, +} from '../plugins/llms/capture'; + +interface Options { + siteDir?: string; +} + +let hasLoggedCaptureDir = false; + +function cleanupTempFile(tempFile: string): void { + try { + fs.rmSync(tempFile, {force: true}); + } catch { + // Ignore cleanup failures + } +} + +function atomicWriteJson(filename: string, payload: unknown): void { + const dir = path.dirname(filename); + const base = path.basename(filename); + const tempFile = path.join(dir, `.${base}.${process.pid}.${Date.now()}.tmp`); + + fs.writeFileSync(tempFile, JSON.stringify(payload), 'utf8'); + + try { + fs.renameSync(tempFile, filename); + } catch (error) { + // Retry with force removal on Windows or other systems with file locking + try { + fs.rmSync(filename, {force: true}); + fs.renameSync(tempFile, filename); + } catch { + cleanupTempFile(tempFile); + throw error; + } + } +} + +export default function remarkLlmsCapture(options: Options = {}): Transformer { + const siteDir = + typeof options.siteDir === 'string' && options.siteDir.trim() + ? options.siteDir + : process.cwd(); + + return (tree, vfile) => { + const sourcePath = vfile.history?.[0] ?? 
vfile.path; + if (!sourcePath) return; + + try { + const resolvedSource = path.isAbsolute(sourcePath) + ? sourcePath + : path.join(siteDir, sourcePath); + + const captureDir = captureDirForSite(siteDir); + fs.mkdirSync(captureDir, {recursive: true}); + + if (!hasLoggedCaptureDir) { + console.info(`llms-export: capturing mdx trees to ${captureDir}`); + hasLoggedCaptureDir = true; + } + + const outputPath = capturedTreePathForSource(resolvedSource, siteDir); + atomicWriteJson(outputPath, wrapCapturedTree(tree)); + } catch (error) { + const message = error instanceof Error ? error.message : 'unknown error'; + const location = sourcePath ? ` (${sourcePath})` : ''; + // Log warning instead of throwing to allow build to continue + console.warn( + `llms-export: warning: failed to capture mdx tree${location}: ${message}`, + ); + } + }; +}