From 3c00f5d1519e060f8986993cec75fbb9b199c351 Mon Sep 17 00:00:00 2001 From: Evan Jacobs Date: Tue, 18 Nov 2025 18:15:48 -0500 Subject: [PATCH] feat: add llms.txt endpoint for LLM-optimized documentation Add /llms.txt endpoint that serves a concatenated, text-only version of all Tailwind CSS documentation pages optimized for Large Language Model consumption. - Extract text from MDX files, removing JSX components and preserving code blocks - Remove standalone HTML blocks (not in code blocks) - Extract meaningful content from custom components (ApiTable, ResponsiveDesign, etc.) - Statically generate the output at build time - Include all 185 documentation files in proper order with sections --- src/app/api/llms-txt/extract-text.test.ts | 684 ++++++++++++++++++++++ src/app/api/llms-txt/extract-text.ts | 331 +++++++++++ src/app/llms.txt/route.ts | 132 +++++ 3 files changed, 1147 insertions(+) create mode 100644 src/app/api/llms-txt/extract-text.test.ts create mode 100644 src/app/api/llms-txt/extract-text.ts create mode 100644 src/app/llms.txt/route.ts diff --git a/src/app/api/llms-txt/extract-text.test.ts b/src/app/api/llms-txt/extract-text.test.ts new file mode 100644 index 000000000..79d66950e --- /dev/null +++ b/src/app/api/llms-txt/extract-text.test.ts @@ -0,0 +1,684 @@ +// Note: can run these tests with `bun test` or `node --test` + +import { describe, test } from "node:test"; +import assert from "node:assert"; +import { extractTextFromMDX } from "./extract-text"; +import dedent from "dedent"; + +describe("extractTextFromMDX", () => { + describe("extracting title and description", () => { + test("extracts title and description from exports", (t) => { + let input = dedent` + export const title = "Test Title"; + export const description = "Test description"; + + ## Content + Some content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Test Title")); + assert.ok(result.includes("Test description")); + assert.ok(result.includes("## Content")); + assert.ok(result.includes("Some content here")); + }); + + test("handles missing title and description", (t) => { + let input = dedent` + ## Content + Some content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("## Content")); + assert.ok(result.includes("Some content here")); + }); + }); + + describe("extracting markdown headings", () => { + test("preserves heading hierarchy", (t) => { + let input = dedent` + ## Heading 2 + Content under h2 + + ### Heading 3 + Content under h3 + + #### Heading 4 + Content under h4 + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("## Heading 2")); + assert.ok(result.includes("### Heading 3")); + assert.ok(result.includes("#### Heading 4")); + }); + + test("preserves heading text with special characters", (t) => { + let input = dedent` + ## Heading with \`code\` and **bold** + Content here + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("## Heading")); + }); + }); + + describe("extracting paragraph text", () => { + test("extracts paragraph text", (t) => { + let input = dedent` + ## Title + + This is a paragraph with some text. + + This is another paragraph. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("This is a paragraph with some text")); + assert.ok(result.includes("This is another paragraph")); + }); + + test("preserves line breaks in paragraphs", (t) => { + let input = dedent` + This is a paragraph + that spans multiple lines. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("This is a paragraph")); + assert.ok(result.includes("that spans multiple lines")); + }); + }); + + describe("extracting code blocks", () => { + test("preserves code block content with language hint", (t) => { + let input = dedent` + ## Code Example + + \`\`\`html +
Hello
+ \`\`\` + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("```html")); + assert.ok(result.includes('
Hello
')); + }); + + test("removes code example directives from code blocks", (t) => { + let input = dedent` + \`\`\`html + +
Hello
+ \`\`\` + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("[!code")); + assert.ok(result.includes('
Hello
')); + }); + + test("preserves multiple code blocks", (t) => { + let input = dedent` + \`\`\`html +
HTML
+ \`\`\` + + \`\`\`css + .test { color: red; } + \`\`\` + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("```html")); + assert.ok(result.includes("
HTML
")); + assert.ok(result.includes("```css")); + assert.ok(result.includes(".test { color: red; }")); + }); + }); + + describe("stripping import statements", () => { + test("removes import statements", (t) => { + let input = dedent` + import { Example } from "@/components/example"; + import dedent from "dedent"; + + ## Content + Some content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("import")); + assert.ok(result.includes("## Content")); + assert.ok(result.includes("Some content here")); + }); + + test("removes various import formats", (t) => { + let input = dedent` + import React from "react"; + import type { Component } from "./types"; + import { Example, Figure } from "@/components"; + + ## Content + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("import")); + assert.ok(result.includes("## Content")); + }); + }); + + describe("stripping JSX/React component syntax", () => { + test("removes JSX component tags", (t) => { + let input = dedent` + ## Title + + +
Content
+
+ + More content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("")); + assert.ok(!result.includes("")); + assert.ok(result.includes("More content here")); + }); + + test("removes component props", (t) => { + let input = dedent` +
+ + Content + +
+ `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("hint=")); + assert.ok(!result.includes("padding=")); + }); + + test("extracts text content from JSX", (t) => { + let input = dedent` + +
This is text content
+

More text here

+
+ `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("This is text content") || result.includes("More text here")); + }); + }); + + describe("handling ApiTable components", () => { + test("extracts table data as text", (t) => { + let input = dedent` + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("inline")); + assert.ok(result.includes("display: inline")); + assert.ok(result.includes("block")); + assert.ok(result.includes("display: block")); + }); + + test("handles ApiTable with complex values", (t) => { + let input = dedent` + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("sr-only")); + assert.ok(result.includes("position: absolute")); + }); + }); + + describe("handling links", () => { + test("converts markdown links to plain text", (t) => { + let input = dedent` + Check out the [display docs](/docs/display) for more info. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("display docs")); + assert.ok(result.includes("/docs/display")); + }); + + test("handles links in headings", (t) => { + let input = dedent` + ## See [this page](/docs/page) + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("this page")); + assert.ok(result.includes("/docs/page")); + }); + }); + + describe("handling code example directives", () => { + test("removes [!code ...] directives from code", (t) => { + let input = dedent` + \`\`\`html + +
Hello
+ + \`\`\` + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("[!code")); + assert.ok(result.includes('
Hello
')); + }); + + test("removes [!code filename:...] directives", (t) => { + let input = dedent` + \`\`\`html + +
Content
+ \`\`\` + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("[!code filename")); + assert.ok(result.includes("
Content
")); + }); + }); + + describe("edge cases", () => { + test("handles empty file", (t) => { + let result = extractTextFromMDX(""); + assert.strictEqual(typeof result, "string"); + }); + + test("handles file with only exports", (t) => { + let input = dedent` + export const title = "Title"; + export const description = "Description"; + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Title")); + assert.ok(result.includes("Description")); + }); + + test("handles special characters", (t) => { + let input = dedent` + ## Title with "quotes" and 'apostrophes' + Content with and & entities. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Title")); + }); + + test("handles nested components", (t) => { + let input = dedent` +
+ +
Nested content
+
+
+ `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("
")); + assert.ok(!result.includes("")); + }); + }); + + describe("removing export statements", () => { + test("removes export const title and description", (t) => { + let input = dedent` + export const title = "Test Title"; + export const description = "Test description"; + + ## Content + Some content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("export const title")); + assert.ok(!result.includes("export const description")); + assert.ok(result.includes("## Content")); + assert.ok(result.includes("Some content here")); + }); + + test("removes export statements with template literals", (t) => { + let input = dedent` + export const title = \`Test Title\`; + export const description = \`Test description\`; + + ## Content + `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("export const title")); + assert.ok(!result.includes("export const description")); + }); + }); + + describe("handling content components", () => { + test("extracts text from ResponsiveDesign component", (t) => { + let input = dedent` + ### Responsive design + + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("### Responsive design")); + assert.ok(result.includes("Prefix a color utility")); + assert.ok(result.includes("breakpoint variant like md:")); + assert.ok(result.includes("md:text-green-600")); + assert.ok(!result.includes(" { + let input = dedent` + ## Customizing your theme + + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("## Customizing your theme")); + assert.ok(result.includes("Use the --spacing-* theme variables")); + assert.ok(!result.includes(" { + let input = dedent` + ### Targeting specific states + + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("### Targeting specific states")); + assert.ok(result.includes("Prefix a background-color utility")); + assert.ok(result.includes("variant like hover:")); + assert.ok(!result.includes(" { + let input = dedent` + + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Use responsive variants")); + assert.ok(!result.includes(" { + test("removes HTML blocks that are not in code blocks", (t) => { + let input = dedent` + ## Example + +
+

Basic Tee

+

$35

+
+ + More content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("## Example")); + assert.ok(result.includes("More content here")); + assert.ok(!result.includes("
")); + assert.ok(!result.includes("

Basic Tee

")); + }); + + test("preserves HTML in code blocks", (t) => { + let input = dedent` + ## Example + + \`\`\`html +
+

Basic Tee

+
+ \`\`\` + + More content. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("```html")); + assert.ok(result.includes("
")); + assert.ok(result.includes("

Basic Tee

")); + assert.ok(result.includes("More content")); + }); + + test("removes nested HTML structures", (t) => { + let input = dedent` +
+
+

Text

+
+
+ + Content after HTML. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Content after HTML")); + assert.ok(!result.includes("
")); + assert.ok(!result.includes("

Text

")); + }); + + test("removes HTML with empty elements", (t) => { + let input = dedent` +
+ +

Content

+
+ `; + + let result = extractTextFromMDX(input); + assert.ok(!result.includes("
")); + assert.ok(!result.includes("")); + assert.ok(!result.includes("

Content

")); + }); + }); + + describe("handling JSX expressions with nested braces", () => { + test("extracts table content from JSX expression", (t) => { + let input = dedent` + ## Breakpoints + + { + + + + + + + + + + + + + +
BreakpointWidth
sm640px
+ } + + More content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Breakpoint")); + assert.ok(result.includes("Width")); + assert.ok(result.includes("sm")); + assert.ok(result.includes("640px")); + assert.ok(result.includes("More content here")); + assert.ok(!result.includes("{")); + assert.ok(!result.includes("}")); + }); + + test("handles nested JSX expressions", (t) => { + let input = dedent` + { +
+

Outer content

+ { + Inner content + } +
+ } + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Outer content") || result.includes("Inner content")); + }); + + test("handles JSX expressions with code blocks inside", (t) => { + let input = dedent` + { +
+ \`\`\`html +
Code example
+ \`\`\` +
+ } + `; + + let result = extractTextFromMDX(input); + // Code blocks should be preserved + assert.ok(result.includes("```html") || result.includes("Code example")); + }); + + test("handles multiple JSX expressions", (t) => { + let input = dedent` + First expression: + { + + +
Row 1
+ } + + Second expression: + { +
Content
+ } + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Row 1") || result.includes("Content")); + }); + }); + + describe("real-world examples", () => { + test("handles responsive-design.mdx with JSX table expression", (t) => { + let input = dedent` + export const title = "Responsive design"; + export const description = "Using responsive utility variants."; + + ## Overview + + There are five breakpoints: + + { + + + + + + + + + + + + + +
Breakpoint prefixMinimum width
sm40rem (640px)
+ } + + ### Customizing your theme + + More content here. + `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("Responsive design")); + assert.ok(result.includes("Using responsive utility variants")); + assert.ok(result.includes("## Overview")); + assert.ok(result.includes("Breakpoint prefix")); + assert.ok(result.includes("Minimum width")); + assert.ok(result.includes("sm")); + assert.ok(result.includes("40rem")); + assert.ok(result.includes("### Customizing your theme")); + assert.ok(result.includes("More content here")); + assert.ok(!result.includes("export const")); + assert.ok(!result.includes("{")); + assert.ok(!result.includes("}")); + }); + + test("handles display.mdx structure", (t) => { + let input = dedent` + import dedent from "dedent"; + import { ApiTable } from "@/components/api-table.tsx"; + import { Example } from "@/components/example.tsx"; + import { Figure } from "@/components/figure.tsx"; + + export const title = "display"; + export const description = "Utilities for controlling the display box type of an element."; + + + + ## Examples + + ### Block and Inline + + Use the \`inline\`, \`inline-block\`, and \`block\` utilities: + +
+ + {
Example content
} +
+ + \`\`\`html + +
Content
+ \`\`\` +
+ `; + + let result = extractTextFromMDX(input); + assert.ok(result.includes("display")); + assert.ok(result.includes("Utilities for controlling the display box type")); + assert.ok(result.includes("## Examples")); + assert.ok(result.includes("### Block and Inline")); + assert.ok(result.includes("Use the")); + assert.ok(result.includes("```html")); + assert.ok(result.includes('
Content
')); + assert.ok(!result.includes("import")); + assert.ok(!result.includes("")); + assert.ok(!result.includes("[!code")); + }); + }); +}); diff --git a/src/app/api/llms-txt/extract-text.ts b/src/app/api/llms-txt/extract-text.ts new file mode 100644 index 000000000..dbec0dfc8 --- /dev/null +++ b/src/app/api/llms-txt/extract-text.ts @@ -0,0 +1,331 @@ +export function extractTextFromMDX(mdxContent: string): string { + if (!mdxContent) return ""; + + let content = mdxContent; + + // Extract title and description from exports + let title = extractExport("title", content); + let description = extractExport("description", content); + + // Remove import statements + content = removeImports(content); + + // Handle code blocks first - preserve them but clean directives + let { content: processedContent, codeBlocks } = preserveCodeBlocks(content); + + // Clean directives from code blocks before restoring + codeBlocks = codeBlocks.map((block) => removeCodeDirectives(block)); + + // Remove JSX component tags and extract their content + processedContent = removeJSXComponents(processedContent); + + // Remove code example directives from non-code content + processedContent = removeCodeDirectives(processedContent); + + // Restore code blocks + processedContent = restoreCodeBlocks(processedContent, codeBlocks); + + // Clean up extra whitespace + processedContent = cleanWhitespace(processedContent); + + // Build result with title and description if available + let result = ""; + if (title) { + result += `# ${title}\n\n`; + } + if (description) { + result += `${description}\n\n`; + } + if (title || description) { + result += "---\n\n"; + } + + result += processedContent; + + return result.trim(); +} + +function extractExport(name: string, content: string): string | null { + // Match: export const title = "value"; or export const title = 'value'; + let match = content.match(new RegExp(`export\\s+const\\s+${name}\\s*=\\s*["']([^"']+)["']`, "s")); + if (match) { + return match[1]; + } + + // Match: export const title = `value`; + match = content.match(new RegExp(`export\\s+const\\s+${name}\\s*=\\s*\`([^\`]+)\``, "s")); + if (match) { + return match[1]; + } + + return null; +} + +function removeImports(content: string): string { + // Remove all import statements (single and multi-line) + content = content.replace(/^import\s+.*?from\s+["'][^"']+["'];?\s*$/gm, ""); + // Remove export statements (but keep the content after them) + // Handle both single and double quotes, and template literals + content = content.replace(/^export\s+const\s+(title|description)\s*=\s*["']([^"']+)["'];?\s*$/gm, ""); + content = content.replace(/^export\s+const\s+(title|description)\s*=\s*`([^`]+)`;?\s*$/gm, ""); + // Also handle exports that might be on the same line as other content + content = content.replace(/export\s+const\s+(title|description)\s*=\s*["']([^"']+)["'];?/g, ""); + content = content.replace(/export\s+const\s+(title|description)\s*=\s*`([^`]+)`;?/g, ""); + return content; +} + +function preserveCodeBlocks(content: string): { content: string; codeBlocks: string[] } { + // Extract code blocks temporarily to protect them from JSX removal + let codeBlocks: string[] = []; + let blockIndex = 0; + + // Match code blocks with optional language + let processedContent = content.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => { + let placeholder = `__CODE_BLOCK_${blockIndex}__`; + codeBlocks[blockIndex] = `\`\`\`${lang || ""}\n${code}\`\`\``; + blockIndex++; + return placeholder; + }); + + return { content: processedContent, codeBlocks }; +} + +function restoreCodeBlocks(content: string, codeBlocks: string[]): string { + for (let i = 0; i < codeBlocks.length; i++) { + content = content.replace(`__CODE_BLOCK_${i}__`, codeBlocks[i]); + } + + return content; +} + +function extractApiTableData(match: string): string { + let rowsMatch = match.match(/rows\s*=\s*\{\s*\[([\s\S]*?)\]\s*\}/); + if (rowsMatch) { + let rowsContent = rowsMatch[1]; + let extracted = rowsContent + .replace(/\[([^\]]+)\]/g, "$1") + .replace(/["']([^"']+)["']/g, "$1") + .replace(/dedent\s*`([^`]+)`/g, "$1") + .replace(/dedent\\`([^`]+)\\`/g, "$1") + .replace(/,/g, " | ") + .replace(/\s+/g, " ") + .trim(); + return extracted ? `\n${extracted}\n` : ""; + } + return ""; +} + +function removeJSXComponents(content: string): string { + // Handle ApiTable components - extract table data + content = content.replace(//g, extractApiTableData); + content = content.replace(//g, extractApiTableData); + + // Handle content components that have meaningful text - extract their props to generate text + // ResponsiveDesign, CustomizingYourTheme, TargetingSpecificStates, etc. + content = content.replace(/]*\/>/g, (match) => { + // Extract property and featuredClass to generate descriptive text + let propertyMatch = match.match(/property=["']([^"']+)["']/); + let featuredClassMatch = match.match(/featuredClass=["']([^"']+)["']/); + let breakpointMatch = match.match(/breakpoint=["']([^"']+)["']/); + let breakpoint = breakpointMatch ? breakpointMatch[1] : "md"; + let property = propertyMatch ? propertyMatch[1] : "utility"; + let featuredClass = featuredClassMatch ? featuredClassMatch[1] : ""; + + if (property && featuredClass) { + return `\n\nPrefix a ${property} utility with a breakpoint variant like ${breakpoint}: to only apply the utility at ${breakpoint} screen sizes and above. Use ${breakpoint}:${featuredClass} to apply ${featuredClass} at the ${breakpoint} breakpoint and above.\n\n`; + } + return "\n\nUse responsive variants to apply utilities at specific breakpoints.\n\n"; + }); + + content = content.replace(/]*\/>/g, (match) => { + let utilityMatch = match.match(/utility=["']([^"']+)["']/); + let utility = utilityMatch ? utilityMatch[1] : "utility"; + return `\n\nUse the --${utility}-* theme variables to customize the ${utility} utilities in your project.\n\n`; + }); + + content = content.replace(/]*\/>/g, (match) => { + let propertyMatch = match.match(/property=["']([^"']+)["']/); + let variantMatch = match.match(/variant=["']([^"']+)["']/); + let property = propertyMatch ? propertyMatch[1] : "utility"; + let variant = variantMatch ? variantMatch[1] : "hover"; + return `\n\nPrefix a ${property} utility with a variant like ${variant}: to only apply the utility in that state.\n\n`; + }); + + // Remove other component tags (Figure, Example, etc.) but extract text content first + // Extract text from component children before removing the component tags + // Match each component type separately to ensure opening/closing tags match + for (let componentName of ["Figure", "Example", "CodeExampleWrapper", "CodeExampleStack"]) { + let regex = new RegExp(`<${componentName}[^>]*>([\\s\\S]*?)<\\/${componentName}>`, "g"); + content = content.replace(regex, (match, innerContent) => { + // Extract text from the inner content, removing HTML tags + let text = innerContent + .replace(/<[^>]+>/g, "") // Remove all HTML tags + .replace(/\s+/g, " ") // Normalize whitespace + .trim(); + return text ? `\n${text}\n` : ""; + }); + } + + // Remove any remaining self-closing component tags + content = content.replace(/<(Figure|Example|CodeExampleWrapper|CodeExampleStack)[^>]*\/>/g, ""); + + // Extract text from JSX expressions like {...
} before removing them + // Handle multi-line JSX expressions with nested content + // We need to match braces properly, handling nested braces + let braceDepth = 0; + let startIndex = -1; + let result = ""; + let i = 0; + + while (i < content.length) { + if (content[i] === "{" && (i === 0 || content[i - 1] !== "\\")) { + if (braceDepth === 0) { + startIndex = i; + } + braceDepth++; + } else if (content[i] === "}" && (i === 0 || content[i - 1] !== "\\")) { + braceDepth--; + if (braceDepth === 0 && startIndex !== -1) { + let innerContent = content.slice(startIndex + 1, i); + // If it contains HTML tags, extract the text content only (not HTML structure) + if (innerContent.includes("<")) { + // Extract text from HTML - remove all tags and get just the text + let text = innerContent + .replace(/<[A-Z][a-zA-Z0-9]*[^>]*>/g, "") // Remove React component tags + .replace(/<\/[A-Z][a-zA-Z0-9]*>/g, "") + .replace(/<[^>]+>/g, "") // Remove all remaining HTML tags + .replace(/\s+/g, " ") // Normalize whitespace + .trim(); + // Only add if there's actual text content + if (text) { + result += text; + } + } + // Skip the braces and continue + startIndex = -1; + i++; + continue; + } + } + + if (braceDepth === 0) { + result += content[i]; + } + i++; + } + + // If we ended with unclosed braces, add the rest + if (braceDepth > 0) { + result += content.slice(startIndex); + } + + content = result; + + // Remove remaining JSX component tags and their props (but keep HTML tags like ,
, etc.) + // Only remove tags that start with uppercase (React components), not lowercase HTML tags + content = content.replace(/<[A-Z][a-zA-Z0-9]*[^>]*\/>/g, ""); + content = content.replace(/<[A-Z][a-zA-Z0-9]*[^>]*>/g, ""); + content = content.replace(/<\/[A-Z][a-zA-Z0-9]*>/g, ""); + + // Remove JSX attributes from remaining tags + content = content.replace(/<([a-z][a-z0-9]*)\s+[^>]*>/g, "<$1>"); + + // Remove standalone HTML blocks (not in code blocks) + // These are leftover HTML structures that aren't useful for LLM consumption + content = removeStandaloneHTML(content); + + return content; +} + +function removeStandaloneHTML(content: string): string { + // First, protect code blocks by replacing them with placeholders + let codeBlockPlaceholders: string[] = []; + let placeholderIndex = 0; + + content = content.replace(/```[\s\S]*?```/g, (match) => { + let placeholder = `__CODE_BLOCK_PLACEHOLDER_${placeholderIndex}__`; + codeBlockPlaceholders[placeholderIndex] = match; + placeholderIndex++; + return placeholder; + }); + + // Remove nested HTML structures that span multiple lines + // These are leftover HTML blocks that aren't in code blocks + let lines = content.split("\n"); + let result: string[] = []; + let inHTMLBlock = false; + let htmlBlockDepth = 0; + + for (let i = 0; i < lines.length; i++) { + let line = lines[i]; + let trimmed = line.trim(); + + let startsWithOpeningTag = /^\s*<[a-z][a-z0-9]*[^>]*>\s*$/.test(trimmed); + let startsWithClosingTag = /^\s*<\/[a-z][a-z0-9]*>\s*$/.test(trimmed); + let isSelfClosingTag = /^\s*<[a-z][a-z0-9]*[^>]*\/>\s*$/.test(trimmed); + let hasOpeningTag = /<[a-z][a-z0-9]*[^>]*>/.test(trimmed); + let hasClosingTag = /<\/[a-z][a-z0-9]*>/.test(trimmed); + let isCompleteHTMLElement = /^\s*<[a-z][a-z0-9]*[^>]*>.*<\/[a-z][a-z0-9]*>\s*$/.test(trimmed) || isSelfClosingTag; + + if (isSelfClosingTag && !inHTMLBlock) { + continue; + } else if (isCompleteHTMLElement && !inHTMLBlock) { + continue; + } else if (startsWithOpeningTag && !inHTMLBlock) { + inHTMLBlock = true; + htmlBlockDepth = 1; + } else if (hasOpeningTag && inHTMLBlock && !hasClosingTag) { + htmlBlockDepth++; + } else if (startsWithClosingTag && inHTMLBlock) { + htmlBlockDepth--; + if (htmlBlockDepth === 0) { + inHTMLBlock = false; + continue; + } + } else if (inHTMLBlock) { + continue; + } else { + result.push(line); + } + } + + content = result.join("\n"); + + // Restore code blocks + for (let i = 0; i < codeBlockPlaceholders.length; i++) { + content = content.replace(`__CODE_BLOCK_PLACEHOLDER_${i}__`, codeBlockPlaceholders[i]); + } + + return content; +} + +function removeCodeDirectives(content: string): string { + // Remove [!code ...] directives + // These can appear as comments in various formats + content = content.replace(//g, ""); + content = content.replace(/\/\*\s*\[!code[^\]]+\]\s*\*\//g, ""); + content = content.replace(/#\s*\[!code[^\]]+\]/g, ""); + content = content.replace(/\/\/\s*\[!code[^\]]+\]/g, ""); + + // Remove prettier-ignore comments + content = content.replace(//g, ""); + content = content.replace(/\/\*\s*prettier-ignore\s*\*\//g, ""); + content = content.replace(/#\s*prettier-ignore/g, ""); + content = content.replace(/\/\/\s*prettier-ignore/g, ""); + + return content; +} + +function cleanWhitespace(content: string): string { + // Remove excessive blank lines (more than 2 consecutive) + content = content.replace(/\n{3,}/g, "\n\n"); + + // Trim each line + content = content + .split("\n") + .map((line) => line.trimEnd()) + .join("\n"); + + // Remove leading/trailing whitespace + return content.trim(); +} diff --git a/src/app/llms.txt/route.ts b/src/app/llms.txt/route.ts new file mode 100644 index 000000000..3dd31136f --- /dev/null +++ b/src/app/llms.txt/route.ts @@ -0,0 +1,132 @@ +import { NextResponse } from "next/server"; +import fs from "node:fs/promises"; +import path from "node:path"; +import { getDocPageSlugs } from "../(docs)/docs/api"; +import { extractTextFromMDX } from "../api/llms-txt/extract-text"; +import index from "../(docs)/docs/index"; + +export const dynamic = "force-static"; +export const revalidate = false; + +export async function GET() { + let output = "# Tailwind CSS Documentation\n\n"; + output += + "This file contains a concatenated, text-only version of all Tailwind CSS documentation pages, optimized for Large Language Model consumption.\n\n"; + output += "---\n\n"; + + let slugs = await getDocPageSlugs(); + + // Build a map of slugs to their section and title from the index + let slugToSection = new Map(); + for (let [section, entries] of Object.entries(index)) { + for (let entry of entries) { + let [title, docPath] = entry; + let slug = docPath.replace("/docs/", ""); + slugToSection.set(slug, { section, title }); + + // Handle nested children + if (entry.length > 2 && Array.isArray(entry[2])) { + for (let [childTitle, childPath] of entry[2]) { + let childSlug = childPath.replace("/docs/", ""); + slugToSection.set(childSlug, { section, title: childTitle }); + } + } + } + } + + // Process each slug in the order defined by the index + let processedSlugs = new Set(); + let currentSection = ""; + + for (let [section, entries] of Object.entries(index)) { + if (section !== currentSection) { + if (currentSection !== "") { + output += "\n"; + } + output += `## ${section}\n\n`; + currentSection = section; + } + + for (let entry of entries) { + let [title, docPath] = entry; + let slug = docPath.replace("/docs/", ""); + + if (processedSlugs.has(slug)) continue; + processedSlugs.add(slug); + + output += await processSlug(slug, title); + + // Handle nested children + if (entry.length > 2 && Array.isArray(entry[2])) { + for (let [childTitle, childPath] of entry[2]) { + let childSlug = childPath.replace("/docs/", ""); + if (processedSlugs.has(childSlug)) continue; + processedSlugs.add(childSlug); + output += await processSlug(childSlug, childTitle); + } + } + } + } + + // Process any remaining slugs that weren't in the index + for (let slug of slugs) { + if (!processedSlugs.has(slug)) { + let sectionInfo = slugToSection.get(slug); + let title = sectionInfo?.title || slug; + output += await processSlug(slug, title); + } + } + + return new NextResponse(output, { + headers: { + "Content-Type": "text/plain; charset=utf-8", + "Cache-Control": process.env.NODE_ENV === "development" ? "no-cache" : "public, max-age=3600", + }, + }); +} + +async function processSlug(slug: string, title: string): Promise { + try { + let filePath = path.join(process.cwd(), "./src/docs", `${slug}.mdx`); + let content = await fs.readFile(filePath, "utf8"); + + // Extract title and description from exports + let titleMatch = content.match(/export\s+const\s+title\s*=\s*["']([^"']+)["']/); + let descriptionMatch = content.match(/export\s+const\s+description\s*=\s*["']([^"']+)["']/); + + let pageTitle = titleMatch ? titleMatch[1] : title; + let description = descriptionMatch ? descriptionMatch[1] : ""; + + // Extract text from MDX + let extractedText = extractTextFromMDX(content); + + // Remove the title/description header that extractTextFromMDX adds (we'll format it ourselves) + if (extractedText.startsWith("# ")) { + let lines = extractedText.split("\n"); + // Skip the title line, description line(s), and separator + let startIndex = 0; + for (let i = 0; i < lines.length; i++) { + if (lines[i].startsWith("---")) { + startIndex = i + 1; + break; + } + } + extractedText = lines.slice(startIndex).join("\n").trim(); + } + + // Format the page + let pageOutput = `### ${pageTitle}\n\n`; + if (description) { + pageOutput += `${description}\n\n`; + } + pageOutput += `URL: /docs/${slug}\n\n`; + pageOutput += `${extractedText}\n\n`; + pageOutput += "---\n\n"; + + return pageOutput; + } catch (error) { + // Skip files that can't be read + console.error(`Error processing ${slug}:`, error); + return ""; + } +}