diff --git a/.github/workflows/llmstxt.yml b/.github/workflows/llmstxt.yml index 9e437b782..55fc3091e 100644 --- a/.github/workflows/llmstxt.yml +++ b/.github/workflows/llmstxt.yml @@ -1,9 +1,9 @@ name: Generate LLMs.txt on: - schedule: - - cron: "0 0 * * 0" # Run at 00:00 every Sunday workflow_dispatch: + pull_request: + types: [opened, synchronize, reopened] permissions: contents: write @@ -27,7 +27,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.ref || github.ref }} + token: ${{ secrets.DOCS_PUBLISHABLE_GH_TOKEN }} - name: Install dependencies run: npm install -g pnpm @@ -40,8 +41,26 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - # commit the changes and make a PR (branch protection) - - name: Create Pull Request + - name: Check for changes + id: check-changes + run: | + if [ -n "$(git status --porcelain)" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + - name: Commit changes to PR + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name == 'pull_request' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add public/llms.txt + git commit -m "šŸ¤– Regenerate LLMs.txt" + git push + + - name: Create Pull Request (for scheduled/manual runs) + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name != 'pull_request' id: cpr uses: peter-evans/create-pull-request@v7 with: @@ -55,6 +74,7 @@ jobs: torresmateo - name: Enable Pull Request Automerge + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name != 'pull_request' run: gh pr merge --squash --auto ${{ steps.cpr.outputs.pull-request-number }} env: GH_TOKEN: ${{ secrets.DOCS_PUBLISHABLE_GH_TOKEN }} diff --git a/public/llms.txt b/public/llms.txt index 2c16d680f..ade6477d4 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -1,3 +1,5 
@@ + + # Arcade > Arcade is the only runtime for MCP @@ -10,6 +12,10 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [page](https://docs.arcade.dev/en/home.md): The Arcade Docs page serves as a comprehensive resource for users, providing all the necessary information about Arcade. It aims to help users understand and utilize the features and functionalities of Arcade effectively. +## Agent Frameworks Overview + +- [Arcade with Agent Frameworks and MCP Clients](https://docs.arcade.dev/en/home/agent-frameworks-overview.md): This documentation page provides guidance on integrating Arcade with various agent frameworks and MCP clients, enabling users to enhance their AI applications with advanced tool-calling functionalities. It lists supported frameworks and clients, such as LangChain, OpenAI Agents, and Claude Desktop, + ## Agentic Development - [Agentic Development](https://docs.arcade.dev/en/home/agentic-development.md): The "Agentic Development" documentation page guides users on how to enhance their coding experience by integrating AI IDEs with Arcade.dev's documentation through the LLMs.txt file format and the Context7 MCP server. It provides instructions for setting up these tools @@ -39,7 +45,7 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [Authorized Tool Calling](https://docs.arcade.dev/en/home/auth/auth-tool-calling.md): The "Authorized Tool Calling" documentation page provides a comprehensive guide for developers on how to implement authorization for tools that require user consent, enabling AI agents to securely access external services like Gmail and Zoom. 
It covers the steps for initializing the Arcade client, author - [Checking Tool Authorization Status](https://docs.arcade.dev/en/home/auth/tool-auth-status.md): This documentation page provides a comprehensive guide on how to check the authorization status of tools before execution, helping users understand permission requirements and tool availability. It includes instructions for initializing the client in Python or JavaScript, checking the status for all tools or specific tools - [Direct Third-Party API Call](https://docs.arcade.dev/en/home/auth/call-third-party-apis-directly.md): This documentation page provides a comprehensive guide on how to obtain an authorization token using Arcade to directly call third-party APIs, such as Google's Gmail API. Users will learn to manage user authentication flows, handle authorization requests, and utilize the obtained tokens to interact with -- [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization, enabling AI applications to securely access and act on user-specific data from external services like Gmail and Google Calendar. It details the challenges of authentication and outlines how Arcade's authorization system, which supports OAuth 2 +- [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization for AI applications, enabling them to securely access and act on behalf of users with external services like Gmail and Google Calendar. 
It outlines the challenges of authentication and the OAuth 2.0 framework, detailing how - [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization for AI applications, enabling them to securely access and act on behalf of users with external services like Gmail and Google Calendar. It outlines the challenges of authentication and the OAuth 2.0 framework, detailing how - [Secure and Brand the Auth Flow in Production](https://docs.arcade.dev/en/home/auth/secure-auth-production.md): This documentation page guides users on how to secure and customize authentication flows in production using Arcade.dev. It explains the use of the default Arcade user verifier for development and the implementation of a custom user verifier for production environments, ensuring user safety and a tailored user @@ -80,6 +86,7 @@ Arcade delivers three core capabilities: Deploy agents even your security team w ## Build Tools - [Add user authorization to your MCP tools](https://docs.arcade.dev/en/home/build-tools/create-a-tool-with-auth.md): This documentation page guides users in adding user authorization to their custom MCP tools using Arcade, OAuth, and various auth providers, specifically focusing on integrating with Reddit. It covers the necessary prerequisites, explains how auth providers function, and provides step-by-step instructions for +- [Adding Resource Server Auth to Your MCP Server](https://docs.arcade.dev/en/home/build-tools/secure-your-mcp-server.md): This documentation page guides users on how to secure their HTTP MCP server using OAuth 2.1 Resource Server authentication, enabling tool-level authorization and secrets management. 
It covers the necessary prerequisites, configuration steps, and the benefits of implementing Resource Server auth, ensuring - [Call tools from MCP clients](https://docs.arcade.dev/en/home/build-tools/call-tools-from-mcp-clients.md): This documentation page provides guidance on configuring MCP clients to call tools from an MCP server, detailing the necessary prerequisites and steps involved. Users will learn how to utilize the `arcade configure` command to set up their clients, customize transport types, and manage - [Create an MCP tool with secrets](https://docs.arcade.dev/en/home/build-tools/create-a-tool-with-secrets.md): This documentation page guides users on how to create custom MCP tools that securely handle sensitive information, or "secrets," using the Arcade platform. It covers the process of reading secrets from various sources, such as environment files and the Arcade Dashboard, and emphasizes - [Creating an MCP Server with Arcade](https://docs.arcade.dev/en/home/build-tools/create-a-mcp-server.md): This documentation page provides a comprehensive guide on how to create, test, deploy, and publish a custom MCP Server using the Arcade framework. It outlines the necessary tools, such as the `arcade_mcp_server` and `arcade-mcp` @@ -87,6 +94,7 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [Organize your MCP server and tools](https://docs.arcade.dev/en/home/build-tools/organize-mcp-server-tools.md): This documentation page provides best practices for organizing your MCP server and tools, including how to define and import tools from separate files and other packages. Users will learn to maintain a clean project structure while effectively utilizing decorators and managing tool imports. 
The guide also includes - [Providing useful tool errors](https://docs.arcade.dev/en/home/build-tools/providing-useful-tool-errors.md): This documentation page guides users on how to effectively handle errors when building tools with Arcade MCP, emphasizing the importance of robust error management. It explains the automatic error adaptation process, outlines when to raise specific errors, and provides insights into common error scenarios. Users - [RetryableToolError in Arcade](https://docs.arcade.dev/en/home/build-tools/retry-tools-with-improved-prompt.md): This documentation page explains how to use the `RetryableToolError` in the Arcade Tool SDK to enhance tool call outcomes by providing additional context for input parameters. It outlines when to raise this error and includes an example demonstrating its application in a tool that +- [Server-Level vs Tool-Level Authorization](https://docs.arcade.dev/en/home/build-tools/server-level-vs-tool-level-auth.md): This documentation page explains the differences between server-level authorization (Resource Server auth) and tool-level authorization in Arcade MCP servers, highlighting their roles in securing access to the server and third-party APIs, respectively. It provides guidance on when to implement each type of - [Understanding `Context` and tools](https://docs.arcade.dev/en/home/build-tools/tool-context.md): This documentation page explains the `Context` class used in Arcade's tools, detailing how it provides runtime capabilities and tool-specific data access. 
Users will learn how to utilize the `Context` object to access essential features such as OAuth tokens, secrets, user ## Changelog @@ -157,6 +165,10 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [Use Arcade in Visual Studio Code](https://docs.arcade.dev/en/home/mcp-clients/visual-studio-code.md): This documentation page provides a step-by-step guide for connecting Visual Studio Code to an Arcade MCP Gateway, enabling users to integrate and utilize Arcade tools within their development environment. It outlines the necessary prerequisites, including account creation and API key retrieval, and details the - [Use Arcade with Claude Desktop](https://docs.arcade.dev/en/home/mcp-clients/claude-desktop.md): This documentation page provides a step-by-step guide for connecting Claude Desktop to a local Arcade server, enabling users to access Arcade tools. It outlines the necessary prerequisites, including creating an Arcade account and obtaining an API key, and details the configuration process within Claude +## Mcp Gateway Quickstart + +- [Call a tool in your IDE/MCP Client](https://docs.arcade.dev/en/home/mcp-gateway-quickstart.md): This documentation page guides users on how to create and utilize an MCP Gateway within their IDE or MCP Client to efficiently call tools from multiple MCP servers. It outlines the steps needed to set up the gateway, select relevant tools, and integrate them into coding agents + ## Mcp Gateways - [MCP Gateways](https://docs.arcade.dev/en/home/mcp-gateways.md): This documentation page provides a comprehensive guide on configuring and using MCP Gateways, which facilitate the connection of multiple MCP Servers to enhance tool management and access within projects. 
Users will learn how to create and customize an MCP Gateway, including selecting tools from different servers @@ -176,6 +188,8 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [CustomerioTrackApi](https://docs.arcade.dev/en/mcp-servers/customer-support/customerio-track-api.md): The CustomerioTrackApi documentation provides users with a set of tools to manage customer data and interactions within the Customer.io platform. It includes functionalities for retrieving API credentials, updating customer information, managing devices, and handling customer events, among others. Users can - [FreshserviceApi](https://docs.arcade.dev/en/mcp-servers/customer-support/freshservice-api.md): The FreshserviceApi documentation provides a comprehensive guide for developers to programmatically interact with the Freshservice platform using various API tools. Users can learn to manage organizational data, assets, software, service catalogs, users, tickets, and more, enabling automation of - [IntercomApi](https://docs.arcade.dev/en/mcp-servers/customer-support/intercom-api.md): The IntercomApi documentation provides users with a comprehensive set of tools for managing and interacting with the Intercom platform through OAuth2 authentication. It includes detailed descriptions of various functionalities, such as retrieving admin information, managing articles and collections, and handling company data +- [PagerDuty](https://docs.arcade.dev/en/mcp-servers/customer-support/pagerduty.md): This documentation page provides users with guidance on utilizing the PagerDuty MCP Server, which allows agents to access and manage incidents, on-call information, services, and teams through read-only tools. 
It includes detailed descriptions of available tools, code snippets in Python and +- [Pylon](https://docs.arcade.dev/en/mcp-servers/customer-support/pylon.md): The Pylon documentation provides agents with the necessary tools to manage issues, contacts, users, and teams within the Pylon MCP Server using an admin-generated API token. Users can learn to list, search, assign, and update issues, as well as - [Zendesk](https://docs.arcade.dev/en/mcp-servers/customer-support/zendesk.md): This documentation page provides users with tools and instructions for managing customer support tickets and knowledge base articles within Zendesk. Users can learn how to list, comment on, and mark tickets as solved, as well as search for Help Center articles, streamlining their - [Zendesk Reference](https://docs.arcade.dev/en/mcp-servers/customer-support/zendesk/reference.md): The Zendesk Reference documentation provides a comprehensive list of enumerations related to ticket statuses, sorting orders, and article sorting criteria used in the Zendesk MCP Server. Users can refer to this page to understand the specific values and options available for managing tickets and @@ -195,6 +209,7 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [CursorAgentsApi](https://docs.arcade.dev/en/mcp-servers/development/cursor-agents-api.md): The CursorAgentsApi documentation provides users with tools to manage and inspect background agents, including functionalities for listing, retrieving status, deleting agents, and accessing authentication and model recommendations. It outlines various API endpoints and their usage, enabling users to interact effectively with the - [DatadogApi](https://docs.arcade.dev/en/mcp-servers/development/datadog-api.md): The DatadogApi documentation provides users with tools and guidance for interacting with the Datadog API, enabling them to manage datastores, application keys, and action connections effectively. 
It outlines the authentication process, including required environment variables, and offers - [E2B](https://docs.arcade.dev/en/mcp-servers/development/e2b.md): The E2B documentation page provides users with tools to run code in a sandboxed environment and create static matplotlib charts, facilitating the development of agents and AI applications. It outlines available tools, authentication requirements, and example code snippets for implementation in Python and +- [Figma](https://docs.arcade.dev/en/mcp-servers/development/figma.md): This documentation page provides a comprehensive guide for using the Figma MCP Server, which allows users to interact with Figma's design files, components, and collaboration features through various tools. Users can access file structures, manage components, add comments, and retrieve - [Firecrawl](https://docs.arcade.dev/en/mcp-servers/development/firecrawl.md): The Firecrawl documentation provides users with tools and guidance for building agents that can scrape, crawl, and map websites effectively. It outlines various functionalities, including scraping URLs, crawling websites, retrieving crawl statuses, and managing ongoing crawls, all accessible via an - [GitHub](https://docs.arcade.dev/en/mcp-servers/development/github.md): This documentation page provides an overview of the Arcade GitHub MCP Server, which enables users to build agents and AI applications that interact with GitHub repositories, issues, and pull requests. It outlines the necessary configuration for using GitHub Apps, including permissions required - [GithubApi](https://docs.arcade.dev/en/mcp-servers/development/github-api.md): The GitHubApi documentation page provides tools that enable users to interact with the GitHub API, facilitating the management of repositories, issues, pull requests, and webhooks. 
It outlines various functionalities, including creating and managing webhooks, organizations, and LDAP @@ -249,6 +264,7 @@ Arcade delivers three core capabilities: Deploy agents even your security team w - [Jira Environment Variables](https://docs.arcade.dev/en/mcp-servers/productivity/jira/environment-variables.md): This documentation page provides an overview of key environment variables for configuring Jira API interactions within the Arcade platform. Users can learn how to set limits on concurrent requests, manage API request timeouts, and optimize caching strategies to enhance performance during tool execution. Each variable - [Jira Reference](https://docs.arcade.dev/en/mcp-servers/productivity/jira/reference.md): This documentation page provides a reference for enumerations used in the Jira MCP Server, specifically detailing the various sprint states, priority scheme ordering, and issue comment ordering options available through the Jira API. Users can learn how to filter sprints and order issues effectively - [Linear](https://docs.arcade.dev/en/mcp-servers/productivity/linear.md): This documentation page provides users with a comprehensive guide to the Linear MCP Server, enabling them to interact with Linear's issue tracking, project management, and team collaboration features. Users can learn how to create, manage, and update issues, projects, and initiatives +- [LumaApi](https://docs.arcade.dev/en/mcp-servers/productivity/luma-api.md): The LumaApi documentation provides users with tools and guidance for managing events and calendars within the Luma platform through its API. 
It covers functionalities such as creating and updating events, managing guest information, and handling tickets and coupons, all aimed at enhancing event - [MailchimpMarketingApi](https://docs.arcade.dev/en/mcp-servers/productivity/mailchimp-marketing-api.md): The Mailchimp Marketing API documentation provides tools and resources for users to effectively manage and optimize their email marketing campaigns through direct interaction with the Mailchimp Marketing API. Users can learn how to retrieve account information, manage audience contacts, and create or modify automation workflows - [MiroApi](https://docs.arcade.dev/en/mcp-servers/productivity/miro-api.md): The MiroApi documentation provides users with a comprehensive set of tools for managing Miro boards and organizational settings through the Miro API. It enables users to perform various actions, such as retrieving board information, updating classifications, managing legal holds, and creating - [Notion](https://docs.arcade.dev/en/mcp-servers/productivity/notion.md): This documentation page provides users with a comprehensive guide to the Arcade Notion MCP Server, which enables the creation of agents and AI applications that interact with Notion. 
import { execSync } from "node:child_process";
import fs from "node:fs/promises";

// NOTE(review): the glyph prefixes in the log strings below look mis-encoded
// (UTF-8 bytes decoded with a legacy code page). They are reproduced unchanged
// here; confirm against the original file and re-save it as UTF-8.

/** Metadata recorded in the comment on the first line of llms.txt. */
type LlmsTxtMetadata = {
  gitSha: string;
  generationDate: string;
};

/** Title and description stored for one page entry in llms.txt. */
type PageSummary = { title: string; description: string };

/** A discovered page together with its title and description. */
type SummarizedPage = PageMetadata & PageSummary;

// NOTE(review): the comment syntax inside this pattern was lost in the text
// under review (only `/^/` remained, which has no capture groups). The layout
// below is a reconstruction - confirm it against what generateLlmsTxt writes,
// or have generateLlmsTxt emit its first line through formatLlmsTxtMetadata.
const METADATA_REGEX =
  /^<!--\s*git-sha:\s*(\S+)\s+generation-date:\s*(\S+)\s*-->/;

// `- [title](url): description`, one entry per line. `[ \t]*` (rather than
// `\s*`) keeps an entry with an empty description from swallowing the next line.
const LINK_REGEX = /- \[([^\]]+)\]\(([^)]+)\):[ \t]*(.+)$/gm;

const SHA_SHORT_LENGTH = 7;

// Hex object names only (abbreviated SHA-1 up to a full SHA-256). Anything else
// is refused before it can reach a shell command line.
const GIT_SHA_REGEX = /^[0-9a-f]{7,64}$/i;

/**
 * Gets the current git SHA.
 *
 * @returns The full commit hash printed by `git rev-parse HEAD`.
 * @throws Error when the git command fails.
 */
function getCurrentGitSha(): string {
  try {
    return execSync("git rev-parse HEAD", { encoding: "utf-8" }).trim();
  } catch (_error) {
    console.error(
      pc.red("āœ— Could not get git SHA. Make sure you're in a git repository.")
    );
    throw new Error("Failed to get git SHA");
  }
}

/**
 * Renders the metadata comment in the form METADATA_REGEX parses.
 */
function formatLlmsTxtMetadata(metadata: LlmsTxtMetadata): string {
  return `<!-- git-sha: ${metadata.gitSha} generation-date: ${metadata.generationDate} -->`;
}

/**
 * Extracts the metadata comment from the text of an llms.txt file.
 *
 * @returns The parsed metadata, or null when the text does not start with it.
 */
function parseMetadataFromContent(content: string): LlmsTxtMetadata | null {
  const match = METADATA_REGEX.exec(content);
  if (!match) {
    return null;
  }
  const [, gitSha, generationDate] = match;
  if (gitSha === undefined || generationDate === undefined) {
    return null;
  }
  return { gitSha, generationDate };
}

/**
 * Parses metadata from the existing llms.txt file.
 *
 * @returns The stored metadata, or null when the file is missing, unreadable,
 *   or carries no metadata comment.
 */
async function parseLlmsTxtMetadata(): Promise<LlmsTxtMetadata | null> {
  try {
    return parseMetadataFromContent(await fs.readFile(OUTPUT_PATH, "utf-8"));
  } catch (_error) {
    // File doesn't exist or can't be read - treated as "no previous generation".
    return null;
  }
}

/**
 * Lists files that differ between `lastSha` and HEAD.
 *
 * A single `git diff --name-only --no-renames` replaces three separate
 * `--diff-filter` runs; with rename detection off, a renamed file is reported
 * as a deletion plus an addition instead of being skipped by the A/M/D filters.
 *
 * @returns The changed paths as printed by git, or null when the list cannot
 *   be determined (malformed SHA, unknown commit, git failure).
 */
function listChangedFiles(lastSha: string): Set<string> | null {
  // lastSha is read from a committed file, so it is validated before being
  // placed on a shell command line.
  if (!GIT_SHA_REGEX.test(lastSha)) {
    return null;
  }
  try {
    const output = execSync(
      `git diff --name-only --no-renames ${lastSha} HEAD --`,
      { encoding: "utf-8" }
    );
    return new Set(
      output
        .trim()
        .split("\n")
        .filter((line) => line.length > 0)
    );
  } catch (_error) {
    return null;
  }
}

/** Logs the warning shown when the change list cannot be computed. */
function warnChangedFilesUnavailable(lastSha: string): void {
  console.warn(
    pc.yellow(
      `⚠ Could not get changed files since ${lastSha}, processing all files`
    )
  );
}

/**
 * Gets changed files since the last git SHA.
 *
 * @returns The changed paths. On failure a warning is logged and an empty set
 *   is returned, so an empty result alone does not prove nothing changed;
 *   callers that must tell the two cases apart use listChangedFiles.
 */
function getChangedFilesSince(lastSha: string): Set<string> {
  const changed = listChangedFiles(lastSha);
  if (changed === null) {
    warnChangedFilesUnavailable(lastSha);
    return new Set<string>();
  }
  return changed;
}

/**
 * Collects `- [title](url): description` entries from llms.txt text.
 *
 * @returns A map from entry URL to its title and trimmed description. When a
 *   URL occurs more than once, the last entry wins.
 */
function parseSummariesFromContent(content: string): Map<string, PageSummary> {
  const summaries = new Map<string, PageSummary>();
  // matchAll works on a copy of the regex, so the shared /g pattern keeps no
  // lastIndex state between calls.
  for (const match of content.matchAll(LINK_REGEX)) {
    const [, title, url, description] = match;
    if (title === undefined || url === undefined || description === undefined) {
      continue;
    }
    summaries.set(url, { title, description: description.trim() });
  }
  return summaries;
}

/**
 * Extracts existing page summaries from llms.txt.
 *
 * @returns Summaries keyed by URL; empty when the file is missing or unreadable.
 */
async function extractExistingSummaries(): Promise<Map<string, PageSummary>> {
  try {
    return parseSummariesFromContent(await fs.readFile(OUTPUT_PATH, "utf-8"));
  } catch (_error) {
    // File doesn't exist or can't be read - that's okay
    return new Map<string, PageSummary>();
  }
}

/**
 * Determines which pages need summarization based on changes.
 *
 * @param pages - Pages currently present in the documentation.
 * @param previousMetadata - Metadata of the last generation, or null if none.
 * @param existingSummaries - Summaries from the previous llms.txt, keyed by URL.
 * @returns Pages to (re)summarize, pages whose stored summary is reused, and
 *   whether the output differs from the previous generation.
 */
function determinePagesToSummarize(
  pages: PageMetadata[],
  previousMetadata: LlmsTxtMetadata | null,
  existingSummaries: Map<string, PageSummary>
): {
  pagesToSummarize: PageMetadata[];
  pagesToKeep: SummarizedPage[];
  hasChanges: boolean;
} {
  if (!previousMetadata || previousMetadata.gitSha === "unknown") {
    // No previous generation or can't determine, summarize all pages
    console.log(
      pc.yellow("⚠ No previous generation found, summarizing all pages")
    );
    return { pagesToSummarize: [...pages], pagesToKeep: [], hasChanges: true };
  }

  const changedFiles = listChangedFiles(previousMetadata.gitSha);
  if (changedFiles === null) {
    // The change list is unknown (for example a shallow checkout that lacks the
    // previous commit). Reusing stored summaries here would keep stale text, so
    // every page is summarized again, as the warning states.
    warnChangedFilesUnavailable(previousMetadata.gitSha);
    return { pagesToSummarize: [...pages], pagesToKeep: [], hasChanges: true };
  }

  console.log(
    pc.blue(
      `\nšŸ“Š Found ${changedFiles.size} changed files since last generation`
    )
  );

  // Entries present in the previous llms.txt whose page no longer exists.
  const currentPageUrls = new Set(pages.map((page) => page.url));
  const deletedPageUrls = Array.from(existingSummaries.keys()).filter(
    (url) => !currentPageUrls.has(url)
  );
  let hasChanges = deletedPageUrls.length > 0;
  if (hasChanges) {
    console.log(
      pc.yellow(
        `\nšŸ—‘ļø Found ${deletedPageUrls.length} deleted pages (will be removed from output)`
      )
    );
  }

  const pagesToSummarize: PageMetadata[] = [];
  const pagesToKeep: SummarizedPage[] = [];
  for (const page of pages) {
    const existingSummary = existingSummaries.get(page.url);
    // NOTE(review): assumes page.path uses the same repo-root-relative form git
    // prints, and page.url the same form as the links in llms.txt - confirm
    // against discoverPages and generateLlmsTxt.
    if (existingSummary && !changedFiles.has(page.path)) {
      pagesToKeep.push({
        ...page,
        title: existingSummary.title,
        description: existingSummary.description,
      });
    } else {
      pagesToSummarize.push(page);
      hasChanges = true;
    }
  }

  const deletedNote =
    deletedPageUrls.length > 0
      ? `, ${deletedPageUrls.length} pages deleted`
      : "";
  console.log(
    pc.green(
      `āœ“ ${pagesToKeep.length} pages unchanged, ${pagesToSummarize.length} pages to summarize${deletedNote}`
    )
  );

  return { pagesToSummarize, pagesToKeep, hasChanges };
}

/**
 * Summarizes pages in batches.
 *
 * @param pagesToSummarize - Pages passed to summarizePage.
 * @param pagesToKeep - Pages whose existing summary is reused unchanged.
 * @returns The kept pages followed by the newly summarized ones, in input order.
 */
async function summarizePagesInBatches(
  pagesToSummarize: PageMetadata[],
  pagesToKeep: SummarizedPage[]
): Promise<SummarizedPage[]> {
  const summarizedPages: SummarizedPage[] = [...pagesToKeep];
  const total = pagesToSummarize.length;
  if (total === 0) {
    return summarizedPages;
  }

  console.log(pc.blue("\nšŸ“ Summarizing pages with OpenAI..."));
  // Process in batches to avoid rate limits
  const batchSize = 5;
  for (let start = 0; start < total; start += batchSize) {
    const batch = pagesToSummarize.slice(start, start + batchSize);
    // The arrow wrapper keeps map's index/array arguments away from summarizePage.
    const summarizedBatch = await Promise.all(
      batch.map(async (page) => ({ ...page, ...(await summarizePage(page)) }))
    );
    summarizedPages.push(...summarizedBatch);

    console.log(
      pc.gray(` Processed ${Math.min(start + batchSize, total)}/${total} pages`)
    );

    // Add a small delay between batches
    if (start + batchSize < total) {
      await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS));
    }
  }

  console.log(pc.green(`āœ“ Summarized ${total} pages`));
  return summarizedPages;
}
Step 2: Determine which pages need summarization and identify deleted pages + const { pagesToSummarize, pagesToKeep, hasChanges } = + determinePagesToSummarize(pages, previousMetadata, existingSummaries); + + // Step 3: Summarize changed/new pages using OpenAI + const summarizedPages = await summarizePagesInBatches( + pagesToSummarize, + pagesToKeep + ); - // Step 3: Organize into sections + // Step 4: Organize into sections console.log(pc.blue("\nšŸ“‚ Organizing sections...")); const sections = organizeSections(summarizedPages); console.log(pc.green(`āœ“ Created ${sections.length} sections`)); - // Step 4: Generate llms.txt content + // Step 5: Generate llms.txt content console.log(pc.blue("\nāœļø Generating llms.txt content...")); - const content = generateLlmsTxt(sections); - - // Step 5: Write to file + // Only update metadata if there are changes, otherwise keep previous metadata + const metadata: LlmsTxtMetadata = hasChanges + ? { + gitSha: currentSha, + generationDate: new Date().toISOString(), + } + : previousMetadata || { + gitSha: currentSha, + generationDate: new Date().toISOString(), + }; + const content = generateLlmsTxt(sections, metadata); + + // Step 6: Write to file await fs.writeFile(OUTPUT_PATH, content, "utf-8"); - console.log(pc.green(`āœ“ Generated llms.txt at ${OUTPUT_PATH}`)); + if (hasChanges) { + console.log(pc.green(`āœ“ Generated llms.txt at ${OUTPUT_PATH}`)); + } else { + console.log( + pc.gray( + "āœ“ No changes detected, llms.txt unchanged (SHA and date preserved)" + ) + ); + } console.log(pc.bold(pc.green("\n✨ Done!\n"))); } catch (error) {