Implement basic llms.txt endpoint (#56225)

heiskr · web-flow · commit 00004bce75b4 · 2025-06-23T17:18:20.000Z
diff --git a/src/article-api/README.md b/src/article-api/README.md
@@ -5,6 +5,8 @@ This subject folder contains the code for the Article API endpoints:
 - `/api/article/body`
 - `/api/article/meta`
 
+Related: The `/llms.txt` endpoint (middleware in `src/frame/middleware/llms-txt.ts`) provides AI-friendly content discovery using these APIs.
+
 ## What it does
 
 Article API endpoints allow consumers to query GitHub Docs for listings of current articles, and for specific article information.
diff --git a/src/frame/middleware/index.ts b/src/frame/middleware/index.ts
@@ -26,6 +26,7 @@ import findPage from './find-page.js'
 import blockRobots from './block-robots'
 import archivedEnterpriseVersionsAssets from '@/archives/middleware/archived-enterprise-versions-assets'
 import api from './api'
+import llmsTxt from './llms-txt'
 import healthcheck from './healthcheck'
 import manifestJson from './manifest-json'
 import buildInfo from './build-info'
@@ -229,6 +230,7 @@ export default function (app: Express) {
 
   // *** Rendering, 2xx responses ***
   app.use('/api', api)
+  app.use('/llms.txt', llmsTxt)
   app.get('/_build', buildInfo)
   app.get('/_req-headers', reqHeaders)
   app.use(asyncMiddleware(manifestJson))
diff --git a/src/frame/middleware/llms-txt.ts b/src/frame/middleware/llms-txt.ts
@@ -0,0 +1,72 @@
+import type { Response } from 'express'
+import express from 'express'
+
+import type { ExtendedRequest } from '@/types'
+import { defaultCacheControl } from '@/frame/middleware/cache-control.js'
+import catchMiddlewareError from '@/observability/middleware/catch-middleware-error.js'
+import statsd from '@/observability/lib/statsd.js'
+import languages from '@/languages/lib/languages.js'
+import { allVersions } from '@/versions/lib/all-versions.js'
+
+const router = express.Router() // mounted at '/llms.txt' in src/frame/middleware/index.ts
+const BASE_API_URL = 'https://docs.github.com/api/pagelist' // prefix for every page-list link
+
+/**
+ * GET /llms.txt
+ *
+ * Responds with the llms.txt document (format described at https://llmstxt.org/),
+ * giving LLM tooling a Markdown index for discovering GitHub Docs content.
+ */
+router.get(
+  '/',
+  catchMiddlewareError(async (req: ExtendedRequest, res: Response) => {
+    // Build the Markdown first: if this throws, the metric and headers below are skipped
+    const markdown = generateBasicLlmsTxt()
+
+    statsd.increment('api.llms-txt.lookup', 1)
+    defaultCacheControl(res)
+
+    res.type('text/markdown').send(markdown)
+  }),
+)
+
+/**
+ * Builds the llms.txt Markdown body: a title, a blockquote summary, and three link
+ * sections (docs APIs, per-language page lists, per-version page lists).
+ */
+function generateBasicLlmsTxt(): string {
+  // English is the default language, so only the other languages are listed
+  const translationLinks: string[] = []
+  for (const [langCode, language] of Object.entries(languages)) {
+    if (langCode === 'en') continue
+    const nativeSuffix = language.nativeName ? ` (${language.nativeName})` : ''
+    const label = `${language.name}${nativeSuffix}`
+    translationLinks.push(`- [${label}](${BASE_API_URL}/${langCode}/free-pro-team@latest)`)
+  }
+  const translationsSection = translationLinks.join('\n')
+
+  const versionsSection = Object.values(allVersions)
+    .map(({ version, versionTitle }) => `- [${versionTitle}](${BASE_API_URL}/en/${version})`)
+    .join('\n')
+
+  return `# GitHub Docs
+
+> Help for wherever you are on your GitHub journey.
+
+## Docs Content
+
+- [Page List API](${BASE_API_URL}/en/free-pro-team@latest)
+- [Article API](https://docs.github.com/api/article)
+- [Search API](https://docs.github.com/api/search)
+
+## Translations
+
+${translationsSection}
+
+## Versions
+
+${versionsSection}
+`
+}
+
+export default router
diff --git a/src/frame/tests/llms-txt.ts b/src/frame/tests/llms-txt.ts
@@ -0,0 +1,138 @@
+import { describe, expect, test } from 'vitest'
+import { get } from '@/tests/helpers/e2etest.js'
+
+describe('llms.txt endpoint', () => {
+  test('returns 200 OK', async () => {
+    const res = await get('/llms.txt')
+    expect(res.statusCode).toBe(200)
+  })
+
+  test('returns markdown content type', async () => {
+    const res = await get('/llms.txt')
+    expect(res.headers['content-type']).toMatch(/text\/markdown/)
+  })
+
+  test('includes GitHub Docs title', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Should contain GitHub in the title
+    expect(content).toMatch(/^# .*GitHub.*Docs/m)
+  })
+
+  test('includes Docs Content API links', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // The Docs Content section should link to the existing APIs
+    expect(content).toMatch(/Article API/i)
+    expect(content).toMatch(/Page List API/i)
+    expect(content).toMatch(/api\/article/i)
+    expect(content).toMatch(/api\/pagelist\/en\/free-pro-team@latest/i)
+  })
+
+  test('includes all main sections', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Should have all the main sections we expect
+    expect(content).toMatch(/## Docs Content/i)
+    expect(content).toMatch(/## Translations/i)
+    expect(content).toMatch(/## Versions/i)
+  })
+
+  test('contains valid markdown links', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Extract all markdown links
+    const linkRegex = /\[([^\]]+)\]\(([^)]+)\)/g
+    const links = Array.from(content.matchAll(linkRegex))
+
+    expect(links.length).toBeGreaterThan(0)
+
+    // Check that links are properly formatted
+    for (const match of links) {
+      const [, linkText, linkUrl] = match as RegExpMatchArray
+      expect(linkText.trim()).not.toBe('')
+      expect(linkUrl.trim()).not.toBe('')
+
+      // Trailing slash pins the host, so e.g. docs.github.com.example is rejected
+      expect(linkUrl).toMatch(/^https:\/\/docs\.github\.com\//i)
+    }
+  })
+
+  test('has proper cache headers', async () => {
+    const res = await get('/llms.txt')
+
+    // Should have cache control headers set by defaultCacheControl
+    expect(res.headers).toHaveProperty('cache-control')
+  })
+
+  test('references pagelist API for content discovery', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Should prominently feature the pagelist API as the main content source
+    expect(content).toMatch(/Page List API.*api\/pagelist\/en\/free-pro-team@latest/i)
+    expect(content).not.toMatch(/Machine-readable list/i) // Link descriptions were removed
+  })
+
+  test.each(['free-pro-team@latest', 'enterprise-cloud@latest'])(
+    'includes %s version in versions section',
+    async (versionPattern) => {
+      const res = await get('/llms.txt')
+      const content = res.body
+
+      // Scope the check to the Versions section: free-pro-team@latest is also linked above it
+      const versionsSection = content.split(/^## Versions$/m)[1]
+      expect(versionsSection).toBeTruthy()
+
+      expect(versionsSection).toMatch(new RegExp(`api/pagelist/en/${versionPattern}\\)`))
+    },
+  )
+
+  test('includes enterprise server versions', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Should include enterprise server versions with pattern
+    expect(content).toMatch(/api\/pagelist\/en\/enterprise-server@\d+\.\d+/)
+  })
+
+  test('follows llms.txt specification structure and has reasonable length', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Check for required H1 title
+    expect(content).toMatch(/^# .+/m)
+
+    // Check for blockquote description
+    expect(content).toMatch(/^> .+/m)
+
+    // Check for H2 sections
+    expect(content).toMatch(/^## .+/m)
+
+    // Check for markdown links
+    expect(content).toMatch(/\[.+\]\(.+\)/m)
+
+    // Should include translations and versions but still be reasonable
+    expect(content.length).toBeGreaterThan(500)
+    expect(content.length).toBeLessThan(5000)
+
+    // Split into lines for structure analysis
+    const lines = content.split('\n')
+
+    // First non-empty line should be H1
+    const firstContentLine = lines.find((line: string) => line.trim() !== '')
+    expect(firstContentLine).toMatch(/^# /)
+
+    // Should contain blockquote after title
+    const hasBlockquote = lines.some((line: string) => line.trim().startsWith('>'))
+    expect(hasBlockquote).toBe(true)
+
+    // Should have multiple H2 sections (Docs Content, Translations, Versions)
+    const h2Sections = lines.filter((line: string) => line.trim().startsWith('## '))
+    expect(h2Sections.length).toBeGreaterThanOrEqual(3)
+  })
+})
diff --git a/src/languages/tests/llms-txt-translations.ts b/src/languages/tests/llms-txt-translations.ts
@@ -0,0 +1,37 @@
+import { describe, expect, test } from 'vitest'
+import { get } from '@/tests/helpers/e2etest.js'
+import { languageKeys } from '@/languages/lib/languages.js'
+
+const langs = languageKeys.filter((lang) => lang !== 'en') // every language key except English
+
+describe('llms.txt translations', () => {
+  test('includes translations section with all languages', async () => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Needs a non-English language code: the `en` page list is linked under Docs Content
+    expect(content).toMatch(/## Translations/i)
+    expect(content).toMatch(/api\/pagelist\/(?!en\/)[a-z]{2}\/free-pro-team@latest/i)
+
+    // Extract translation section
+    const translationsMatch = content.match(/## Translations\n([\s\S]*?)(?=\n## |$)/)
+    expect(translationsMatch).toBeTruthy()
+
+    if (translationsMatch) {
+      const translationsSection = translationsMatch[1]
+      const languageLinks = translationsSection.match(/- \[.*?\]\(.*?\)/g) ?? []
+      // Should have multiple languages; no matches at all counts as zero links
+      expect(languageLinks.length).toBeGreaterThan(5)
+    }
+  })
+
+  test.each(langs)('includes %s language with proper formatting', async (lang) => {
+    const res = await get('/llms.txt')
+    const content = res.body
+
+    // Should include this language with proper markdown link format
+    expect(content).toMatch(
+      new RegExp(`\\[.*?\\]\\(.*?api/pagelist/${lang}/free-pro-team@latest\\)`),
+    )
+  })
+})