Skip to content

Commit 00004bc

Browse files
authored
Implement basic llms.txt endpoint (#56225)
1 parent 0bd4bc7 commit 00004bc

File tree

5 files changed

+251
-0
lines changed

5 files changed

+251
-0
lines changed

src/article-api/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ This subject folder contains the code for the Article API endpoints:
55
- `/api/article/body`
66
- `/api/article/meta`
77

8+
Related: The `/llms.txt` endpoint (middleware in `src/frame/middleware/llms-txt.ts`) provides AI-friendly content discovery using these APIs.
9+
810
## What it does
911

1012
Article API endpoints allow consumers to query GitHub Docs for listings of current articles, and for specific article information.

src/frame/middleware/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import findPage from './find-page.js'
2626
import blockRobots from './block-robots'
2727
import archivedEnterpriseVersionsAssets from '@/archives/middleware/archived-enterprise-versions-assets'
2828
import api from './api'
29+
import llmsTxt from './llms-txt'
2930
import healthcheck from './healthcheck'
3031
import manifestJson from './manifest-json'
3132
import buildInfo from './build-info'
@@ -229,6 +230,7 @@ export default function (app: Express) {
229230

230231
// *** Rendering, 2xx responses ***
231232
app.use('/api', api)
233+
app.use('/llms.txt', llmsTxt)
232234
app.get('/_build', buildInfo)
233235
app.get('/_req-headers', reqHeaders)
234236
app.use(asyncMiddleware(manifestJson))

src/frame/middleware/llms-txt.ts

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import type { Response } from 'express'
2+
import express from 'express'
3+
4+
import type { ExtendedRequest } from '@/types'
5+
import { defaultCacheControl } from '@/frame/middleware/cache-control.js'
6+
import catchMiddlewareError from '@/observability/middleware/catch-middleware-error.js'
7+
import statsd from '@/observability/lib/statsd.js'
8+
import languages from '@/languages/lib/languages.js'
9+
import { allVersions } from '@/versions/lib/all-versions.js'
10+
11+
// Express router that carries the single GET handler for the llms.txt document.
const router = express.Router()
12+
// Root URL of the Page List API; language and version path segments are appended to it when links are built.
const BASE_API_URL = 'https://docs.github.com/api/pagelist'
13+
14+
/**
 * GET handler for the router root.
 *
 * Responds with the llms.txt document (format described at https://llmstxt.org/),
 * which gives LLM-oriented consumers a Markdown index of GitHub Docs content.
 * The response is sent as `text/markdown` with the default cache-control
 * headers, and each request increments the `api.llms-txt.lookup` counter.
 */
router.get(
  '/',
  catchMiddlewareError(async (_req: ExtendedRequest, res: Response) => {
    // Build the document first so the metric and headers are only touched
    // once content generation has succeeded.
    const markdown = generateBasicLlmsTxt()

    statsd.increment('api.llms-txt.lookup', 1)
    defaultCacheControl(res)

    res.type('text/markdown')
    res.send(markdown)
  }),
)
32+
33+
/**
 * Builds the Markdown body of the llms.txt document.
 *
 * The document consists of a fixed header (H1 title, blockquote summary and
 * links to the Page List, Article and Search APIs) followed by two generated
 * bullet lists: one Page List API link per non-English entry in `languages`,
 * and one per entry in `allVersions`.
 *
 * @returns The complete document as a single string ending with a newline.
 */
function generateBasicLlmsTxt(): string {
  // One bullet per translation. English is skipped because it is the default
  // language and is already linked from the "Docs Content" section.
  const translationLinks: string[] = []
  for (const [code, lang] of Object.entries(languages)) {
    if (code === 'en') continue
    // Append the native name in parentheses only when one is defined.
    const label = lang.nativeName ? `${lang.name} (${lang.nativeName})` : lang.name
    translationLinks.push(`- [${label}](${BASE_API_URL}/${code}/free-pro-team@latest)`)
  }

  // One bullet per known docs version, always pointing at the English page list.
  const versionLinks = Object.values(allVersions).map(
    ({ version, versionTitle }) => `- [${versionTitle}](${BASE_API_URL}/en/${version})`,
  )

  const documentLines = [
    '# GitHub Docs',
    '',
    '> Help for wherever you are on your GitHub journey.',
    '',
    '## Docs Content',
    '',
    `- [Page List API](${BASE_API_URL}/en/free-pro-team@latest)`,
    '- [Article API](https://docs.github.com/api/article)',
    '- [Search API](https://docs.github.com/api/search)',
    '',
    '## Translations',
    '',
    translationLinks.join('\n'),
    '',
    '## Versions',
    '',
    versionLinks.join('\n'),
    // Trailing empty entry yields the final newline of the document.
    '',
  ]

  return documentLines.join('\n')
}
71+
72+
export default router

src/frame/tests/llms-txt.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import { describe, expect, test } from 'vitest'
2+
import { get } from '@/tests/helpers/e2etest.js'
3+
4+
describe('llms.txt endpoint', () => {
  // Each test performs its own request so the cases remain independent.
  const fetchLlmsTxt = () => get('/llms.txt')

  test('returns 200 OK', async () => {
    const { statusCode } = await fetchLlmsTxt()
    expect(statusCode).toBe(200)
  })

  test('returns markdown content type', async () => {
    const { headers } = await fetchLlmsTxt()
    expect(headers['content-type']).toMatch(/text\/markdown/)
  })

  test('includes GitHub Docs title', async () => {
    const { body } = await fetchLlmsTxt()

    // The H1 line has to mention GitHub followed by Docs.
    expect(body).toMatch(/^# .*GitHub.*Docs/m)
  })

  test('includes programmatic access section', async () => {
    const { body } = await fetchLlmsTxt()

    // Both the API names and their URL paths must be present.
    const apiPatterns = [
      /Article API/i,
      /Page List API/i,
      /api\/article/i,
      /api\/pagelist\/en\/free-pro-team@latest/i,
    ]
    for (const pattern of apiPatterns) {
      expect(body).toMatch(pattern)
    }
  })

  test('includes all main sections', async () => {
    const { body } = await fetchLlmsTxt()

    // The three H2 headings the document is built around.
    const headingPatterns = [/## Docs Content/i, /## Translations/i, /## Versions/i]
    for (const pattern of headingPatterns) {
      expect(body).toMatch(pattern)
    }
  })

  test('contains valid markdown links', async () => {
    const { body } = await fetchLlmsTxt()

    // Collect every `[text](url)` occurrence in the document.
    const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g
    const found = Array.from(body.matchAll(linkPattern))

    expect(found.length).toBeGreaterThan(0)

    for (const entry of found) {
      const [, text, url] = entry as RegExpMatchArray

      // Neither side of the link may be blank.
      expect(text.trim()).not.toBe('')
      expect(url.trim()).not.toBe('')

      // Only absolute docs.github.com URLs are allowed.
      expect(url).toMatch(/^https:\/\/docs\.github\.com/i)
    }
  })

  test('has proper cache headers', async () => {
    const { headers } = await fetchLlmsTxt()

    // Only the presence of the header is checked, not its value.
    expect(headers).toHaveProperty('cache-control')
  })

  test('references pagelist API for content discovery', async () => {
    const { body } = await fetchLlmsTxt()

    // The Page List API label and its URL must appear on the same line.
    expect(body).toMatch(/Page List API.*api\/pagelist\/en\/free-pro-team@latest/i)
    // Link descriptions were removed from the document and must stay out.
    expect(body).not.toMatch(/Machine-readable list/i)
  })

  test.each(['free-pro-team@latest', 'enterprise-cloud@latest'])(
    'includes %s version in versions section',
    async (versionPattern) => {
      const { body } = await fetchLlmsTxt()

      // The Versions heading must exist...
      expect(body).toMatch(/## Versions/i)

      // ...and the document must link this version's English page list.
      expect(body).toMatch(new RegExp(`api/pagelist/en/${versionPattern}`))
    },
  )

  test('includes enterprise server versions', async () => {
    const { body } = await fetchLlmsTxt()

    // At least one numbered enterprise-server release must be linked.
    expect(body).toMatch(/api\/pagelist\/en\/enterprise-server@\d+\.\d+/)
  })

  test('follows llms.txt specification structure and has reasonable length', async () => {
    const { body } = await fetchLlmsTxt()

    // Structural elements, in order: H1 title, blockquote summary,
    // H2 section, Markdown link.
    const structurePatterns = [/^# .+/m, /^> .+/m, /^## .+/m, /\[.+\]\(.+\)/m]
    for (const pattern of structurePatterns) {
      expect(body).toMatch(pattern)
    }

    // Large enough to hold translations and versions, yet still compact.
    expect(body.length).toBeGreaterThan(500)
    expect(body.length).toBeLessThan(5000)

    const rows = body.split('\n')

    // The first line with any content must be the H1.
    const firstNonBlank = rows.find((line: string) => line.trim() !== '')
    expect(firstNonBlank).toMatch(/^# /)

    // A blockquote line must exist somewhere in the document.
    const blockquoteFound = rows.some((line: string) => line.trim().startsWith('>'))
    expect(blockquoteFound).toBe(true)

    // Docs Content, Translations and Versions give at least three H2 headings.
    const h2Rows = rows.filter((line: string) => line.trim().startsWith('## '))
    expect(h2Rows.length).toBeGreaterThanOrEqual(3)
  })
})
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import { describe, expect, test } from 'vitest'
2+
import { get } from '@/tests/helpers/e2etest.js'
3+
import { languageKeys } from '@/languages/lib/languages.js'
4+
5+
// All language keys other than 'en'; each one gets its own parametrized case below.
const nonEnglishLangs = languageKeys.filter((key) => key !== 'en')

describe('llms.txt translations', () => {
  test('includes translations section with all languages', async () => {
    const { body } = await get('/llms.txt')

    // The heading and at least one two-letter language link must be present.
    expect(body).toMatch(/## Translations/i)
    expect(body).toMatch(/api\/pagelist\/[a-z]{2}\/free-pro-team@latest/i)

    // Capture everything between the Translations heading and the next H2
    // (or the end of the document).
    const sectionMatch = body.match(/## Translations\n([\s\S]*?)(?=\n## |$)/)
    expect(sectionMatch).toBeTruthy()

    // The assertion above has already failed when there is no match, so the
    // optional chaining below only satisfies the type checker.
    const bulletLinks = sectionMatch?.[1].match(/- \[.*?\]\(.*?\)/g)
    expect(bulletLinks).toBeTruthy()
    expect(bulletLinks!.length).toBeGreaterThan(5) // Should have multiple languages
  })

  test.each(nonEnglishLangs)('includes %s language with proper formatting', async (lang) => {
    const { body } = await get('/llms.txt')

    // The language must appear as a Markdown link to its page list URL.
    const linkForLang = new RegExp(`\\[.*?\\]\\(.*?api/pagelist/${lang}/free-pro-team@latest\\)`)
    expect(body).toMatch(linkForLang)
  })
})

0 commit comments

Comments
 (0)