Skip to content
Merged
Show file tree
Hide file tree
Changes from 36 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
1e18609
wip
khromov Dec 9, 2024
d665b86
Update +server.ts
khromov Dec 9, 2024
d6041ce
Update +server.ts
khromov Dec 9, 2024
b74efca
wip
khromov Dec 10, 2024
437464f
wip
khromov Dec 10, 2024
15ed068
Update +server.ts
khromov Dec 10, 2024
98a9f90
Update +server.ts
khromov Dec 10, 2024
16046c0
wip
khromov Dec 10, 2024
c2534f9
cleanup
khromov Dec 10, 2024
0d29584
wip
khromov Dec 10, 2024
6d05a5a
refactor
khromov Dec 10, 2024
464630f
cleanupo
khromov Dec 10, 2024
4238887
Update +server.ts
khromov Dec 10, 2024
d494334
Update content.ts
khromov Dec 10, 2024
4315f7a
wip
khromov Dec 10, 2024
7bb3ebd
Create +server.ts
khromov Dec 10, 2024
8612605
minimize llms.txt
khromov Dec 10, 2024
8b2544c
Filter llms.txt
khromov Dec 10, 2024
02dbf8d
clean up
khromov Dec 10, 2024
3470a0b
Merge branch 'main' into llms-txt
khromov Dec 11, 2024
7d89403
package
khromov Dec 11, 2024
918b627
chore: naming
khromov Dec 11, 2024
b327991
Update +server.ts
khromov Dec 11, 2024
e51cf30
Dynamic path names
khromov Dec 11, 2024
ea0c646
clean up
khromov Dec 11, 2024
08f1aea
fix
khromov Dec 11, 2024
13e1cc5
Merge branch 'main' into llms-txt
dummdidumm Dec 12, 2024
c1e57a2
under_score
dummdidumm Dec 12, 2024
190ff05
code style
dummdidumm Dec 12, 2024
72a1fd1
use real document titles, filter out empty files
dummdidumm Dec 12, 2024
aaee7c6
move llms.txt to llms-small.txt
khromov Dec 12, 2024
a7f0f8d
llms.txt index
khromov Dec 12, 2024
c5fac95
Update +server.ts
khromov Dec 12, 2024
29b4726
Update +server.ts
khromov Dec 12, 2024
9eeaca6
Fix index
khromov Dec 12, 2024
9c8469c
fix
khromov Dec 12, 2024
e9d7e70
revert VERCEL_URL usage
khromov Dec 12, 2024
855a197
Update apps/svelte.dev/src/lib/server/content.ts
khromov Dec 12, 2024
f0c91cc
move llm stuff into its own module
Rich-Harris Dec 13, 2024
086c48f
revert whitespace changes
Rich-Harris Dec 13, 2024
d7f9180
snake_case
Rich-Harris Dec 13, 2024
d541799
tweak
Rich-Harris Dec 13, 2024
8137d82
snake_case etc
Rich-Harris Dec 13, 2024
edc5d43
make ignores work
Rich-Harris Dec 13, 2024
bdf0381
simplify
Rich-Harris Dec 13, 2024
e7bd8d9
unused
Rich-Harris Dec 13, 2024
f5df782
reduce indirection
Rich-Harris Dec 13, 2024
aa3a4e6
more
Rich-Harris Dec 13, 2024
84ed768
move template into separate .md file
Rich-Harris Dec 13, 2024
b2233b8
add a section to /docs
Rich-Harris Dec 13, 2024
c717f6c
Merge branch 'main' into llms-txt
Rich-Harris Dec 13, 2024
a521cc9
advent of svelte
Rich-Harris Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions apps/svelte.dev/.env
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ SUPABASE_KEY=

GITHUB_CLIENT_ID=
GITHUB_CLIENT_SECRET=

VERCEL_URL=
1 change: 1 addition & 0 deletions apps/svelte.dev/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
"lightningcss": "^1.25.1",
"magic-string": "^0.30.11",
"marked": "^14.1.2",
"minimatch": "^10.0.1",
"prettier": "^3.3.2",
"prettier-plugin-svelte": "^3.2.4",
"satori": "^0.10.13",
Expand Down
197 changes: 196 additions & 1 deletion apps/svelte.dev/src/lib/server/content.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { dev } from '$app/environment';
import { read } from '$app/server';
import type { Document } from '@sveltejs/site-kit';
import { create_index } from '@sveltejs/site-kit/server/content';
import { minimatch } from 'minimatch';

const documents = import.meta.glob<string>('../../../content/**/*.md', {
eager: true,
Expand Down Expand Up @@ -124,5 +126,198 @@ function create_docs() {
}

export const docs = create_docs();

export const examples = index.examples.children;

// De-duplicated list of the second `/`-separated segment of every key in `docs.topics`;
// empty/missing segments are dropped by `filter(Boolean)`.
// NOTE(review): presumably topic keys look like `docs/<package>/...`, making this the list
// of documented packages — confirm against `create_docs`.
export const packages = Array.from(
new Set(
Object.keys(docs.topics)
.map((topic) => topic.split('/')[1])
.filter(Boolean)
)
);

/**
 * Display names used in generated LLM documentation headings and links,
 * keyed by the package identifier that is passed to the title helpers.
 */
export const DOCUMENTATION_NAMES: Record<string, string> = {
svelte: 'Svelte',
kit: 'SvelteKit',
cli: 'Svelte CLI'
};

/**
 * Builds the one-sentence description line for a package's documentation.
 *
 * @param type Package identifier used as a key into `DOCUMENTATION_NAMES`.
 * @returns The sentence naming the package's developer documentation.
 */
export function get_documentation_title(type: string): string {
	const display_name = DOCUMENTATION_NAMES[type];
	return 'This is the developer documentation for ' + display_name + '.';
}

/**
 * Builds the top-level markdown heading that opens a package's section
 * in the combined documentation output.
 *
 * @param type Package identifier used as a key into `DOCUMENTATION_NAMES`.
 * @returns A `# Start of … documentation` heading line.
 */
export function get_documentation_start_title(type: string): string {
	const display_name = DOCUMENTATION_NAMES[type];
	return '# Start of ' + display_name + ' documentation';
}

/** Flags selecting which clean-up passes `minimize_content` applies to a document body. */
interface MinimizeOptions {
// Strip `> [!LEGACY]` blockquote callouts.
removeLegacy: boolean;
// Strip `> [!NOTE]` blockquote callouts.
removeNoteBlocks: boolean;
// Strip `> [!DETAILS]` blockquote callouts.
removeDetailsBlocks: boolean;
// Rewrite markdown links that target `/playground…` to point at `/REMOVED`, keeping the link text.
removePlaygroundLinks: boolean;
// Drop lines consisting solely of `<!-- prettier-ignore -->`.
removePrettierIgnore: boolean;
// Collapse every run of whitespace (newlines included) into a single space.
normalizeWhitespace: boolean;
}

// Baseline used by `minimize_content`: every pass is disabled unless the caller enables it.
const defaultOptions: MinimizeOptions = {
removeLegacy: false,
removeNoteBlocks: false,
removeDetailsBlocks: false,
removePlaygroundLinks: false,
removePrettierIgnore: false,
normalizeWhitespace: false
};

/**
 * Removes every blockquote callout of the given type from a markdown string.
 *
 * A callout starts on a line whose trimmed text begins with `> [!<blockType>]`
 * and continues over all directly following lines that start with `>`.
 * Blank lines immediately after the callout are removed with it so that no
 * doubled gap is left behind; scanning resumes at the next non-blank line, so
 * an unrelated blockquote that follows is preserved.
 *
 * @param content Markdown source, lines separated by `\n`.
 * @param blockType Callout tag without decoration, e.g. `NOTE` or `LEGACY`.
 * @returns The markdown with all matching callouts removed.
 */
function remove_quote_blocks(content: string, blockType: string): string {
	const marker = `> [!${blockType}]`;
	const lines = content.split('\n');
	const kept: string[] = [];

	let i = 0;
	while (i < lines.length) {
		if (!lines[i].trim().startsWith(marker)) {
			kept.push(lines[i]);
			i++;
			continue;
		}

		// Skip the marker line itself, then the remaining lines of the same blockquote.
		// (Reassigning the index inside an Array#reduce callback cannot do this: the
		// callback parameter is a copy, so the iteration would carry on line by line.)
		i++;
		while (i < lines.length && lines[i].trimStart().startsWith('>')) i++;

		// Swallow the blank separator line(s) that followed the removed callout.
		while (i < lines.length && lines[i].trim() === '') i++;
	}

	return kept.join('\n');
}

function minimize_content(content: string, options?: Partial<MinimizeOptions>): string {
// Merge with defaults, but only for properties that are defined
const settings: MinimizeOptions = options ? { ...defaultOptions, ...options } : defaultOptions;

let minimized = content;

if (settings.removeLegacy) {
minimized = remove_quote_blocks(minimized, 'LEGACY');
}

if (settings.removeNoteBlocks) {
minimized = remove_quote_blocks(minimized, 'NOTE');
}

if (settings.removeDetailsBlocks) {
minimized = remove_quote_blocks(minimized, 'DETAILS');
}

if (settings.removePlaygroundLinks) {
// Replace playground URLs with a /REMOVED placeholder but keep the original link text
minimized = minimized.replace(/\[([^\]]+)\]\(\/playground[^)]+\)/g, '[$1](/REMOVED)');
}

if (settings.removePrettierIgnore) {
minimized = minimized
.split('\n')
.filter((line) => line.trim() !== '<!-- prettier-ignore -->')
.join('\n');
}

if (settings.normalizeWhitespace) {
minimized = minimized.replace(/\s+/g, ' ');
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this whitespace normalization messes up the markdown, as all newlines are removed. What was the reason to add this, in which way do llms benefit from this? Or is this just about removing as much extra content as possible? (in which case, I'm not sure if this helps / is doable, because of the markdown rules)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if these llms count tokens and whitespace isn't counted, then i am not sure we need this at all and should rather retain the more readable format for easier human consumption and possible markdown rendering to html?

Copy link
Contributor Author

@khromov khromov Dec 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👋 Spaces are counted as tokens, so the reason to remove whitespace is purely to reduce the token count. That's why we apply the whitespace transformation to /llms.txt but not to /llms-full.txt in this PR.

As for raw numbers, I ran the current llms.txt through OpenAI tokenizer to test the effect:

Without normalizeWhitespace:
Tokens 197,260
Characters 797,617

With normalizeWhitespace:
Tokens 188,595
Characters 781,754

The reduction for token count is ~4.4%.

As for the invalid markdown, since LLMs don't really parse Markdown, it does not need to have a detrimental effect on performance. As for the performance, there is no real way to measure without creating a "SvelteBench" benchmark that compares minified vs unminified and test it for all major LLMs.

From my anecdotal evidence of having used the content.json file from the v4 site for 6+ months for LLM coding (which is half HTML encoded 😅 ) I haven't found a huge performance difference vs the text version of the docs.

What does make a difference is whether you can fit the file into context or not. If you are using a full context approach (as shown in my recent YouTube video) then it is absolutely required for the text file to fit inside the LLM context, otherwise you cannot use it at all. That's why I've made a compromise with a smaller llms.txt (which I think we should slim down further with time, but this sets a starting point) and a larger llms-full.txt for people who have huge context or use other approaches like RAG to search the docs, where the size does not matter as much.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to https://llmstxt.org/ the main llms.txt would provide only a short overview along with links to more details (in our case, we could link to the different packages). It would also remove the need for llms-full.txt. Why do we need both then?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👋 I mentioned this in another comment, I will change it to have an index.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👋 Spaces are counted as tokens, so the reason to remove whitespace is purely to reduce the token count.

If whitespace counts as tokens, it means it makes an actual difference in interpretation. In particular, removing newlines could end up turning a list into a single line that's "parsed" differently.

is there a way to compare both variants? would leaving newlines in only be a reasonable compromise?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

minimized = minimized.trim();

return minimized;
}

/**
 * Decides whether a documentation file belongs in the generated LLM output.
 *
 * @param filename Path of the document being considered.
 * @param ignore Glob patterns; a file matching any of them is excluded.
 * @returns `false` when some pattern matches `filename`, otherwise `true`.
 */
function should_include_file_llm_docs(filename: string, ignore: string[] = []): boolean {
	for (const pattern of ignore) {
		if (!minimatch(filename, pattern)) continue;

		// First matching pattern wins; only report the exclusion in dev mode.
		if (dev) console.log(`❌ Ignored by pattern: ${filename}`);
		return false;
	}

	return true;
}

/** Options accepted by `generate_llm_content`. */
interface GenerateLlmContentOptions {
// Text placed at the very start of the output, followed by a blank line.
prefix?: string;
// Glob patterns matched (via minimatch) against each document path; matches are left out.
ignore?: string[];
// When provided, every document body is passed through `minimize_content` with these flags.
minimize?: Partial<MinimizeOptions>;
// When provided, only documents whose path contains `docs/<package>/` are included.
package?: string;
}

/**
 * Concatenates documentation pages into a single plain-text/markdown string.
 *
 * Pages are visited in the order produced by `sort_documentation_paths`.
 * With `options.package` set, only that package's pages are emitted. Without
 * it, pages of every known package are emitted and a "Start of …" heading is
 * inserted whenever the package changes. Pages whose (optionally minimized)
 * body is blank are skipped.
 *
 * @param options See `GenerateLlmContentOptions`.
 * @returns The assembled document text.
 */
export function generate_llm_content(options: GenerateLlmContentOptions = {}): string {
	const { prefix, ignore = [], minimize: minimizeOptions, package: pkg } = options;

	const chunks: string[] = prefix ? [`${prefix}\n\n`] : [];
	let last_heading = '';

	for (const path of sort_documentation_paths()) {
		if (!should_include_file_llm_docs(path, ignore)) continue;

		if (pkg) {
			// Single-package output: drop anything outside that package.
			if (!path.includes(`docs/${pkg}/`)) continue;
		} else {
			// Combined output: the page must belong to one of the known packages.
			const owner = packages.find((name) => path.includes(`docs/${name}/`));
			if (!owner) continue;

			const heading = get_documentation_start_title(owner);
			if (heading !== last_heading) {
				// Separate consecutive package sections with an extra newline.
				chunks.push(last_heading ? `\n${heading}\n\n` : `${heading}\n\n`);
				last_heading = heading;
			}
		}

		const entry = index[path];
		const body = minimizeOptions ? minimize_content(entry.body, minimizeOptions) : entry.body;
		if (body.trim() === '') continue;

		chunks.push(`\n# ${entry.metadata.title}\n\n`, body, '\n');
	}

	return chunks.join('');
}

/**
 * Ranks a document path by the documentation set it belongs to.
 *
 * @param path Document file path.
 * @returns 0 for Svelte, 1 for SvelteKit, 2 for the CLI, 3 for anything else.
 */
function get_documentation_section_priority(path: string): number {
	// Listed in output order; the first segment found in the path decides the rank.
	const ordered_sections = ['docs/svelte/', 'docs/kit/', 'docs/cli/'];
	const position = ordered_sections.findIndex((segment) => path.includes(segment));
	return position === -1 ? ordered_sections.length : position;
}

/**
 * Returns every key of `index`, ordered for output.
 *
 * Ordering is decided on each entry's `file` value: first by documentation set
 * (see `get_documentation_section_priority`), then by containing directory,
 * and within one directory `index.md` comes first, followed by the remaining
 * files in locale order.
 */
function sort_documentation_paths(): string[] {
	const parent_dir = (file: string) => file.split('/').slice(0, -1).join('/');

	return Object.keys(index).sort((key_a, key_b) => {
		const file_a: string = index[key_a].file;
		const file_b: string = index[key_b].file;

		const rank_delta =
			get_documentation_section_priority(file_a) - get_documentation_section_priority(file_b);
		if (rank_delta !== 0) return rank_delta;

		const dir_a = parent_dir(file_a);
		const dir_b = parent_dir(file_b);
		if (dir_a !== dir_b) return dir_a.localeCompare(dir_b);

		// Same directory: the index page leads, the rest follow alphabetically.
		if (file_a.endsWith('index.md')) return -1;
		if (file_b.endsWith('index.md')) return 1;
		return file_a.localeCompare(file_b);
	});
}
27 changes: 27 additions & 0 deletions apps/svelte.dev/src/routes/docs/[...path]/llms.txt/+server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { error } from '@sveltejs/kit';
import { generate_llm_content, get_documentation_title, packages } from '$lib/server/content';

// SvelteKit page option: marks this endpoint as prerenderable.
export const prerender = true;

/** Lists one `{ path }` parameter object per known documentation package. */
export function entries() {
	return Array.from(packages, (pkg) => ({ path: pkg }));
}

/**
 * Serves the plain-text documentation for a single package.
 *
 * Responds with 404 when `params.path` is not one of the known packages;
 * otherwise returns the package's documentation preceded by a `<SYSTEM>` line.
 */
export function GET({ params }) {
	const { path: requested } = params;

	const is_known = packages.includes(requested);
	if (!is_known) {
		error(404, 'Not Found');
	}

	const banner = '<SYSTEM>' + get_documentation_title(requested) + '</SYSTEM>';
	const body = [banner, generate_llm_content({ package: requested })].join('\n\n');

	const headers = {
		'Content-Type': 'text/plain; charset=utf-8',
		'Cache-Control': 'public, max-age=3600'
	};

	return new Response(body, { status: 200, headers });
}
15 changes: 15 additions & 0 deletions apps/svelte.dev/src/routes/llms-full.txt/+server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { generate_llm_content } from '$lib/server/content';

// SvelteKit page option: marks this endpoint as prerenderable.
export const prerender = true;

/**
 * Serves the complete, unminimized documentation of all packages as plain
 * text, preceded by a `<SYSTEM>` description line.
 */
export function GET() {
	const banner =
		'<SYSTEM>This is the full developer documentation for Svelte and SvelteKit.</SYSTEM>';
	const body = banner + '\n\n' + generate_llm_content();

	const headers = {
		'Content-Type': 'text/plain; charset=utf-8',
		'Cache-Control': 'public, max-age=3600'
	};

	return new Response(body, { status: 200, headers });
}
47 changes: 47 additions & 0 deletions apps/svelte.dev/src/routes/llms-small.txt/+server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { generate_llm_content } from '$lib/server/content';

/**
 * Serves the abridged documentation: selected pages are excluded by glob
 * pattern and every minimization pass is enabled, then the result is returned
 * as plain text preceded by a `<SYSTEM>` description line.
 */
export function GET() {
	// Svelte pages left out of the abridged output.
	const svelte_ignores = [
		'../../../content/docs/svelte/07-misc/04-custom-elements.md',
		'../../../content/docs/svelte/07-misc/06-v4-migration-guide.md',
		'../../../content/docs/svelte/07-misc/07-v5-migration-guide.md',
		'../../../content/docs/svelte/07-misc/99-faq.md',
		'../../../content/docs/svelte/07-misc/xx-reactivity-indepth.md',
		'../../../content/docs/svelte/98-reference/21-svelte-legacy.md',
		'../../../content/docs/svelte/99-legacy/**/*.md',
		'../../../content/docs/svelte/98-reference/30-runtime-errors.md',
		'../../../content/docs/svelte/98-reference/30-runtime-warnings.md',
		'../../../content/docs/svelte/98-reference/30-compiler-errors.md',
		'../../../content/docs/svelte/98-reference/30-compiler-warnings.md',
		'**/xx-*.md'
	];

	// SvelteKit pages left out of the abridged output.
	const kit_ignores = [
		'../../../content/docs/kit/25-build-and-deploy/*adapter-*.md',
		'../../../content/docs/kit/25-build-and-deploy/99-writing-adapters.md',
		'../../../content/docs/kit/30-advanced/70-packaging.md',
		'../../../content/docs/kit/40-best-practices/05-performance.md',
		'../../../content/docs/kit/60-appendix/**/*.md'
	];

	const main_content = generate_llm_content({
		ignore: [...svelte_ignores, ...kit_ignores],
		minimize: {
			removeLegacy: true,
			removeNoteBlocks: true,
			removeDetailsBlocks: true,
			removePlaygroundLinks: true,
			removePrettierIgnore: true,
			normalizeWhitespace: true
		}
	});

	const banner =
		'<SYSTEM>This is the abridged developer documentation for Svelte and SvelteKit.</SYSTEM>';
	const body = banner + '\n\n' + main_content;

	const headers = {
		'Content-Type': 'text/plain; charset=utf-8',
		'Cache-Control': 'public, max-age=3600'
	};

	return new Response(body, { status: 200, headers });
}

// SvelteKit page option: marks this endpoint as prerenderable.
export const prerender = true;
42 changes: 42 additions & 0 deletions apps/svelte.dev/src/routes/llms.txt/+server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import { VERCEL_URL } from '$env/static/private';
import { get_documentation_title, packages, DOCUMENTATION_NAMES } from '$lib/server/content';

// Absolute origin prepended to the links in the index; falls back to an empty string
// (yielding root-relative links) when VERCEL_URL is not set.
const DOMAIN = VERCEL_URL ? `https://${VERCEL_URL}` : '';

// SvelteKit page option: marks this endpoint as prerenderable.
export const prerender = true;

/**
 * Serves the llms.txt index: a short markdown overview linking to the
 * abridged and complete documentation sets and to each package's own
 * documentation file.
 */
export function GET() {
	// One bullet per package, pointing at that package's llms.txt.
	const package_lines: string[] = [];
	for (const pkg of packages) {
		package_lines.push(
			`- [${DOCUMENTATION_NAMES[pkg]} documentation](${DOMAIN}/docs/${pkg}/llms.txt): ${get_documentation_title(pkg)}`
		);
	}
	const package_docs = package_lines.join('\n');

	const content = [
		'# Svelte Documentation for LLMs',
		'',
		'> Svelte is a UI framework that uses a compiler to let you write breathtakingly concise components that do minimal work in the browser, using languages you already know — HTML, CSS and JavaScript.',
		'',
		'## Documentation Sets',
		'',
		`- [Abridged documentation](${DOMAIN}/llms-small.txt): A minimal version of the Svelte and SvelteKit documentation, with examples and non-essential content removed`,
		`- [Complete documentation](${DOMAIN}/llms-full.txt): The complete Svelte and SvelteKit documentation including all examples and additional content`,
		'',
		'## Individual Package Documentation',
		'',
		package_docs,
		'',
		'## Notes',
		'',
		'- The abridged documentation excludes legacy compatibility notes, detailed examples, and supplementary information',
		'- The complete documentation includes all content from the official documentation',
		'- Package-specific documentation files contain only the content relevant to that package',
		'- The content is automatically generated from the same source as the official documentation'
	].join('\n');

	const headers = {
		'Content-Type': 'text/plain; charset=utf-8',
		'Cache-Control': 'public, max-age=3600'
	};

	return new Response(content, { headers });
}
11 changes: 11 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading