Skip to content

Commit be24b28

Browse files
authored
feat: Improve llms.txt file indentation (#1918)
Closes #1883. Indents `llms.txt` entries based on their route segments and route depth. Unfortunately, the `depth` parameter of the Docusaurus plugin does not control indentation, so a custom script was needed.
1 parent e78d09c commit be24b28

File tree

3 files changed

+157
-1
lines changed

3 files changed

+157
-1
lines changed

package-lock.json

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
"lint:code": "eslint .",
4242
"lint:code:fix": "eslint . --fix",
4343
"postinstall": "patch-package",
44-
"postbuild": "node ./scripts/joinLlmsFiles.mjs"
44+
"postbuild": "node ./scripts/joinLlmsFiles.mjs && node ./scripts/indentLlmsFile.mjs"
4545
},
4646
"devDependencies": {
4747
"@apify/eslint-config": "^1.0.0",

scripts/indentLlmsFile.mjs

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import fs from 'node:fs/promises';
2+
import path from 'node:path';
3+
4+
// Absolute path of the build output directory, resolved against the current working directory.
const BUILD_DIR = path.resolve('build');
// The llms.txt file inside the build directory; it is rewritten in place by this script.
const LLMS_FILE = path.join(BUILD_DIR, 'llms.txt');

// Number of spaces that make up one indentation level.
const INDENT_LEVEL = 2;

// URL paths that are always treated as top-level (non-indented) section pages.
const MAIN_SECTIONS = ['/api.md', '/api/v2.md'];

// Origin that documentation links are expected to start with.
// `||` (not `??`) is deliberate: an empty environment variable also falls back to the default.
const BASE_URL = process.env.APIFY_DOCS_ABSOLUTE_URL || 'https://docs.apify.com';
12+
13+
/**
 * Extracts the path component of an absolute URL.
 *
 * The origin (scheme and host), the query string, and the hash fragment are
 * all dropped; only the parsed `pathname` is returned.
 *
 * @param {string} url - Absolute URL, e.g. `https://docs.apify.com/api/v2.md?x=1`.
 * @returns {string} The pathname, e.g. `/api/v2.md`.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
function extractPathFromUrl(url) {
    return new URL(url).pathname;
}
20+
21+
/**
 * Calculates the hierarchical depth of a URL path.
 *
 * The depth is the number of non-empty path segments that do not end in
 * `.md`. Note that every `.md` segment is excluded, not only the last one.
 *
 * Example: `https://docs.apify.com/api/v2/act-get.md` has depth 2
 * (`api`, `v2`).
 *
 * @param {string} url - Absolute URL of a documentation page.
 * @returns {number} Count of directory-like segments in the URL path.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
function getUrlHierarchyDepth(url) {
    return extractPathFromUrl(url)
        .split('/')
        // Empty strings come from the leading, trailing, or doubled slashes.
        .filter((segment) => segment !== '' && !segment.endsWith('.md'))
        .length;
}
32+
33+
/**
 * Determines whether a URL points to a main section page (indentation level 0).
 *
 * A page is a main section page when either:
 *  - its path is listed in `MAIN_SECTIONS`, or
 *  - its path consists of exactly one non-empty segment. The segment's
 *    extension is not checked, so `/platform` qualifies just like `/api.md`.
 *
 * @param {string} url - Absolute URL of a documentation page.
 * @returns {boolean} `true` for main section pages, `false` otherwise.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
function isMainSectionPage(url) {
    const urlPath = extractPathFromUrl(url);

    // Explicitly listed pages are main sections regardless of how deep they sit.
    if (MAIN_SECTIONS.includes(urlPath)) {
        return true;
    }

    // `filter(Boolean)` drops the empty strings produced by leading/trailing slashes.
    const segments = urlPath.split('/').filter(Boolean);
    return segments.length === 1;
}
52+
53+
/**
 * Determines the indentation (in spaces) for a documentation link based on
 * its URL hierarchy.
 *
 * Main section pages are not indented. Every other page is indented by one
 * `INDENT_LEVEL` per directory level of its URL, capped at four levels so
 * that deeply nested routes stay readable.
 *
 * @param {string} url - Absolute URL of the linked documentation page.
 * @returns {number} Number of leading spaces for the link line.
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL.
 */
function getLinkIndentation(url) {
    if (isMainSectionPage(url)) {
        return 0;
    }

    // Upper bound on nesting; anything deeper is flattened onto this level.
    const maxLevels = 4;
    const levels = Math.min(getUrlHierarchyDepth(url), maxLevels);

    return levels * INDENT_LEVEL;
}
68+
69+
/**
 * Determines the indentation (in spaces) for a line based on its content
 * type and, for documentation links, on the linked URL.
 *
 * Rules, checked in order:
 *  1. `# ` and `## ` headings: no indentation.
 *  2. `### ` headings: one level; `#### ` headings: two levels.
 *  3. List items linking to a page under `BASE_URL`: indentation derived
 *     from the URL hierarchy (one level if the URL cannot be extracted).
 *  4. Anything else: same leading whitespace as the previous line, or one
 *     level when there is no previous line.
 *
 * @param {string} line - The line to classify; expected to have no leading whitespace.
 * @param {number} lineIndex - Zero-based position of the line.
 * @param {string[]} allLines - Lines in which `allLines[lineIndex - 1]` is the preceding line.
 * @returns {number} Number of leading spaces the line should receive.
 */
function getIndentationLevel(line, lineIndex, allLines) {
    if (line.startsWith('# ') || line.startsWith('## ')) {
        return 0;
    }

    if (line.startsWith('### ')) {
        return INDENT_LEVEL;
    }

    if (line.startsWith('#### ')) {
        return INDENT_LEVEL * 2;
    }

    // Markdown list item linking into the docs site, e.g.
    // "- [API Reference](https://docs.apify.com/api/v2)" -> "https://docs.apify.com/api/v2"
    if (line.startsWith('- [') && line.includes(`](${BASE_URL}/`)) {
        // Escape regex metacharacters so the base URL is matched literally.
        const escapedBaseUrl = BASE_URL.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        const urlMatch = line.match(new RegExp(`\\]\\((${escapedBaseUrl}/[^)]+)\\)`));

        // No match means the link target is empty after the base URL or is unterminated.
        return urlMatch ? getLinkIndentation(urlMatch[1]) : INDENT_LEVEL;
    }

    // Other content inherits the leading whitespace of the previous line.
    const prevLine = lineIndex > 0 && allLines ? allLines[lineIndex - 1] : undefined;
    if (typeof prevLine !== 'string') {
        return INDENT_LEVEL;
    }

    return prevLine.length - prevLine.trimStart().length;
}
105+
106+
/**
 * Applies hierarchical indentation to content based on URL structure and
 * content type.
 *
 * Each line is stripped of its existing surrounding whitespace and then
 * re-indented with the number of spaces returned by `getIndentationLevel`.
 * Blank (whitespace-only) lines are emitted as empty strings.
 *
 * @param {string} content - Full text of the file, with `\n` line separators.
 * @returns {string} The re-indented text, joined with `\n`.
 */
function indentContent(content) {
    const indentedLines = [];

    for (const [index, rawLine] of content.split('\n').entries()) {
        const trimmedLine = rawLine.trim();

        // Preserve empty lines, without any indentation.
        if (!trimmedLine) {
            indentedLines.push('');
            continue;
        }

        // Pass the already-indented output, not the raw input, so that lines
        // inheriting "the previous line's indentation" follow the indentation
        // computed here. This also makes the transformation idempotent.
        const indent = getIndentationLevel(trimmedLine, index, indentedLines);
        indentedLines.push(' '.repeat(indent) + trimmedLine);
    }

    return indentedLines.join('\n');
}
130+
131+
/**
 * Main function to indent the LLMs file.
 *
 * Reads `LLMS_FILE`, applies `indentContent`, and writes the result back to
 * the same path. A missing file is not an error: it is logged and skipped.
 * Any other failure is logged and terminates the process with exit code 1.
 *
 * @returns {Promise<void>} Resolves once the file is rewritten or skipped.
 */
async function indentLlmsFile() {
    try {
        // No separate existence check: readFile itself rejects with ENOENT
        // when the file is missing, which is handled below.
        const content = await fs.readFile(LLMS_FILE, 'utf8');
        await fs.writeFile(LLMS_FILE, indentContent(content), 'utf8');
        console.log('Successfully indented llms.txt file');
    } catch (error) {
        if (error.code === 'ENOENT') {
            console.log('llms.txt file not found, skipping indentation');
            return;
        }

        console.error('Error indenting llms.txt file:', error);
        process.exit(1);
    }
}
151+
152+
// Script entry point: run the indentation step and wait for it to finish (top-level await).
await indentLlmsFile();

0 commit comments

Comments
 (0)