Skip to content

Commit 9c8baf5

Browse files
committed
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script
1 parent 36b4ea5 commit 9c8baf5

File tree

14 files changed

+294
-100
lines changed

14 files changed

+294
-100
lines changed

apps/portal/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ next-env.d.ts
3939

4040
# generated files
4141
searchIndex.json
42+
public/llms.txt
43+
public/llms-full.txt
4244

4345
.env
4446
public/sitemap*.xml

apps/portal/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@
3434
"date-fns": "4.1.0",
3535
"flexsearch": "^0.7.43",
3636
"github-slugger": "^2.0.0",
37+
"he": "^1.2.0",
3738
"lucide-react": "0.476.0",
3839
"next": "15.2.0",
3940
"nextjs-toploader": "^1.6.12",
41+
"node-html-markdown": "^1.3.0",
4042
"node-html-parser": "^6.1.13",
4143
"posthog-js": "1.67.1",
4244
"prettier": "3.3.3",
@@ -55,6 +57,7 @@
5557
"devDependencies": {
5658
"@next/eslint-plugin-next": "15.2.0",
5759
"@types/flexsearch": "^0.7.6",
60+
"@types/he": "^1.2.3",
5861
"@types/mdx": "^2.0.13",
5962
"@types/node": "22.13.5",
6063
"@types/react": "19.0.10",
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "node:fs";
2-
import { extractSearchData } from "../src/app/api/search/extraction";
2+
import { extractContent } from "../src/app/api/search/extraction";
33

44
/**
 * Build-time entry point: runs the content extraction against the current
 * working directory and writes the three generated artifacts
 * (`searchIndex.json`, `public/llms.txt`, `public/llms-full.txt`).
 *
 * All output paths are relative to the process working directory, the same
 * directory that is handed to `extractContent` as the root.
 */
async function main(): Promise<void> {
  const rootDir = process.cwd();
  const { searchData, llmContent, llmFullContent } =
    await extractContent(rootDir);

  writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
  writeFileSync("./public/llms.txt", llmContent);
  writeFileSync("./public/llms-full.txt", llmFullContent);
}

// Do not leave the promise floating: report the failure with context and make
// the script exit with a non-zero status so the calling build step fails.
main().catch((error: unknown) => {
  console.error("Failed to generate searchIndex.json / llms.txt files:", error);
  process.exitCode = 1;
});

apps/portal/src/app/account/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 178 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { readFile } from "node:fs/promises";
2+
import he from "he";
3+
import { NodeHtmlMarkdown } from "node-html-markdown";
24
import {
35
CommentNode as X_CommentNode,
46
HTMLElement as X_HTMLElement,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113
import { ignoreHeadings } from "./settings";
1214
import { trimExtraSpace } from "./trimExtraSpace";
1315

14-
export async function extractSearchData(rootDir: string): Promise<PageData[]> {
16+
/**
 * Result of one extraction pass, as returned by `extractContent`.
 *
 * - `searchData`: per-page records; the build script serializes them to `searchIndex.json`.
 * - `llmContent`: text the build script writes to `public/llms.txt`.
 * - `llmFullContent`: text the build script writes to `public/llms-full.txt`.
 */
type ExtractedContent = {
17+
searchData: PageData[];
18+
llmContent: string;
19+
llmFullContent: string;
20+
};
21+
22+
/**
 * Header placed at the top of `llms-full.txt`: the project title followed by
 * the one-line summary blockquote.
 */
const llmsFullContentHeader = `\
# thirdweb

> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
`;

/**
 * Header placed at the top of `llms.txt`. It is the shared title/summary
 * header plus the "Docs" section heading under which the per-page links are
 * appended. Derived from `llmsFullContentHeader` so the two files cannot
 * drift apart when the title or summary is edited.
 */
const llmsContentHeader = `${llmsFullContentHeader}\n## Docs\n`;
35+
36+
export async function extractContent(
37+
rootDir: string,
38+
): Promise<ExtractedContent> {
1539
const nextOutputDir = `${rootDir}/.next/server/app`;
1640
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
1741

1842
const pages: PageData[] = [];
43+
let llmContent = "";
44+
let llmFullContent = "";
1945

2046
const noMainFound: string[] = [];
2147
const noH1Found: string[] = [];
@@ -25,8 +51,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2551
const htmlContent = await readFile(filePath, "utf-8");
2652
const mainEl = parse(htmlContent, {
2753
comment: false,
54+
// fixNestedATags: true,
2855
blockTextElements: {
29-
pre: false, // parse text inside <pre> elements instead of treating it as text
56+
pre: true,
3057
},
3158
}).querySelector("main");
3259

@@ -38,24 +65,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3865
}
3966

4067
const noIndex = mainEl.getAttribute("data-noindex");
41-
42-
if (noIndex) {
68+
if (noIndex === "true") {
4369
return;
4470
}
4571

4672
const pageTitle = mainEl.querySelector("h1")?.text;
47-
4873
if (!pageTitle) {
4974
noH1Found.push(
5075
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
5176
);
5277
}
5378

54-
pages.push({
55-
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
56-
title: pageTitle ? trimExtraSpace(pageTitle) : "",
57-
sections: getPageSections(mainEl),
58-
});
79+
// Important: do the search index collection first - we will modify the main element in the next step
80+
// Extract search data
81+
const pageData = extractPageSearchData(
82+
mainEl,
83+
filePath,
84+
nextOutputDir,
85+
pageTitle,
86+
);
87+
if (pageData) {
88+
pages.push(pageData);
89+
}
90+
91+
// Extract LLM content
92+
const { links, full } = extractPageLLMContent(
93+
mainEl,
94+
pageTitle,
95+
filePath,
96+
nextOutputDir,
97+
);
98+
llmContent += links ? `${links}\n` : "";
99+
llmFullContent += full ? `${full}\n` : "";
59100
}),
60101
);
61102

@@ -77,13 +118,136 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77118
console.warn("\n");
78119
}
79120

80-
return pages;
121+
return {
122+
searchData: pages,
123+
llmContent: `${llmsContentHeader}\n${llmContent}`,
124+
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
125+
};
126+
}
127+
128+
/**
 * Builds the search-index record for a single rendered page.
 *
 * @param main - the page's `<main>` element
 * @param filePath - path of the built `.html` file
 * @param nextOutputDir - build output directory prefix that is stripped from `filePath` to form the href
 * @param pageTitle - text of the page's `<h1>`, if one was found
 * @returns the page record, or `null` when `<main>` carries `data-noindex="true"`
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcluded = main.getAttribute("data-noindex") === "true";
  if (isExcluded) {
    return null;
  }

  // href = file path without the output-dir prefix and without ".html"
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
144+
145+
/**
 * Produces the two LLM-facing representations of a single rendered page:
 * a one-line markdown link entry for `llms.txt` and the page body converted
 * to markdown for `llms-full.txt`.
 *
 * NOTE: this function mutates `main` (removes `data-noindex` elements,
 * rewrites links, headings and `<pre>` blocks). Anything that needs the
 * untouched DOM must run before it.
 *
 * @param main - the page's `<main>` element
 * @param pageTitle - text of the page's `<h1>`, if one was found
 * @param filePath - path of the built `.html` file
 * @param nextOutputDir - build output directory prefix stripped from `filePath` to form the page path
 * @returns `links` (the llms.txt entry) and `full` (the markdown body); both
 *   are empty strings when `<main>` has `data-noindex="true"` or
 *   `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const portalOrigin = "https://portal.thirdweb.com";
  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pagePath = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Drop excluded elements first so that neither the description nor the
  // full content can be sourced from a `data-noindex` subtree.
  for (const element of main.querySelectorAll("*")) {
    if (element.getAttribute("data-noindex") === "true") {
      element.remove();
    }
  }

  // Make root-relative links absolute. Protocol-relative URLs ("//host/...")
  // also start with "/" but already name a host, so they are left alone.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `${portalOrigin}${href}`);
    }
  }

  // Description = first paragraph that yields non-empty markdown. Taken after
  // the cleanup above so it contains absolute links only.
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) break;
  }

  // Fall back to the page path when the page has no <h1>, instead of
  // rendering the literal text "undefined" as the link label.
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const linkLabel = title || pagePath;
  const linksContent =
    `* [${linkLabel}](${portalOrigin}${pagePath})` +
    (description ? `: ${description}` : "");

  // Shift every heading one level down (h1 > h2, h2 > h3, ...). h6 is the
  // deepest valid level, so it stays h6 rather than becoming an invalid "H7".
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const level = Number.parseInt(heading.tagName.slice(1), 10);
    heading.tagName = `H${Math.min(level + 1, 6)}`;
  }

  // For code blocks inside <pre>: rebuild them as a plain <pre><code> pair so
  // node-html-markdown emits a fenced block, and carry the language over as a
  // `language-*` class.
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML.toString(), {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Each rendered source line lives in a nested <div>; when the markup does
    // not match that shape, keep the code's plain text instead of emitting an
    // empty block.
    const lineElements = codeBlock.querySelectorAll("div > div > div > div");
    const code = (
      lineElements.length > 0
        ? lineElements.map((lineEl) => lineEl.textContent).join("\n")
        : codeBlock.textContent
    ).trim();

    // Only emit the class attribute when a language is known, and quote and
    // escape it so the generated HTML is always well-formed.
    const lang = codeBlock.getAttribute("lang");
    const classAttr = lang ? ` class="language-${he.escape(lang)}"` : "";

    const newCodePreBlock = parse(
      `<pre><code${classAttr}>${he.encode(code)}</code></pre>`,
    );

    preTag.replaceWith(newCodePreBlock);
  }

  // Convert the cleaned HTML to markdown
  const fullContent = htmlToMarkdown.translate(main.toString());

  return {
    links: linksContent,
    full: fullContent,
  };
}
82244

83-
function getPageSections(main: X_HTMLElement): PageSectionData[] {
245+
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
84246
const sectionData: PageSectionData[] = [];
85247

86-
const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
248+
const ignoreTags = new Set(
249+
["code", "nav", "pre"].map((t) => t.toUpperCase()),
250+
);
87251

88252
function collector(node: X_Node) {
89253
if (node instanceof X_CommentNode) {
@@ -94,9 +258,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94258
return;
95259
}
96260

97-
const noIndexAttribute = node.getAttribute("data-noindex");
98-
99-
if (noIndexAttribute === "true") {
261+
if (node.getAttribute("data-noindex") === "true") {
100262
return;
101263
}
102264

apps/portal/src/app/cli/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/page.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";
66

77
export default function Page() {
88
return (
9-
<main className="container max-w-[900px] grow pb-20">
9+
<main className="container max-w-[900px] grow pb-20" data-noindex>
1010
<Hero />
1111
<div className="grid grid-cols-1 gap-8">
1212
<FrontendSection />

apps/portal/src/app/react-native/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/react/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/typescript/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

0 commit comments

Comments
 (0)