Skip to content

Commit ebbc521

Browse files
committed
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script
1 parent 56301bc commit ebbc521

File tree

14 files changed

+384
-261
lines changed

14 files changed

+384
-261
lines changed

apps/portal/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ next-env.d.ts
3939

4040
# generated files
4141
searchIndex.json
42+
public/llms.txt
43+
public/llms-full.txt
4244

4345
.env
4446
public/sitemap*.xml

apps/portal/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@
3434
"date-fns": "4.1.0",
3535
"flexsearch": "^0.7.43",
3636
"github-slugger": "^2.0.0",
37+
"he": "^1.2.0",
3738
"lucide-react": "0.476.0",
3839
"next": "15.2.0",
3940
"nextjs-toploader": "^1.6.12",
41+
"node-html-markdown": "^1.3.0",
4042
"node-html-parser": "^6.1.13",
4143
"posthog-js": "1.67.1",
4244
"prettier": "3.3.3",
@@ -55,6 +57,7 @@
5557
"devDependencies": {
5658
"@next/eslint-plugin-next": "15.2.0",
5759
"@types/flexsearch": "^0.7.6",
60+
"@types/he": "^1.2.3",
5861
"@types/mdx": "^2.0.13",
5962
"@types/node": "22.13.5",
6063
"@types/react": "19.0.10",
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "node:fs";
2-
import { extractSearchData } from "../src/app/api/search/extraction";
2+
import { extractContent } from "../src/app/api/search/extraction";
33

44
/**
 * Build-step entry point: runs `extractContent` against the current working
 * directory and writes the three generated artifacts to disk:
 * the search index JSON, `public/llms.txt` and `public/llms-full.txt`.
 */
async function main() {
  const extracted = await extractContent(process.cwd());

  // Search index is pretty-printed with a 2-space indent.
  const searchIndexJson = JSON.stringify(extracted.searchData, null, 2);
  writeFileSync("./searchIndex.json", searchIndexJson);

  // LLM-oriented files are written as-is.
  writeFileSync("./public/llms.txt", extracted.llmContent);
  writeFileSync("./public/llms-full.txt", extracted.llmFullContent);
}

main();

apps/portal/src/app/account/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 188 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { readFile } from "node:fs/promises";
2+
import he from "he";
3+
import { NodeHtmlMarkdown } from "node-html-markdown";
24
import {
35
CommentNode as X_CommentNode,
46
HTMLElement as X_HTMLElement,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113
import { ignoreHeadings } from "./settings";
1214
import { trimExtraSpace } from "./trimExtraSpace";
1315

14-
export async function extractSearchData(rootDir: string): Promise<PageData[]> {
16+
/** Everything `extractContent` produces in a single pass over the rendered pages. */
type ExtractedContent = {
  /** Per-page entries used to build the search index. */
  searchData: PageData[];
  /** Complete text of llms.txt: header followed by one link line per page. */
  llmContent: string;
  /** Complete text of llms-full.txt: header followed by every page as markdown. */
  llmFullContent: string;
};

/**
 * Title and one-line summary shared by llms.txt and llms-full.txt.
 * Kept in one place so the two generated files cannot drift apart.
 */
const llmsSiteSummary = `\
# thirdweb

> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
`;

/** Header of llms.txt: the shared summary followed by the "Docs" section heading. */
const llmsContentHeader = `${llmsSiteSummary}
## Docs
`;

/** Header of llms-full.txt: the shared summary only. */
const llmsFullContentHeader = llmsSiteSummary;
35+
36+
export async function extractContent(
37+
rootDir: string,
38+
): Promise<ExtractedContent> {
1539
const nextOutputDir = `${rootDir}/.next/server/app`;
1640
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
1741

1842
const pages: PageData[] = [];
43+
let llmContent = "";
44+
let llmFullContent = "";
1945

2046
const noMainFound: string[] = [];
2147
const noH1Found: string[] = [];
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2652
const mainEl = parse(htmlContent, {
2753
comment: false,
2854
blockTextElements: {
29-
pre: false, // parse text inside <pre> elements instead of treating it as text
55+
pre: true,
3056
},
3157
}).querySelector("main");
3258

@@ -37,25 +63,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3763
return;
3864
}
3965

40-
const noIndex = mainEl.getAttribute("data-noindex");
41-
42-
if (noIndex) {
66+
if (mainEl.getAttribute("data-noindex") === "true") {
4367
return;
4468
}
4569

4670
const pageTitle = mainEl.querySelector("h1")?.text;
47-
4871
if (!pageTitle) {
4972
noH1Found.push(
5073
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
5174
);
5275
}
5376

54-
pages.push({
55-
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
56-
title: pageTitle ? trimExtraSpace(pageTitle) : "",
57-
sections: getPageSections(mainEl),
58-
});
77+
// Important: do the search index collection first - we will modify the main element in the next step
78+
// Extract search data
79+
const pageData = extractPageSearchData(
80+
mainEl,
81+
filePath,
82+
nextOutputDir,
83+
pageTitle,
84+
);
85+
if (pageData) {
86+
pages.push(pageData);
87+
}
88+
89+
// Extract LLM content
90+
const { links, full } = extractPageLLMContent(
91+
mainEl,
92+
pageTitle,
93+
filePath,
94+
nextOutputDir,
95+
);
96+
llmContent += links ? `${links}\n` : "";
97+
llmFullContent += full ? `${full}\n` : "";
5998
}),
6099
);
61100

@@ -77,13 +116,147 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77116
console.warn("\n");
78117
}
79118

80-
return pages;
119+
return {
120+
searchData: pages,
121+
llmContent: `${llmsContentHeader}\n${llmContent}`,
122+
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
123+
};
81124
}
82125

83-
function getPageSections(main: X_HTMLElement): PageSectionData[] {
126+
/**
 * Builds the search-index entry for a single rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the rendered `.html` file.
 * @param nextOutputDir - Prefix removed from `filePath` to obtain the page href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page entry, or `null` when `main` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcludedFromIndex = main.getAttribute("data-noindex") === "true";
  if (isExcludedFromIndex) {
    return null;
  }

  // "/abs/.next/server/app/foo/bar.html" -> "/foo/bar"
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
142+
143+
function extractPageLLMContent(
144+
main: X_HTMLElement,
145+
pageTitle: string | undefined,
146+
filePath: string,
147+
nextOutputDir: string,
148+
): { links: string; full: string } {
149+
if (
150+
main.getAttribute("data-noindex") === "true" ||
151+
main.getAttribute("data-no-llm") === "true"
152+
) {
153+
return { links: "", full: "" };
154+
}
155+
156+
const htmlToMarkdown = new NodeHtmlMarkdown({
157+
keepDataImages: false,
158+
});
159+
160+
let linksContent = "";
161+
let fullContent = "";
162+
163+
const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
164+
165+
// Get first non-empty paragraph for description
166+
const paragraphs = main.querySelectorAll("p");
167+
let description = "";
168+
for (const p of paragraphs) {
169+
// skip noindex or no-llm paragraphs
170+
if (
171+
p.getAttribute("data-noindex") === "true" ||
172+
p.getAttribute("data-no-llm") === "true"
173+
) {
174+
continue;
175+
}
176+
177+
description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
178+
if (description) {
179+
break;
180+
}
181+
}
182+
183+
linksContent += `* [${pageTitle}](${pageUrl}): ${description}`;
184+
185+
// Remove noindex and no-llm elements
186+
const contentElements = main.querySelectorAll("*");
187+
for (const element of contentElements) {
188+
if (
189+
element.getAttribute("data-noindex") === "true" ||
190+
element.getAttribute("data-no-llm") === "true"
191+
) {
192+
element.remove();
193+
}
194+
}
195+
196+
// Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.)
197+
const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6");
198+
for (const heading of headings) {
199+
const headingLevel = Number.parseInt(heading.tagName.replace("H", ""));
200+
const newLevel = Math.min(headingLevel + 1, 6);
201+
heading.tagName = `H${newLevel}`;
202+
}
203+
204+
// prefix all the relative links with the `https://portal.thirdweb.com`
205+
const links = main.querySelectorAll("a");
206+
for (const link of links) {
207+
const [path, hash] = link.getAttribute("href")?.split("#") || [];
208+
if (path?.startsWith("/")) {
209+
link.setAttribute(
210+
"href",
211+
`https://portal.thirdweb.com${path}${hash ? `#${hash}` : ""}`,
212+
);
213+
}
214+
}
215+
216+
// for code blocks inside pre tags -> make them direct descendants of the pre tag
217+
// so they are parsed as blocks by node-html-markdown + add language class
218+
const preTags = main.querySelectorAll("pre");
219+
for (const preTag of preTags) {
220+
const codeBlock = parse(preTag.innerHTML.toString(), {
221+
comment: false,
222+
blockTextElements: {
223+
pre: true,
224+
},
225+
}).querySelector("code");
226+
227+
if (codeBlock) {
228+
const code = codeBlock
229+
.querySelectorAll("div > div > div > div")
230+
.map((x) => x.textContent)
231+
.join("\n")
232+
.trim();
233+
234+
const lang = codeBlock.getAttribute("lang");
235+
codeBlock.textContent = code;
236+
237+
const newCodePreBlock = parse(
238+
`<pre><code class=${lang ? `language-${lang}` : ""}>${he.encode(code)}</code></pre>`,
239+
);
240+
241+
preTag.replaceWith(newCodePreBlock);
242+
}
243+
}
244+
245+
// Convert the cleaned HTML to markdown
246+
fullContent += `${htmlToMarkdown.translate(main.toString())}`;
247+
248+
return {
249+
links: linksContent,
250+
full: fullContent,
251+
};
252+
}
253+
254+
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
84255
const sectionData: PageSectionData[] = [];
85256

86-
const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
257+
const ignoreTags = new Set(
258+
["code", "nav", "pre"].map((t) => t.toUpperCase()),
259+
);
87260

88261
function collector(node: X_Node) {
89262
if (node instanceof X_CommentNode) {
@@ -94,9 +267,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94267
return;
95268
}
96269

97-
const noIndexAttribute = node.getAttribute("data-noindex");
98-
99-
if (noIndexAttribute === "true") {
270+
if (node.getAttribute("data-noindex") === "true") {
100271
return;
101272
}
102273

apps/portal/src/app/cli/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/page.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";
66

77
export default function Page() {
88
return (
9-
<main className="container max-w-[900px] grow pb-20">
9+
<main className="container max-w-[900px] grow pb-20" data-noindex>
1010
<Hero />
1111
<div className="grid grid-cols-1 gap-8">
1212
<FrontendSection />

apps/portal/src/app/react-native/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/react/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/typescript/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

0 commit comments

Comments
 (0)