Skip to content

Commit 02fadf7

Browse files
committed
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script
1 parent 56301bc commit 02fadf7

File tree

14 files changed

+379
-260
lines changed

14 files changed

+379
-260
lines changed

apps/portal/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ next-env.d.ts
3939

4040
# generated files
4141
searchIndex.json
42+
public/llms.txt
43+
public/llms-full.txt
4244

4345
.env
4446
public/sitemap*.xml

apps/portal/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@
3434
"date-fns": "4.1.0",
3535
"flexsearch": "^0.7.43",
3636
"github-slugger": "^2.0.0",
37+
"he": "^1.2.0",
3738
"lucide-react": "0.476.0",
3839
"next": "15.2.0",
3940
"nextjs-toploader": "^1.6.12",
41+
"node-html-markdown": "^1.3.0",
4042
"node-html-parser": "^6.1.13",
4143
"posthog-js": "1.67.1",
4244
"prettier": "3.3.3",
@@ -55,6 +57,7 @@
5557
"devDependencies": {
5658
"@next/eslint-plugin-next": "15.2.0",
5759
"@types/flexsearch": "^0.7.6",
60+
"@types/he": "^1.2.3",
5861
"@types/mdx": "^2.0.13",
5962
"@types/node": "22.13.5",
6063
"@types/react": "19.0.10",
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "node:fs";
2-
import { extractSearchData } from "../src/app/api/search/extraction";
2+
import { extractContent } from "../src/app/api/search/extraction";
33

44
async function main() {
55
const rootDir = process.cwd();
6-
const websiteData = await extractSearchData(rootDir);
7-
writeFileSync("./searchIndex.json", JSON.stringify(websiteData, null, 2));
6+
const { searchData, llmContent, llmFullContent } =
7+
await extractContent(rootDir);
8+
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
9+
writeFileSync("./public/llms.txt", llmContent);
10+
writeFileSync("./public/llms-full.txt", llmFullContent);
811
}
912

1013
main();

apps/portal/src/app/account/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 183 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { readFile } from "node:fs/promises";
2+
import he from "he";
3+
import { NodeHtmlMarkdown } from "node-html-markdown";
24
import {
35
CommentNode as X_CommentNode,
46
HTMLElement as X_HTMLElement,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113
import { ignoreHeadings } from "./settings";
1214
import { trimExtraSpace } from "./trimExtraSpace";
1315

14-
export async function extractSearchData(rootDir: string): Promise<PageData[]> {
16+
type ExtractedContent = {
17+
searchData: PageData[];
18+
llmContent: string;
19+
llmFullContent: string;
20+
};
21+
22+
const llmsContentHeader = `\
23+
# thirdweb
24+
25+
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
26+
27+
## Docs
28+
`;
29+
30+
const llmsFullContentHeader = `\
31+
# thirdweb
32+
33+
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
34+
`;
35+
36+
export async function extractContent(
37+
rootDir: string,
38+
): Promise<ExtractedContent> {
1539
const nextOutputDir = `${rootDir}/.next/server/app`;
1640
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
1741

1842
const pages: PageData[] = [];
43+
let llmContent = "";
44+
let llmFullContent = "";
1945

2046
const noMainFound: string[] = [];
2147
const noH1Found: string[] = [];
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2652
const mainEl = parse(htmlContent, {
2753
comment: false,
2854
blockTextElements: {
29-
pre: false, // parse text inside <pre> elements instead of treating it as text
55+
pre: true,
3056
},
3157
}).querySelector("main");
3258

@@ -38,24 +64,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3864
}
3965

4066
const noIndex = mainEl.getAttribute("data-noindex");
41-
42-
if (noIndex) {
67+
if (noIndex === "true") {
4368
return;
4469
}
4570

4671
const pageTitle = mainEl.querySelector("h1")?.text;
47-
4872
if (!pageTitle) {
4973
noH1Found.push(
5074
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
5175
);
5276
}
5377

54-
pages.push({
55-
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
56-
title: pageTitle ? trimExtraSpace(pageTitle) : "",
57-
sections: getPageSections(mainEl),
58-
});
78+
// Important: do the search index collection first - we will modify the main element in the next step
79+
// Extract search data
80+
const pageData = extractPageSearchData(
81+
mainEl,
82+
filePath,
83+
nextOutputDir,
84+
pageTitle,
85+
);
86+
if (pageData) {
87+
pages.push(pageData);
88+
}
89+
90+
// Extract LLM content
91+
const { links, full } = extractPageLLMContent(
92+
mainEl,
93+
pageTitle,
94+
filePath,
95+
nextOutputDir,
96+
);
97+
llmContent += links ? `${links}\n` : "";
98+
llmFullContent += full ? `${full}\n` : "";
5999
}),
60100
);
61101

@@ -77,13 +117,142 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77117
console.warn("\n");
78118
}
79119

80-
return pages;
120+
return {
121+
searchData: pages,
122+
llmContent: `${llmsContentHeader}\n${llmContent}`,
123+
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
124+
};
125+
}
126+
127+
/**
 * Builds the search-index record for a single rendered page.
 *
 * @param main - the page's `<main>` element
 * @param filePath - path of the HTML file the element was parsed from
 * @param nextOutputDir - prefix stripped from `filePath` to form the page href
 * @param pageTitle - text of the page's `<h1>`, if one was found
 * @returns the page record, or `null` when `main` is marked `data-noindex="true"`
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const optedOut = main.getAttribute("data-noindex") === "true";
  if (optedOut) return null;

  // href: file path without the output-dir prefix and without the ".html" suffix
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");

  // a missing or empty title is stored as an empty string
  let title = "";
  if (pageTitle) {
    title = trimExtraSpace(pageTitle);
  }

  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
82143

83-
function getPageSections(main: X_HTMLElement): PageSectionData[] {
144+
/**
 * Produces the two LLM-facing outputs for a single rendered page: a one-line
 * markdown link entry (for llms.txt) and the page body converted to markdown
 * (for llms-full.txt).
 *
 * NOTE: this function mutates `main` in place (removes excluded elements,
 * shifts heading levels, rewrites links, replaces `<pre>` blocks). Read
 * anything that depends on the original tree before calling it.
 *
 * @param main - the page's `<main>` element
 * @param pageTitle - text of the page's `<h1>`, if one was found
 * @param filePath - path of the HTML file the element was parsed from
 * @param nextOutputDir - prefix stripped from `filePath` to form the page URL
 * @returns `links` (the link entry) and `full` (the markdown body); both are
 *   empty strings when `main` is marked `data-noindex="true"` or
 *   `data-no-llm="true"`
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  const portalOrigin = "https://portal.thirdweb.com";

  const isExcluded = (el: X_HTMLElement): boolean =>
    el.getAttribute("data-noindex") === "true" ||
    el.getAttribute("data-no-llm") === "true";

  if (isExcluded(main)) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove noindex / no-llm elements FIRST, so that neither the description
  // nor the full content can pick up text from an excluded element or from
  // anything nested inside one.
  for (const element of main.querySelectorAll("*")) {
    if (isExcluded(element)) {
      element.remove();
    }
  }

  // Description: the first paragraph that is non-empty after conversion
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) break;
  }

  // Fall back to the URL so a page without an <h1> never renders "[undefined]"
  const linkLabel = pageTitle ? trimExtraSpace(pageTitle) : pageUrl;
  const linksContent = `* [${linkLabel}](${pageUrl})${
    description ? `: ${description}` : ""
  }`;

  // Shift all heading elements one step down (h1 > h2, h2 > h3, ...), capped at h6
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const headingLevel = Number.parseInt(heading.tagName.replace("H", ""), 10);
    heading.tagName = `H${Math.min(headingLevel + 1, 6)}`;
  }

  // Prefix site-relative links with the portal origin. Protocol-relative
  // links ("//host/...") already name a host and are left untouched; the
  // fragment, if any, is kept as-is.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `${portalOrigin}${href}`);
    }
  }

  // For code blocks inside pre tags -> rebuild them as <pre><code> with the
  // code as plain text so they are parsed as blocks by node-html-markdown,
  // and add the language class.
  for (const preTag of main.querySelectorAll("pre")) {
    // <pre> content is kept as raw text by the parser, so re-parse it to reach the <code>
    const codeBlock = parse(preTag.innerHTML.toString(), {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) continue;

    // each matched innermost div is treated as one line of code
    const code = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((line) => line.textContent)
      .join("\n")
      .trim();

    const lang = codeBlock.getAttribute("lang");
    // quote the attribute value, and omit the attribute entirely when there is no language
    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
    );
  }

  return {
    links: linksContent,
    // Convert the cleaned HTML to markdown
    full: htmlToMarkdown.translate(main.toString()),
  };
}
249+
250+
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
84251
const sectionData: PageSectionData[] = [];
85252

86-
const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
253+
const ignoreTags = new Set(
254+
["code", "nav", "pre"].map((t) => t.toUpperCase()),
255+
);
87256

88257
function collector(node: X_Node) {
89258
if (node instanceof X_CommentNode) {
@@ -94,9 +263,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94263
return;
95264
}
96265

97-
const noIndexAttribute = node.getAttribute("data-noindex");
98-
99-
if (noIndexAttribute === "true") {
266+
if (node.getAttribute("data-noindex") === "true") {
100267
return;
101268
}
102269

apps/portal/src/app/cli/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/page.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";
66

77
export default function Page() {
88
return (
9-
<main className="container max-w-[900px] grow pb-20">
9+
<main className="container max-w-[900px] grow pb-20" data-noindex>
1010
<Hero />
1111
<div className="grid grid-cols-1 gap-8">
1212
<FrontendSection />

apps/portal/src/app/react-native/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/react/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/typescript/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

0 commit comments

Comments
 (0)