Skip to content

Commit 035af9b

Browse files
committed
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script
1 parent 36b4ea5 commit 035af9b

File tree

13 files changed

+246
-76
lines changed

13 files changed

+246
-76
lines changed

apps/portal/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ next-env.d.ts
3939

4040
# generated files
4141
searchIndex.json
42+
public/llms.txt
43+
public/llms-full.txt
4244

4345
.env
4446
public/sitemap*.xml

apps/portal/package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,17 @@
2929
"@radix-ui/react-tabs": "^1.1.3",
3030
"@tanstack/react-query": "5.66.9",
3131
"@tryghost/content-api": "^1.11.21",
32+
"@types/he": "^1.2.3",
3233
"class-variance-authority": "^0.7.1",
3334
"clsx": "^2.1.1",
3435
"date-fns": "4.1.0",
3536
"flexsearch": "^0.7.43",
3637
"github-slugger": "^2.0.0",
38+
"he": "^1.2.0",
3739
"lucide-react": "0.476.0",
3840
"next": "15.2.0",
3941
"nextjs-toploader": "^1.6.12",
42+
"node-html-markdown": "^1.3.0",
4043
"node-html-parser": "^6.1.13",
4144
"posthog-js": "1.67.1",
4245
"prettier": "3.3.3",
Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
import { writeFileSync } from "node:fs";
2-
import { extractSearchData } from "../src/app/api/search/extraction";
2+
import { extractContent } from "../src/app/api/search/extraction";
33

44
/**
 * Build-time entry point: runs the content extraction against the current
 * working directory and writes the three generated artifacts.
 *
 * Outputs (paths are relative to the process working directory):
 * - `./searchIndex.json`     – pretty-printed (2-space) JSON of the search data
 * - `./public/llms.txt`      – the per-page link list
 * - `./public/llms-full.txt` – the full markdown dump
 */
async function main() {
  const extracted = await extractContent(process.cwd());

  // Serialise everything up front, then write in a fixed order.
  const outputs: Array<[string, string]> = [
    ["./searchIndex.json", JSON.stringify(extracted.searchData, null, 2)],
    ["./public/llms.txt", extracted.llmContent],
    ["./public/llms-full.txt", extracted.llmFullContent],
  ];

  for (const [targetPath, contents] of outputs) {
    writeFileSync(targetPath, contents);
  }
}
912

1013
main();

apps/portal/src/app/account/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 154 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { readFile } from "node:fs/promises";
2+
import he from "he";
3+
import { NodeHtmlMarkdown } from "node-html-markdown";
24
import {
35
CommentNode as X_CommentNode,
46
HTMLElement as X_HTMLElement,
@@ -11,11 +13,21 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113
import { ignoreHeadings } from "./settings";
1214
import { trimExtraSpace } from "./trimExtraSpace";
1315

14-
export async function extractSearchData(rootDir: string): Promise<PageData[]> {
16+
/**
 * Everything `extractContent` produces from one pass over the rendered pages.
 */
type ExtractedContent = {
  /** One entry per indexed page, serialised into `searchIndex.json` by the build script. */
  searchData: PageData[];
  /** Accumulated per-page link lines (the `links` part of each page's LLM content). */
  llmContent: string;
  /** Accumulated per-page markdown (the `full` part of each page's LLM content). */
  llmFullContent: string;
};
21+
22+
export async function extractContent(
23+
rootDir: string,
24+
): Promise<ExtractedContent> {
1525
const nextOutputDir = `${rootDir}/.next/server/app`;
1626
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
1727

1828
const pages: PageData[] = [];
29+
let llmContent = "";
30+
let llmFullContent = "";
1931

2032
const noMainFound: string[] = [];
2133
const noH1Found: string[] = [];
@@ -25,8 +37,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2537
const htmlContent = await readFile(filePath, "utf-8");
2638
const mainEl = parse(htmlContent, {
2739
comment: false,
40+
// fixNestedATags: true,
2841
blockTextElements: {
29-
pre: false, // parse text inside <pre> elements instead of treating it as text
42+
pre: true,
3043
},
3144
}).querySelector("main");
3245

@@ -38,24 +51,37 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3851
}
3952

4053
const noIndex = mainEl.getAttribute("data-noindex");
41-
42-
if (noIndex) {
54+
if (noIndex === "true") {
4355
return;
4456
}
4557

4658
const pageTitle = mainEl.querySelector("h1")?.text;
47-
4859
if (!pageTitle) {
4960
noH1Found.push(
5061
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
5162
);
5263
}
5364

54-
pages.push({
55-
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
56-
title: pageTitle ? trimExtraSpace(pageTitle) : "",
57-
sections: getPageSections(mainEl),
58-
});
65+
// Extract search data
66+
const pageData = extractPageSearchData(
67+
mainEl,
68+
filePath,
69+
nextOutputDir,
70+
pageTitle,
71+
);
72+
if (pageData) {
73+
pages.push(pageData);
74+
}
75+
76+
// Extract LLM content
77+
const { links, full } = extractPageLLMContent(
78+
mainEl,
79+
pageTitle,
80+
filePath,
81+
nextOutputDir,
82+
);
83+
llmContent += links ? `${links}\n` : "";
84+
llmFullContent += full ? `${full}\n` : "";
5985
}),
6086
);
6187

@@ -77,13 +103,127 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77103
console.warn("\n");
78104
}
79105

80-
return pages;
106+
return {
107+
searchData: pages,
108+
llmContent,
109+
llmFullContent,
110+
};
81111
}
82112

83-
function getPageSections(main: X_HTMLElement): PageSectionData[] {
113+
/**
 * Builds the search-index record for one rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the rendered `.html` file.
 * @param nextOutputDir - Output directory; its first occurrence is stripped from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page record, or `null` when `<main>` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcluded = main.getAttribute("data-noindex") === "true";
  if (isExcluded) {
    return null;
  }

  // href is the file path with the output dir and the ".html" marker removed.
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  // A missing title is recorded as an empty string rather than dropped.
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
129+
130+
/**
 * Formats the single llms.txt bullet for a page.
 * Falls back to the page URL when there is no usable title, and omits the
 * trailing `: description` part when no description was found.
 */
function formatLLMLinkLine(
  pageTitle: string | undefined,
  pageUrl: string,
  description: string,
): string {
  const title = (pageTitle ? trimExtraSpace(pageTitle) : "") || pageUrl;
  const link = `* [${title}](${pageUrl})`;
  return description ? `${link}: ${description}` : link;
}

/**
 * Builds a flat `<pre><code>` block from plain code text.
 * The class attribute is quoted, and left out entirely when no language is known.
 */
function buildCodeBlockHtml(code: string, lang: string | undefined): string {
  const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";
  return `<pre><code${classAttr}>${he.encode(code)}</code></pre>`;
}

/**
 * Produces the LLM-facing content for one rendered page.
 *
 * NOTE: this mutates `main` in place (removes `data-noindex` elements, rewrites
 * link hrefs, replaces `<pre>` elements), so it must run after anything that
 * needs the untouched DOM.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the rendered `.html` file.
 * @param nextOutputDir - Output directory; stripped from `filePath` to form the page URL.
 * @returns `links`: one markdown bullet for the page; `full`: the page converted
 *   to markdown. Both are empty strings when `<main>` has `data-noindex="true"`
 *   or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({});

  // NOTE(review): this URL stays root-relative while links inside the page body
  // are made absolute below — confirm whether llms.txt should use absolute URLs.
  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove opted-out subtrees first, so that neither the description nor the
  // full markdown can pick up text nested inside a data-noindex element.
  for (const element of main.querySelectorAll("*")) {
    if (element.getAttribute("data-noindex") === "true") {
      element.remove();
    }
  }

  // The first paragraph that yields non-empty text becomes the description.
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) break;
  }

  const linksContent = formatLLMLinkLine(pageTitle, pageUrl, description);

  // Prefix all root-relative links with `https://portal.thirdweb.com`.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // Flatten code blocks: make <code> a direct child of <pre> and add a
  // language class, so node-html-markdown treats them as fenced blocks.
  for (const preTag of main.querySelectorAll("pre")) {
    // <pre> content is kept as raw text by the caller's parse options, so it
    // is re-parsed here to reach the nested <code> element.
    const codeBlock = parse(preTag.innerHTML.toString(), {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) continue;

    // Presumably each `div > div > div` is one rendered line of code — confirm
    // against the code-block component. Empty lines are dropped.
    const code = codeBlock
      .querySelectorAll("div > div > div")
      .map((line) => line.textContent)
      .filter((line) => line !== "")
      .join("\n");

    preTag.replaceWith(
      parse(buildCodeBlockHtml(code, codeBlock.getAttribute("lang"))),
    );
  }

  // Convert the cleaned HTML to markdown.
  return {
    links: linksContent,
    full: htmlToMarkdown.translate(main.toString()),
  };
}
220+
221+
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
84222
const sectionData: PageSectionData[] = [];
85223

86-
const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
224+
const ignoreTags = new Set(
225+
["code", "nav", "pre"].map((t) => t.toUpperCase()),
226+
);
87227

88228
function collector(node: X_Node) {
89229
if (node instanceof X_CommentNode) {
@@ -94,9 +234,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94234
return;
95235
}
96236

97-
const noIndexAttribute = node.getAttribute("data-noindex");
98-
99-
if (noIndexAttribute === "true") {
237+
if (node.getAttribute("data-noindex") === "true") {
100238
return;
101239
}
102240

apps/portal/src/app/cli/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
33

44
export default async function Layout(props: { children: React.ReactNode }) {
55
return (
6-
<DocLayout sideBar={sidebar} editPageButton={true}>
6+
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
77
{props.children}
88
</DocLayout>
99
);

apps/portal/src/app/react-native/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/react/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/app/typescript/v5/layout.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
1515
</div>
1616
}
1717
>
18-
<div data-noindex>{props.children}</div>
18+
<div>{props.children}</div>
1919
</DocLayout>
2020
);
2121
}

apps/portal/src/components/Document/Cards/ArticleCard.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ export function ArticleCard(props: {
1111
const isExternal = props.href.startsWith("http");
1212
return (
1313
<Link
14+
data-noindex
1415
href={props.href}
1516
className="flex cursor-default bg-card"
1617
target={isExternal ? "_blank" : undefined}
@@ -38,6 +39,7 @@ export function ArticleIconCard(props: {
3839
const isExternal = props.href.startsWith("http");
3940
return (
4041
<Link
42+
data-noindex
4143
href={props.href}
4244
className={cn(
4345
"flex items-center gap-4 rounded-lg border bg-card p-4 transition-colors hover:border-active-border",

0 commit comments

Comments
 (0)