Skip to content

Commit 3825563

Browse files
[Portal] Separate LLM content extraction from search data extraction (#6854)
1 parent f0dce0a commit 3825563

File tree

5 files changed

+305
-153
lines changed

5 files changed

+305
-153
lines changed

apps/portal/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"dev": "next dev",
99
"prebuild": "pnpm run create-index",
1010
"build": "next build",
11-
"postbuild": "pnpm run extract-search-data && pnpm next-sitemap",
11+
"postbuild": "pnpm run extract-search-data && pnpm run extract-llm-content && pnpm next-sitemap",
12+
"extract-llm-content": "pnpm tsx scripts/extractLLMData.ts",
1213
"start": "next start",
1314
"lint": "biome check ./src && knip && eslint ./src",
1415
"fix": "biome check ./src --fix && eslint ./src --fix",
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { writeFileSync } from "node:fs";
2+
import { extractContentForLLM } from "../src/app/api/search/extraction/llm-extract";
3+
4+
async function main() {
5+
const rootDir = process.cwd();
6+
const { llmContent, llmFullContent } = await extractContentForLLM(rootDir);
7+
writeFileSync("./public/llms.txt", llmContent);
8+
writeFileSync("./public/llms-full.txt", llmFullContent);
9+
}
10+
11+
main();

apps/portal/scripts/extractSearchData.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,8 @@ import { extractContent } from "../src/app/api/search/extraction";
33

44
async function main() {
55
const rootDir = process.cwd();
6-
const { searchData, llmContent, llmFullContent } =
7-
await extractContent(rootDir);
6+
const { searchData } = await extractContent(rootDir);
87
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
9-
writeFileSync("./public/llms.txt", llmContent);
10-
writeFileSync("./public/llms-full.txt", llmFullContent);
118
}
129

1310
main();

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 0 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import { readFile } from "node:fs/promises";
2-
import he from "he";
3-
import { NodeHtmlMarkdown } from "node-html-markdown";
42
import {
53
CommentNode as X_CommentNode,
64
HTMLElement as X_HTMLElement,
@@ -15,33 +13,15 @@ import { trimExtraSpace } from "./trimExtraSpace";
1513

1614
type ExtractedContent = {
1715
searchData: PageData[];
18-
llmContent: string;
19-
llmFullContent: string;
2016
};
2117

22-
const llmsContentHeader = `\
23-
# thirdweb
24-
25-
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
26-
27-
## Docs
28-
`;
29-
30-
const llmsFullContentHeader = `\
31-
# thirdweb
32-
33-
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
34-
`;
35-
3618
export async function extractContent(
3719
rootDir: string,
3820
): Promise<ExtractedContent> {
3921
const nextOutputDir = `${rootDir}/.next/server/app`;
4022
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
4123

4224
const pages: PageData[] = [];
43-
let llmContent = "";
44-
let llmFullContent = "";
4525

4626
const noMainFound: string[] = [];
4727
const noH1Found: string[] = [];
@@ -85,16 +65,6 @@ export async function extractContent(
8565
if (pageData) {
8666
pages.push(pageData);
8767
}
88-
89-
// Extract LLM content
90-
const { links, full } = extractPageLLMContent(
91-
mainEl,
92-
pageTitle,
93-
filePath,
94-
nextOutputDir,
95-
);
96-
llmContent += links ? `${links}\n` : "";
97-
llmFullContent += full ? `${full}\n` : "";
9868
}),
9969
);
10070

@@ -118,8 +88,6 @@ export async function extractContent(
11888

11989
return {
12090
searchData: pages,
121-
llmContent: `${llmsContentHeader}\n${llmContent}`,
122-
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
12391
};
12492
}
12593

@@ -140,122 +108,6 @@ function extractPageSearchData(
140108
};
141109
}
142110

143-
function extractPageLLMContent(
144-
main: X_HTMLElement,
145-
pageTitle: string | undefined,
146-
filePath: string,
147-
nextOutputDir: string,
148-
): { links: string; full: string } {
149-
if (
150-
main.getAttribute("data-noindex") === "true" ||
151-
main.getAttribute("data-no-llm") === "true"
152-
) {
153-
return { links: "", full: "" };
154-
}
155-
156-
const htmlToMarkdown = new NodeHtmlMarkdown({
157-
keepDataImages: false,
158-
ignore: ["button"],
159-
maxConsecutiveNewlines: 2,
160-
});
161-
162-
let linksContent = "";
163-
let fullContent = "";
164-
165-
const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
166-
167-
// Get first non-empty paragraph for description
168-
const paragraphs = main.querySelectorAll("p");
169-
let description = "";
170-
for (const p of paragraphs) {
171-
// skip noindex or no-llm paragraphs
172-
if (p.closest("[data-noindex]") || p.closest("[data-no-llm]")) {
173-
continue;
174-
}
175-
176-
description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
177-
if (description) {
178-
break;
179-
}
180-
}
181-
182-
linksContent += `* [${pageTitle}](${pageUrl}): ${description || `Reference for ${pageTitle}`}`;
183-
184-
// Remove noindex and no-llm elements
185-
const contentElements = main.querySelectorAll("*");
186-
for (const element of contentElements) {
187-
if (
188-
element.getAttribute("data-noindex") === "true" ||
189-
element.getAttribute("data-no-llm") === "true"
190-
) {
191-
element.remove();
192-
}
193-
}
194-
195-
// Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.)
196-
const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6");
197-
for (const heading of headings) {
198-
const headingLevel = Number.parseInt(heading.tagName.replace("H", ""));
199-
const newLevel = Math.min(headingLevel + 1, 6);
200-
heading.tagName = `H${newLevel}`;
201-
}
202-
203-
// prefix all the relative links with the `https://portal.thirdweb.com`
204-
const links = main.querySelectorAll("a");
205-
for (const link of links) {
206-
const href = link.getAttribute("href");
207-
if (href?.startsWith("/")) {
208-
link.setAttribute("href", `https://portal.thirdweb.com${href}`);
209-
}
210-
}
211-
212-
// prefix all relative image links with the `https://portal.thirdweb.com`
213-
const images = main.querySelectorAll("img");
214-
for (const image of images) {
215-
const src = image.getAttribute("src");
216-
if (src?.startsWith("/")) {
217-
image.setAttribute("src", `https://portal.thirdweb.com${src}`);
218-
}
219-
}
220-
221-
// for code blocks inside pre tags -> make them direct descendants of the pre tag
222-
// so they are parsed as blocks by node-html-markdown + add language class
223-
const preTags = main.querySelectorAll("pre");
224-
for (const preTag of preTags) {
225-
const codeBlock = parse(preTag.innerHTML.toString(), {
226-
comment: false,
227-
blockTextElements: {
228-
pre: true,
229-
},
230-
}).querySelector("code");
231-
232-
if (codeBlock) {
233-
const code = codeBlock
234-
.querySelectorAll("div > div > div > div")
235-
.map((x) => x.textContent)
236-
.join("\n")
237-
.trim();
238-
239-
const lang = codeBlock.getAttribute("lang");
240-
codeBlock.textContent = code;
241-
242-
const newCodePreBlock = parse(
243-
`<pre><code class=${lang ? `language-${lang}` : ""}>${he.encode(code)}</code></pre>`,
244-
);
245-
246-
preTag.replaceWith(newCodePreBlock);
247-
}
248-
}
249-
250-
// Convert the cleaned HTML to markdown
251-
fullContent += `${htmlToMarkdown.translate(main.toString())}`;
252-
253-
return {
254-
links: linksContent,
255-
full: fullContent,
256-
};
257-
}
258-
259111
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
260112
const sectionData: PageSectionData[] = [];
261113

0 commit comments

Comments (0)