diff --git a/apps/portal/package.json b/apps/portal/package.json index a99c9f41b46..5cc5aa64128 100644 --- a/apps/portal/package.json +++ b/apps/portal/package.json @@ -8,7 +8,8 @@ "dev": "next dev", "prebuild": "pnpm run create-index", "build": "next build", - "postbuild": "pnpm run extract-search-data && pnpm next-sitemap", + "postbuild": "pnpm run extract-search-data && pnpm run extract-llm-content && pnpm next-sitemap", + "extract-llm-content": "pnpm tsx scripts/extractLLMData.ts", "start": "next start", "lint": "biome check ./src && knip && eslint ./src", "fix": "biome check ./src --fix && eslint ./src --fix", diff --git a/apps/portal/scripts/extractLLMData.ts b/apps/portal/scripts/extractLLMData.ts new file mode 100644 index 00000000000..4cc2a77ca0f --- /dev/null +++ b/apps/portal/scripts/extractLLMData.ts @@ -0,0 +1,11 @@ +import { writeFileSync } from "node:fs"; +import { extractContentForLLM } from "../src/app/api/search/extraction/llm-extract"; + +async function main() { + const rootDir = process.cwd(); + const { llmContent, llmFullContent } = await extractContentForLLM(rootDir); + writeFileSync("./public/llms.txt", llmContent); + writeFileSync("./public/llms-full.txt", llmFullContent); +} + +main(); diff --git a/apps/portal/scripts/extractSearchData.ts b/apps/portal/scripts/extractSearchData.ts index 802857428b8..6d8c5fd820e 100644 --- a/apps/portal/scripts/extractSearchData.ts +++ b/apps/portal/scripts/extractSearchData.ts @@ -3,11 +3,8 @@ import { extractContent } from "../src/app/api/search/extraction"; async function main() { const rootDir = process.cwd(); - const { searchData, llmContent, llmFullContent } = - await extractContent(rootDir); + const { searchData } = await extractContent(rootDir); writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2)); - writeFileSync("./public/llms.txt", llmContent); - writeFileSync("./public/llms-full.txt", llmFullContent); } main(); diff --git a/apps/portal/src/app/api/search/extraction/index.ts b/apps/portal/src/app/api/search/extraction/index.ts index 5692ba42dd7..8a2f903a7b6 100644 --- a/apps/portal/src/app/api/search/extraction/index.ts +++ b/apps/portal/src/app/api/search/extraction/index.ts @@ -1,6 +1,4 @@ import { readFile } from "node:fs/promises"; -import he from "he"; -import { NodeHtmlMarkdown } from "node-html-markdown"; import { CommentNode as X_CommentNode, HTMLElement as X_HTMLElement, @@ -15,24 +13,8 @@ import { trimExtraSpace } from "./trimExtraSpace"; type ExtractedContent = { searchData: PageData[]; - llmContent: string; - llmFullContent: string; }; -const llmsContentHeader = `\ -# thirdweb - -> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain. - -## Docs -`; - -const llmsFullContentHeader = `\ -# thirdweb - -> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain. -`; - export async function extractContent( rootDir: string, ): Promise { @@ -40,8 +22,6 @@ export async function extractContent( const htmlFiles = getFilesRecursive(nextOutputDir, "html"); const pages: PageData[] = []; - let llmContent = ""; - let llmFullContent = ""; const noMainFound: string[] = []; const noH1Found: string[] = []; @@ -85,16 +65,6 @@ export async function extractContent( if (pageData) { pages.push(pageData); } - - // Extract LLM content - const { links, full } = extractPageLLMContent( - mainEl, - pageTitle, - filePath, - nextOutputDir, - ); - llmContent += links ? `${links}\n` : ""; - llmFullContent += full ? `${full}\n` : ""; }), ); @@ -118,8 +88,6 @@ export async function extractContent( return { searchData: pages, - llmContent: `${llmsContentHeader}\n${llmContent}`, - llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`, }; } @@ -140,122 +108,6 @@ function extractPageSearchData( }; } -function extractPageLLMContent( - main: X_HTMLElement, - pageTitle: string | undefined, - filePath: string, - nextOutputDir: string, -): { links: string; full: string } { - if ( - main.getAttribute("data-noindex") === "true" || - main.getAttribute("data-no-llm") === "true" - ) { - return { links: "", full: "" }; - } - - const htmlToMarkdown = new NodeHtmlMarkdown({ - keepDataImages: false, - ignore: ["button"], - maxConsecutiveNewlines: 2, - }); - - let linksContent = ""; - let fullContent = ""; - - const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", ""); - - // Get first non-empty paragraph for description - const paragraphs = main.querySelectorAll("p"); - let description = ""; - for (const p of paragraphs) { - // skip noindex or no-llm paragraphs - if (p.closest("[data-noindex]") || p.closest("[data-no-llm]")) { - continue; - } - - description = trimExtraSpace(htmlToMarkdown.translate(p.toString())); - if (description) { - break; - } - } - - linksContent += `* [${pageTitle}](${pageUrl}): ${description || `Reference for ${pageTitle}`}`; - - // Remove noindex and no-llm elements - const contentElements = main.querySelectorAll("*"); - for (const element of contentElements) { - if ( - element.getAttribute("data-noindex") === "true" || - element.getAttribute("data-no-llm") === "true" - ) { - element.remove(); - } - } - - // Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.) - const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6"); - for (const heading of headings) { - const headingLevel = Number.parseInt(heading.tagName.replace("H", "")); - const newLevel = Math.min(headingLevel + 1, 6); - heading.tagName = `H${newLevel}`; - } - - // prefix all the relative links with the `https://portal.thirdweb.com` - const links = main.querySelectorAll("a"); - for (const link of links) { - const href = link.getAttribute("href"); - if (href?.startsWith("/")) { - link.setAttribute("href", `https://portal.thirdweb.com${href}`); - } - } - - // prefix all relative image links with the `https://portal.thirdweb.com` - const images = main.querySelectorAll("img"); - for (const image of images) { - const src = image.getAttribute("src"); - if (src?.startsWith("/")) { - image.setAttribute("src", `https://portal.thirdweb.com${src}`); - } - } - - // for code blocks inside pre tags -> make them direct descendants of the pre tag - // so they are parsed as blocks by node-html-markdown + add language class - const preTags = main.querySelectorAll("pre"); - for (const preTag of preTags) { - const codeBlock = parse(preTag.innerHTML.toString(), { - comment: false, - blockTextElements: { - pre: true, - }, - }).querySelector("code"); - - if (codeBlock) { - const code = codeBlock - .querySelectorAll("div > div > div > div") - .map((x) => x.textContent) - .join("\n") - .trim(); - - const lang = codeBlock.getAttribute("lang"); - codeBlock.textContent = code; - - const newCodePreBlock = parse( - `
${he.encode(code)}
`, - ); - - preTag.replaceWith(newCodePreBlock); - } - } - - // Convert the cleaned HTML to markdown - fullContent += `${htmlToMarkdown.translate(main.toString())}`; - - return { - links: linksContent, - full: fullContent, - }; -} - function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] { const sectionData: PageSectionData[] = []; diff --git a/apps/portal/src/app/api/search/extraction/llm-extract.ts b/apps/portal/src/app/api/search/extraction/llm-extract.ts new file mode 100644 index 00000000000..ed2b7ee424b --- /dev/null +++ b/apps/portal/src/app/api/search/extraction/llm-extract.ts @@ -0,0 +1,291 @@ +import { readFile } from "node:fs/promises"; +import he from "he"; +import { NodeHtmlMarkdown } from "node-html-markdown"; +import { type HTMLElement as X_HTMLElement, parse } from "node-html-parser"; +import type { LinkGroup } from "../../../../components/others/Sidebar"; +import { fetchTypeScriptDoc } from "../../../references/components/TDoc/fetchDocs/fetchTypeScriptDoc"; +import { getSidebarLinkGroups } from "../../../references/components/TDoc/utils/getSidebarLinkgroups"; +import { getFilesRecursive } from "./getFilesRecursive"; +import { trimExtraSpace } from "./trimExtraSpace"; + +type ExtractedContent = { + llmContent: string; + llmFullContent: string; +}; + +const baseUrl = "https://portal.thirdweb.com"; + +const llmsContentHeader = `\ +# __thirdweb TypeScript SDK Documentation__ + +> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain. +`; + +const llmsFullContentHeader = `\ +# __thirdweb TypeScript SDK Documentation__ + +> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain. +`; + +export async function extractContentForLLM( + rootDir: string, +): Promise { + const nextOutputDir = `${rootDir}/.next/server/app`; + const htmlFiles = getFilesRecursive(nextOutputDir, "html"); + + let llmContent = ""; + let llmFullContent = ""; + + const noMainFound: string[] = []; + const noH1Found: string[] = []; + + const doc = await fetchTypeScriptDoc(); + const sidebarLinks = getSidebarLinkGroups(doc, "/references/typescript/v5"); + + async function processSideBarLink(sideBarLink: LinkGroup, level = 0) { + // Add the sidebar link name as header with appropriate formatting based on level + llmContent += + level === 0 + ? `---\n**${sideBarLink.name}**\n` + : `---\n${"#".repeat(Math.min(level, 5))} ${sideBarLink.name}\n`; + llmFullContent += + level === 0 + ? `---\n**${sideBarLink.name}**\n` + : `---\n${"#".repeat(Math.min(level, 5))} ${sideBarLink.name}\n`; + + // Process this link's own href if it exists + if (sideBarLink.href && level > 0) { + await processLinkPath(sideBarLink.href); + } + + // Process all child links + if (sideBarLink.links && sideBarLink.links.length > 0) { + for (const link of sideBarLink.links) { + // Skip separators + if ("separator" in link) { + continue; + } + + // If the link is a group, process it recursively + if ("links" in link) { + await processSideBarLink(link, level + 1); + } else { + // Process the link path if it exists + if (link.href) { + await processLinkPath(link.href); + } + } + } + } + } + + // Helper function to process the content from a link path + async function processLinkPath(filePath: string) { + if (!filePath) { + return; + } + + const htmlFilePath = htmlFiles.find((f) => f.includes(filePath)); + if (!htmlFilePath) { + return; + } + + try { + const htmlContent = await readFile(htmlFilePath, "utf-8"); + const mainEl = parse(htmlContent, { + comment: false, + blockTextElements: { + pre: true, + }, + }).querySelector("main"); + + if (!mainEl) { + noMainFound.push( + filePath.split(".next/server/app")[1]?.replace(".html", "") || "", + ); + return; + } + + if (mainEl.getAttribute("data-noindex") === "true") { + return; + } + + const pageTitle = mainEl.querySelector("h1")?.text; + if (!pageTitle) { + noH1Found.push( + filePath.split(".next/server/app")[1]?.replace(".html", "") || "", + ); + } + + // Extract LLM content + const { links, full } = extractPageLLMContent( + mainEl, + pageTitle, + filePath, + nextOutputDir, + ); + llmContent += links ? `${links}\n` : ""; + llmFullContent += full ? `---\n\n${full}\n` : ""; + } catch (error) { + console.error(`Error processing file ${htmlFilePath}:`, error); + } + } + + for (const sideBarLink of sidebarLinks) { + await processSideBarLink(sideBarLink); + } + + if (noMainFound.length) { + console.warn( + "\n\nNo
element found in below routes, They will not be included in search results :\n", + ); + for (const f of noMainFound) { + console.warn(`* ${f}`); + } + console.warn("\n"); + } + + if (noH1Found.length) { + console.warn("\n\nNo

element found in below routes :\n"); + for (const f of noH1Found) { + console.warn(`* ${f}`); + } + console.warn("\n"); + } + + return { + llmContent: `${llmsContentHeader}\n${llmContent}`, + llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`, + }; +} + +function extractPageLLMContent( + main: X_HTMLElement, + pageTitle: string | undefined, + filePath: string, + nextOutputDir: string, +): { links: string; full: string } { + if ( + main.getAttribute("data-noindex") === "true" || + main.getAttribute("data-no-llm") === "true" + ) { + return { links: "", full: "" }; + } + + const htmlToMarkdown = new NodeHtmlMarkdown({ + keepDataImages: false, + ignore: ["button"], + maxConsecutiveNewlines: 2, + }); + + let linksContent = ""; + let fullContent = ""; + + const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", ""); + + // Get first non-empty paragraph for description + const paragraphs = main.querySelectorAll("p"); + let description = ""; + for (const p of paragraphs) { + // skip noindex or no-llm paragraphs + if (p.closest("[data-noindex]") || p.closest("[data-no-llm]")) { + continue; + } + + description = trimExtraSpace(htmlToMarkdown.translate(p.toString())); + if (description) { + break; + } + } + + linksContent += `* [${pageTitle}](${baseUrl}${pageUrl}): ${description || `Reference for ${pageTitle}`}`; + + // Remove noindex and no-llm elements + const contentElements = main.querySelectorAll("*"); + for (const element of contentElements) { + if ( + element.getAttribute("data-noindex") === "true" || + element.getAttribute("data-no-llm") === "true" + ) { + element.remove(); + } + } + + // Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.) + const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6"); + for (const heading of headings) { + const headingLevel = Number.parseInt(heading.tagName.replace("H", "")); + const newLevel = Math.min(headingLevel + 1, 6); + heading.tagName = `H${newLevel}`; + } + + // prefix all the relative links with the `https://portal.thirdweb.com` + const links = main.querySelectorAll("a"); + for (const link of links) { + const href = link.getAttribute("href"); + if (href?.startsWith("/")) { + link.setAttribute("href", `https://portal.thirdweb.com${href}`); + } + } + + // prefix all relative image links with the `https://portal.thirdweb.com` + const images = main.querySelectorAll("img"); + for (const image of images) { + const src = image.getAttribute("src"); + if (src?.startsWith("/")) { + image.setAttribute("src", `https://portal.thirdweb.com${src}`); + } + } + + // for code blocks inside pre tags -> make them direct descendants of the pre tag + // so they are parsed as blocks by node-html-markdown + add language class + const preTags = main.querySelectorAll("pre"); + for (const preTag of preTags) { + const codeBlock = parse(preTag.innerHTML.toString(), { + comment: false, + blockTextElements: { + pre: true, + }, + }).querySelector("code"); + + if (codeBlock) { + const code = codeBlock + .querySelectorAll("div > div > div > div") + .map((x) => x.textContent) + .join("\n") + .trim(); + + const lang = codeBlock.getAttribute("lang"); + codeBlock.textContent = code; + + const newCodePreBlock = parse( + `
${he.encode(code)}
`, + ); + + preTag.replaceWith(newCodePreBlock); + } + } + + const codeTags = main.querySelectorAll("code"); + for (const codeTag of codeTags) { + const lang = codeTag.getAttribute("lang"); + if (lang) { + const code = codeTag + .querySelectorAll("div > div > div > div") + .map((x) => x.textContent) + .join("\n") + .trim(); + codeTag.replaceWith( + `
${he.encode(code)}
`, + ); + } + } + + // Convert the cleaned HTML to markdown + fullContent += `${htmlToMarkdown.translate(main.toString())}`; + + return { + links: linksContent, + full: fullContent, + }; +}