Skip to content

Commit 3825563

Browse files
[Portal] Separate LLM content extraction from search data extraction (#6854)
1 parent f0dce0a commit 3825563

File tree

5 files changed

+305
-153
lines changed

5 files changed

+305
-153
lines changed

apps/portal/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"dev": "next dev",
99
"prebuild": "pnpm run create-index",
1010
"build": "next build",
11-
"postbuild": "pnpm run extract-search-data && pnpm next-sitemap",
11+
"postbuild": "pnpm run extract-search-data && pnpm run extract-llm-content && pnpm next-sitemap",
12+
"extract-llm-content": "pnpm tsx scripts/extractLLMData.ts",
1213
"start": "next start",
1314
"lint": "biome check ./src && knip && eslint ./src",
1415
"fix": "biome check ./src --fix && eslint ./src --fix",
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { writeFileSync } from "node:fs";
2+
import { extractContentForLLM } from "../src/app/api/search/extraction/llm-extract";
3+
4+
async function main() {
5+
const rootDir = process.cwd();
6+
const { llmContent, llmFullContent } = await extractContentForLLM(rootDir);
7+
writeFileSync("./public/llms.txt", llmContent);
8+
writeFileSync("./public/llms-full.txt", llmFullContent);
9+
}
10+
11+
main();

apps/portal/scripts/extractSearchData.ts

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,8 @@ import { extractContent } from "../src/app/api/search/extraction";
33

44
async function main() {
55
const rootDir = process.cwd();
6-
const { searchData, llmContent, llmFullContent } =
7-
await extractContent(rootDir);
6+
const { searchData } = await extractContent(rootDir);
87
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
9-
writeFileSync("./public/llms.txt", llmContent);
10-
writeFileSync("./public/llms-full.txt", llmFullContent);
118
}
129

1310
main();

apps/portal/src/app/api/search/extraction/index.ts

Lines changed: 0 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
import { readFile } from "node:fs/promises";
2-
import he from "he";
3-
import { NodeHtmlMarkdown } from "node-html-markdown";
42
import {
53
CommentNode as X_CommentNode,
64
HTMLElement as X_HTMLElement,
@@ -15,33 +13,15 @@ import { trimExtraSpace } from "./trimExtraSpace";
1513

1614
type ExtractedContent = {
1715
searchData: PageData[];
18-
llmContent: string;
19-
llmFullContent: string;
2016
};
2117

22-
const llmsContentHeader = `\
23-
# thirdweb
24-
25-
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
26-
27-
## Docs
28-
`;
29-
30-
const llmsFullContentHeader = `\
31-
# thirdweb
32-
33-
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
34-
`;
35-
3618
export async function extractContent(
3719
rootDir: string,
3820
): Promise<ExtractedContent> {
3921
const nextOutputDir = `${rootDir}/.next/server/app`;
4022
const htmlFiles = getFilesRecursive(nextOutputDir, "html");
4123

4224
const pages: PageData[] = [];
43-
let llmContent = "";
44-
let llmFullContent = "";
4525

4626
const noMainFound: string[] = [];
4727
const noH1Found: string[] = [];
@@ -85,16 +65,6 @@ export async function extractContent(
8565
if (pageData) {
8666
pages.push(pageData);
8767
}
88-
89-
// Extract LLM content
90-
const { links, full } = extractPageLLMContent(
91-
mainEl,
92-
pageTitle,
93-
filePath,
94-
nextOutputDir,
95-
);
96-
llmContent += links ? `${links}\n` : "";
97-
llmFullContent += full ? `${full}\n` : "";
9868
}),
9969
);
10070

@@ -118,8 +88,6 @@ export async function extractContent(
11888

11989
return {
12090
searchData: pages,
121-
llmContent: `${llmsContentHeader}\n${llmContent}`,
122-
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
12391
};
12492
}
12593

@@ -140,122 +108,6 @@ function extractPageSearchData(
140108
};
141109
}
142110

143-
function extractPageLLMContent(
144-
main: X_HTMLElement,
145-
pageTitle: string | undefined,
146-
filePath: string,
147-
nextOutputDir: string,
148-
): { links: string; full: string } {
149-
if (
150-
main.getAttribute("data-noindex") === "true" ||
151-
main.getAttribute("data-no-llm") === "true"
152-
) {
153-
return { links: "", full: "" };
154-
}
155-
156-
const htmlToMarkdown = new NodeHtmlMarkdown({
157-
keepDataImages: false,
158-
ignore: ["button"],
159-
maxConsecutiveNewlines: 2,
160-
});
161-
162-
let linksContent = "";
163-
let fullContent = "";
164-
165-
const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
166-
167-
// Get first non-empty paragraph for description
168-
const paragraphs = main.querySelectorAll("p");
169-
let description = "";
170-
for (const p of paragraphs) {
171-
// skip noindex or no-llm paragraphs
172-
if (p.closest("[data-noindex]") || p.closest("[data-no-llm]")) {
173-
continue;
174-
}
175-
176-
description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
177-
if (description) {
178-
break;
179-
}
180-
}
181-
182-
linksContent += `* [${pageTitle}](${pageUrl}): ${description || `Reference for ${pageTitle}`}`;
183-
184-
// Remove noindex and no-llm elements
185-
const contentElements = main.querySelectorAll("*");
186-
for (const element of contentElements) {
187-
if (
188-
element.getAttribute("data-noindex") === "true" ||
189-
element.getAttribute("data-no-llm") === "true"
190-
) {
191-
element.remove();
192-
}
193-
}
194-
195-
// Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.)
196-
const headings = main.querySelectorAll("h1, h2, h3, h4, h5, h6");
197-
for (const heading of headings) {
198-
const headingLevel = Number.parseInt(heading.tagName.replace("H", ""));
199-
const newLevel = Math.min(headingLevel + 1, 6);
200-
heading.tagName = `H${newLevel}`;
201-
}
202-
203-
// prefix all the relative links with the `https://portal.thirdweb.com`
204-
const links = main.querySelectorAll("a");
205-
for (const link of links) {
206-
const href = link.getAttribute("href");
207-
if (href?.startsWith("/")) {
208-
link.setAttribute("href", `https://portal.thirdweb.com${href}`);
209-
}
210-
}
211-
212-
// prefix all relative image links with the `https://portal.thirdweb.com`
213-
const images = main.querySelectorAll("img");
214-
for (const image of images) {
215-
const src = image.getAttribute("src");
216-
if (src?.startsWith("/")) {
217-
image.setAttribute("src", `https://portal.thirdweb.com${src}`);
218-
}
219-
}
220-
221-
// for code blocks inside pre tags -> make them direct descendants of the pre tag
222-
// so they are parsed as blocks by node-html-markdown + add language class
223-
const preTags = main.querySelectorAll("pre");
224-
for (const preTag of preTags) {
225-
const codeBlock = parse(preTag.innerHTML.toString(), {
226-
comment: false,
227-
blockTextElements: {
228-
pre: true,
229-
},
230-
}).querySelector("code");
231-
232-
if (codeBlock) {
233-
const code = codeBlock
234-
.querySelectorAll("div > div > div > div")
235-
.map((x) => x.textContent)
236-
.join("\n")
237-
.trim();
238-
239-
const lang = codeBlock.getAttribute("lang");
240-
codeBlock.textContent = code;
241-
242-
const newCodePreBlock = parse(
243-
`<pre><code class=${lang ? `language-${lang}` : ""}>${he.encode(code)}</code></pre>`,
244-
);
245-
246-
preTag.replaceWith(newCodePreBlock);
247-
}
248-
}
249-
250-
// Convert the cleaned HTML to markdown
251-
fullContent += `${htmlToMarkdown.translate(main.toString())}`;
252-
253-
return {
254-
links: linksContent,
255-
full: fullContent,
256-
};
257-
}
258-
259111
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
260112
const sectionData: PageSectionData[] = [];
261113

0 commit comments

Comments (0)