[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script

MananTank · MananTank · commit 035af9b489d0 · 2025-02-28T20:27:20.000+05:30
diff --git a/apps/portal/.gitignore b/apps/portal/.gitignore
@@ -39,6 +39,8 @@ next-env.d.ts
 
 # generated files
 searchIndex.json
+public/llms.txt
+public/llms-full.txt
 
 .env
 public/sitemap*.xml
diff --git a/apps/portal/package.json b/apps/portal/package.json
@@ -29,14 +29,17 @@
     "@radix-ui/react-tabs": "^1.1.3",
     "@tanstack/react-query": "5.66.9",
     "@tryghost/content-api": "^1.11.21",
+    "@types/he": "^1.2.3",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
     "date-fns": "4.1.0",
     "flexsearch": "^0.7.43",
     "github-slugger": "^2.0.0",
+    "he": "^1.2.0",
     "lucide-react": "0.476.0",
     "next": "15.2.0",
     "nextjs-toploader": "^1.6.12",
+    "node-html-markdown": "^1.3.0",
     "node-html-parser": "^6.1.13",
     "posthog-js": "1.67.1",
     "prettier": "3.3.3",
diff --git a/apps/portal/scripts/extractSearchData.ts b/apps/portal/scripts/extractSearchData.ts
@@ -1,10 +1,13 @@
 import { writeFileSync } from "node:fs";
-import { extractSearchData } from "../src/app/api/search/extraction";
+import { extractContent } from "../src/app/api/search/extraction";
 
 async function main() {
   const rootDir = process.cwd();
-  const websiteData = await extractSearchData(rootDir);
-  writeFileSync("./searchIndex.json", JSON.stringify(websiteData, null, 2));
+  const { searchData, llmContent, llmFullContent } =
+    await extractContent(rootDir);
+  writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
+  writeFileSync("./public/llms.txt", llmContent);
+  writeFileSync("./public/llms-full.txt", llmFullContent);
 }
 
 main();
diff --git a/apps/portal/src/app/account/layout.tsx b/apps/portal/src/app/account/layout.tsx
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
 
 export default async function Layout(props: { children: React.ReactNode }) {
   return (
-    <DocLayout sideBar={sidebar} editPageButton={true}>
+    <DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
       {props.children}
     </DocLayout>
   );
diff --git a/apps/portal/src/app/api/search/extraction/index.ts b/apps/portal/src/app/api/search/extraction/index.ts
@@ -1,4 +1,6 @@
 import { readFile } from "node:fs/promises";
+import he from "he";
+import { NodeHtmlMarkdown } from "node-html-markdown";
 import {
   CommentNode as X_CommentNode,
   HTMLElement as X_HTMLElement,
@@ -11,11 +13,21 @@ import { getFilesRecursive } from "./getFilesRecursive";
 import { ignoreHeadings } from "./settings";
 import { trimExtraSpace } from "./trimExtraSpace";
 
-export async function extractSearchData(rootDir: string): Promise<PageData[]> {
+type ExtractedContent = {
+  searchData: PageData[];
+  llmContent: string;
+  llmFullContent: string;
+};
+
+export async function extractContent(
+  rootDir: string,
+): Promise<ExtractedContent> {
   const nextOutputDir = `${rootDir}/.next/server/app`;
   const htmlFiles = getFilesRecursive(nextOutputDir, "html");
 
   const pages: PageData[] = [];
+  let llmContent = "";
+  let llmFullContent = "";
 
   const noMainFound: string[] = [];
   const noH1Found: string[] = [];
@@ -25,8 +37,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
       const htmlContent = await readFile(filePath, "utf-8");
       const mainEl = parse(htmlContent, {
         comment: false,
+        // fixNestedATags: true,
         blockTextElements: {
-          pre: false, // parse text inside <pre> elements instead of treating it as text
+          pre: true,
         },
       }).querySelector("main");
 
@@ -38,24 +51,37 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
       }
 
       const noIndex = mainEl.getAttribute("data-noindex");
-
-      if (noIndex) {
+      if (noIndex === "true") {
         return;
       }
 
       const pageTitle = mainEl.querySelector("h1")?.text;
-
       if (!pageTitle) {
         noH1Found.push(
           filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
         );
       }
 
-      pages.push({
-        href: filePath.replace(nextOutputDir, "").replace(".html", ""),
-        title: pageTitle ? trimExtraSpace(pageTitle) : "",
-        sections: getPageSections(mainEl),
-      });
+      // Extract search data
+      const pageData = extractPageSearchData(
+        mainEl,
+        filePath,
+        nextOutputDir,
+        pageTitle,
+      );
+      if (pageData) {
+        pages.push(pageData);
+      }
+
+      // Extract LLM content
+      const { links, full } = extractPageLLMContent(
+        mainEl,
+        pageTitle,
+        filePath,
+        nextOutputDir,
+      );
+      llmContent += links ? `${links}\n` : "";
+      llmFullContent += full ? `${full}\n` : "";
     }),
   );
 
@@ -77,13 +103,127 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
     console.warn("\n");
   }
 
-  return pages;
+  return {
+    searchData: pages,
+    llmContent,
+    llmFullContent,
+  };
 }
 
-function getPageSections(main: X_HTMLElement): PageSectionData[] {
+/**
+ * Builds the search-index entry for a single built page.
+ *
+ * @param main - The page's parsed `<main>` element.
+ * @param filePath - Path of the built HTML file for this page.
+ * @param nextOutputDir - Output directory prefix stripped from `filePath` to form the href.
+ * @param pageTitle - Text of the page's `<h1>`, if one was found.
+ * @returns The page's search data, or `null` when `<main>` has `data-noindex="true"`.
+ */
+function extractPageSearchData(
+  main: X_HTMLElement,
+  filePath: string,
+  nextOutputDir: string,
+  pageTitle: string | undefined,
+): PageData | null {
+  const isExcluded = main.getAttribute("data-noindex") === "true";
+  if (isExcluded) {
+    return null;
+  }
+
+  // href is the file path with the output dir prefix and the ".html" suffix dropped
+  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
+  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
+  const sections = getPageSectionsForSearchIndex(main);
+
+  return { href, title, sections };
+}
+
+/**
+ * Builds the LLM-facing content for a single built page: a one-line markdown
+ * link entry (for llms.txt) and the full page converted to markdown
+ * (for llms-full.txt).
+ *
+ * NOTE: this mutates `main` in place (removes `data-noindex` elements, rewrites
+ * relative links and `<pre>` blocks), so run any other extraction that needs
+ * the untouched tree before calling it.
+ *
+ * @param main - The page's parsed `<main>` element.
+ * @param pageTitle - Text of the page's `<h1>`, if one was found.
+ * @param filePath - Path of the built HTML file for this page.
+ * @param nextOutputDir - Output directory prefix stripped from `filePath` to form the page URL.
+ * @returns `links` (a single `* [title](url): description` list item) and
+ *   `full` (markdown of the cleaned page). Both are empty strings when `<main>`
+ *   has `data-noindex="true"` or `data-no-llm="true"`.
+ */
+function extractPageLLMContent(
+  main: X_HTMLElement,
+  pageTitle: string | undefined,
+  filePath: string,
+  nextOutputDir: string,
+): { links: string; full: string } {
+  if (
+    main.getAttribute("data-noindex") === "true" ||
+    main.getAttribute("data-no-llm") === "true"
+  ) {
+    return { links: "", full: "" };
+  }
+
+  const htmlToMarkdown = new NodeHtmlMarkdown({});
+  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");
+
+  // Remove noindex elements BEFORE picking the description, so a paragraph
+  // nested inside a noindex container can never become the page description.
+  for (const element of main.querySelectorAll("*")) {
+    if (element.getAttribute("data-noindex") === "true") {
+      element.remove();
+    }
+  }
+
+  // Use the first paragraph that yields non-empty markdown as the description
+  let description = "";
+  for (const p of main.querySelectorAll("p")) {
+    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
+    if (description) {
+      break;
+    }
+  }
+
+  // Pages without an <h1> fall back to their URL as link text instead of
+  // rendering "[undefined]"; the title is trimmed so the link stays on one line.
+  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
+  const descriptionSuffix = description ? `: ${description}` : "";
+  const linksContent = `* [${title || pageUrl}](${pageUrl})${descriptionSuffix}`;
+
+  // prefix all the relative links with the `https://portal.thirdweb.com`
+  // (protocol-relative `//host/...` links are already absolute, leave them)
+  for (const link of main.querySelectorAll("a")) {
+    const href = link.getAttribute("href");
+    if (href?.startsWith("/") && !href.startsWith("//")) {
+      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
+    }
+  }
+
+  // for code blocks inside pre tags -> make them direct descendants of the pre tag
+  // so they are parsed as blocks by node-html-markdown + add language class
+  for (const preTag of main.querySelectorAll("pre")) {
+    // <pre> content is re-parsed here to reach the nested <code> element
+    const codeBlock = parse(preTag.innerHTML, {
+      comment: false,
+      blockTextElements: {
+        pre: true,
+      },
+    }).querySelector("code");
+
+    if (!codeBlock) {
+      continue;
+    }
+
+    // Each rendered code line is a `div > div > div`; join the non-empty ones
+    const code = codeBlock
+      .querySelectorAll("div > div > div")
+      .map((line) => line.textContent)
+      .filter((line) => line !== "")
+      .join("\n");
+
+    // Quote the class attribute and omit it entirely when there is no language,
+    // rather than emitting a dangling `class=`.
+    const lang = codeBlock.getAttribute("lang");
+    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";
+
+    preTag.replaceWith(
+      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
+    );
+  }
+
+  // Convert the cleaned HTML to markdown
+  return {
+    links: linksContent,
+    full: htmlToMarkdown.translate(main.toString()),
+  };
+}
+
+function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
   const sectionData: PageSectionData[] = [];
 
-  const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
+  const ignoreTags = new Set(
+    ["code", "nav", "pre"].map((t) => t.toUpperCase()),
+  );
 
   function collector(node: X_Node) {
     if (node instanceof X_CommentNode) {
@@ -94,9 +234,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
         return;
       }
 
-      const noIndexAttribute = node.getAttribute("data-noindex");
-
-      if (noIndexAttribute === "true") {
+      if (node.getAttribute("data-noindex") === "true") {
         return;
       }
 
diff --git a/apps/portal/src/app/cli/layout.tsx b/apps/portal/src/app/cli/layout.tsx
@@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";
 
 export default async function Layout(props: { children: React.ReactNode }) {
   return (
-    <DocLayout sideBar={sidebar} editPageButton={true}>
+    <DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
       {props.children}
     </DocLayout>
   );
diff --git a/apps/portal/src/app/react-native/v5/layout.tsx b/apps/portal/src/app/react-native/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/app/react/v5/layout.tsx b/apps/portal/src/app/react/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/app/typescript/v5/layout.tsx b/apps/portal/src/app/typescript/v5/layout.tsx
@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
         </div>
       }
     >
-      <div data-noindex>{props.children}</div>
+      <div>{props.children}</div>
     </DocLayout>
   );
 }
diff --git a/apps/portal/src/components/Document/Cards/ArticleCard.tsx b/apps/portal/src/components/Document/Cards/ArticleCard.tsx
@@ -11,6 +11,7 @@ export function ArticleCard(props: {
   const isExternal = props.href.startsWith("http");
   return (
     <Link
+      data-noindex
       href={props.href}
       className="flex cursor-default bg-card"
       target={isExternal ? "_blank" : undefined}
@@ -38,6 +39,7 @@ export function ArticleIconCard(props: {
   const isExternal = props.href.startsWith("http");
   return (
     <Link
+      data-noindex
       href={props.href}
       className={cn(
         "flex items-center gap-4 rounded-lg border bg-card p-4 transition-colors hover:border-active-border",
diff --git a/apps/portal/src/components/Document/Code.tsx b/apps/portal/src/components/Document/Code.tsx
diff --git a/apps/portal/src/components/Layouts/DocLayout.tsx b/apps/portal/src/components/Layouts/DocLayout.tsx
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml

Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {`
`15`	`15`	`</div>`
`16`	`16`	`}`
`17`	`17`	`>`
`18`		`- <div data-noindex>{props.children}</div>`
	`18`	`+ <div>{props.children}</div>`
`19`	`19`	`</DocLayout>`
`20`	`20`	`);`
`21`	`21`	`}`