Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/portal/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"dev": "next dev",
"prebuild": "pnpm run create-index",
"build": "next build",
"postbuild": "pnpm run extract-search-data && pnpm next-sitemap",
"postbuild": "pnpm run extract-search-data && pnpm run extract-llm-content && pnpm next-sitemap",
"extract-llm-content": "pnpm tsx scripts/extractLLMData.ts",
"start": "next start",
"lint": "biome check ./src && knip && eslint ./src",
"fix": "biome check ./src --fix && eslint ./src --fix",
Expand Down
11 changes: 11 additions & 0 deletions apps/portal/scripts/extractLLMData.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { writeFileSync } from "node:fs";
import { extractContentForLLM } from "../src/app/api/search/extraction/llm-extract";

/**
 * Runs `extractContentForLLM` against the current working directory and
 * writes the two resulting strings to `./public/llms.txt` and
 * `./public/llms-full.txt` (paths are relative to the working directory).
 */
async function main(): Promise<void> {
  const rootDir = process.cwd();
  const { llmContent, llmFullContent } = await extractContentForLLM(rootDir);
  writeFileSync("./public/llms.txt", llmContent);
  writeFileSync("./public/llms-full.txt", llmFullContent);
}

// Handle rejection explicitly: report the failure with context and make the
// script exit non-zero so the `postbuild` chain stops instead of continuing
// with missing or stale output files.
main().catch((error: unknown) => {
  console.error("Failed to extract LLM content:", error);
  process.exitCode = 1;
});
5 changes: 1 addition & 4 deletions apps/portal/scripts/extractSearchData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@ import { extractContent } from "../src/app/api/search/extraction";

async function main() {
const rootDir = process.cwd();
const { searchData, llmContent, llmFullContent } =
await extractContent(rootDir);
const { searchData } = await extractContent(rootDir);
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
writeFileSync("./public/llms.txt", llmContent);
writeFileSync("./public/llms-full.txt", llmFullContent);
}

main();
148 changes: 0 additions & 148 deletions apps/portal/src/app/api/search/extraction/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import { readFile } from "node:fs/promises";
import he from "he";
import { NodeHtmlMarkdown } from "node-html-markdown";
import {
CommentNode as X_CommentNode,
HTMLElement as X_HTMLElement,
Expand All @@ -15,33 +13,15 @@ import { trimExtraSpace } from "./trimExtraSpace";

type ExtractedContent = {
searchData: PageData[];
llmContent: string;
llmFullContent: string;
};

const llmsContentHeader = `\
# thirdweb

> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.

## Docs
`;

const llmsFullContentHeader = `\
# thirdweb

> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
`;

export async function extractContent(
rootDir: string,
): Promise<ExtractedContent> {
const nextOutputDir = `${rootDir}/.next/server/app`;
const htmlFiles = getFilesRecursive(nextOutputDir, "html");

const pages: PageData[] = [];
let llmContent = "";
let llmFullContent = "";

const noMainFound: string[] = [];
const noH1Found: string[] = [];
Expand Down Expand Up @@ -85,16 +65,6 @@ export async function extractContent(
if (pageData) {
pages.push(pageData);
}

// Extract LLM content
const { links, full } = extractPageLLMContent(
mainEl,
pageTitle,
filePath,
nextOutputDir,
);
llmContent += links ? `${links}\n` : "";
llmFullContent += full ? `${full}\n` : "";
}),
);

Expand All @@ -118,8 +88,6 @@ export async function extractContent(

return {
searchData: pages,
llmContent: `${llmsContentHeader}\n${llmContent}`,
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
};
}

Expand All @@ -140,122 +108,6 @@ function extractPageSearchData(
};
}

/**
 * Builds the two LLM-facing representations of a single page.
 *
 * NOTE: this function mutates `main` in place (elements are removed, heading
 * tags are renamed, link/image URLs are rewritten and `<pre>` blocks are
 * replaced), so callers must read anything else they need from `main` first.
 *
 * @param main - The page's main element.
 * @param pageTitle - Title used for the link text and the fallback description.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Directory prefix removed from `filePath` to form the page URL.
 * @returns `links`: a single markdown bullet `* [title](url): description`;
 *   `full`: the cleaned-up `main` converted to markdown. Both are empty strings
 *   when `main` carries `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
    ignore: ["button"],
    maxConsecutiveNewlines: 2,
  });

  // Strip only the trailing ".html" extension; a plain string replace would
  // remove the first ".html" found anywhere in the path.
  const pageUrl = filePath.replace(nextOutputDir, "").replace(/\.html$/, "");

  // Use the first non-empty paragraph as the page description.
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    // Skip paragraphs inside noindex / no-llm regions.
    if (p.closest("[data-noindex]") || p.closest("[data-no-llm]")) {
      continue;
    }

    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) {
      break;
    }
  }

  const linksContent = `* [${pageTitle}](${pageUrl}): ${description || `Reference for ${pageTitle}`}`;

  // Remove noindex and no-llm elements before converting the full page.
  for (const element of main.querySelectorAll("*")) {
    if (
      element.getAttribute("data-noindex") === "true" ||
      element.getAttribute("data-no-llm") === "true"
    ) {
      element.remove();
    }
  }

  // Shift every heading one level down (h1 -> h2, h2 -> h3, ...), capped at h6.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const headingLevel = Number.parseInt(heading.tagName.replace("H", ""), 10);
    const newLevel = Math.min(headingLevel + 1, 6);
    heading.tagName = `H${newLevel}`;
  }

  // Prefix root-relative links with `https://portal.thirdweb.com`.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // Prefix root-relative image sources with `https://portal.thirdweb.com`.
  for (const image of main.querySelectorAll("img")) {
    const src = image.getAttribute("src");
    if (src?.startsWith("/")) {
      image.setAttribute("src", `https://portal.thirdweb.com${src}`);
    }
  }

  // For code blocks inside pre tags -> make them direct descendants of the pre
  // tag so they are parsed as blocks by node-html-markdown + add language class.
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Presumably each matched div is one rendered line of code — the texts
    // are joined with newlines to rebuild the snippet.
    const code = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((x) => x.textContent)
      .join("\n")
      .trim();

    // Quote and escape the class value, and omit the attribute entirely when
    // there is no language, instead of emitting a bare `class=`.
    const lang = codeBlock.getAttribute("lang");
    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
    );
  }

  // Convert the cleaned HTML to markdown.
  return {
    links: linksContent,
    full: htmlToMarkdown.translate(main.toString()),
  };
}

function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
const sectionData: PageSectionData[] = [];

Expand Down
Loading
Loading