11import { readFile } from "node:fs/promises" ;
2- import he from "he" ;
3- import { NodeHtmlMarkdown } from "node-html-markdown" ;
42import {
53 CommentNode as X_CommentNode ,
64 HTMLElement as X_HTMLElement ,
@@ -15,33 +13,15 @@ import { trimExtraSpace } from "./trimExtraSpace";
1513
1614type ExtractedContent = {
1715 searchData : PageData [ ] ;
18- llmContent : string ;
19- llmFullContent : string ;
2016} ;
2117
22- const llmsContentHeader = `\
23- # thirdweb
24-
25- > Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
26-
27- ## Docs
28- ` ;
29-
30- const llmsFullContentHeader = `\
31- # thirdweb
32-
33- > Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
34- ` ;
35-
3618export async function extractContent (
3719 rootDir : string ,
3820) : Promise < ExtractedContent > {
3921 const nextOutputDir = `${ rootDir } /.next/server/app` ;
4022 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
4123
4224 const pages : PageData [ ] = [ ] ;
43- let llmContent = "" ;
44- let llmFullContent = "" ;
4525
4626 const noMainFound : string [ ] = [ ] ;
4727 const noH1Found : string [ ] = [ ] ;
@@ -85,16 +65,6 @@ export async function extractContent(
8565 if ( pageData ) {
8666 pages . push ( pageData ) ;
8767 }
88-
89- // Extract LLM content
90- const { links, full } = extractPageLLMContent (
91- mainEl ,
92- pageTitle ,
93- filePath ,
94- nextOutputDir ,
95- ) ;
96- llmContent += links ? `${ links } \n` : "" ;
97- llmFullContent += full ? `${ full } \n` : "" ;
9868 } ) ,
9969 ) ;
10070
@@ -118,8 +88,6 @@ export async function extractContent(
11888
11989 return {
12090 searchData : pages ,
121- llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
122- llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
12391 } ;
12492}
12593
@@ -140,122 +108,6 @@ function extractPageSearchData(
140108 } ;
141109}
142110
143- function extractPageLLMContent (
144- main : X_HTMLElement ,
145- pageTitle : string | undefined ,
146- filePath : string ,
147- nextOutputDir : string ,
148- ) : { links : string ; full : string } {
149- if (
150- main . getAttribute ( "data-noindex" ) === "true" ||
151- main . getAttribute ( "data-no-llm" ) === "true"
152- ) {
153- return { links : "" , full : "" } ;
154- }
155-
156- const htmlToMarkdown = new NodeHtmlMarkdown ( {
157- keepDataImages : false ,
158- ignore : [ "button" ] ,
159- maxConsecutiveNewlines : 2 ,
160- } ) ;
161-
162- let linksContent = "" ;
163- let fullContent = "" ;
164-
165- const pageUrl = filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ;
166-
167- // Get first non-empty paragraph for description
168- const paragraphs = main . querySelectorAll ( "p" ) ;
169- let description = "" ;
170- for ( const p of paragraphs ) {
171- // skip noindex or no-llm paragraphs
172- if ( p . closest ( "[data-noindex]" ) || p . closest ( "[data-no-llm]" ) ) {
173- continue ;
174- }
175-
176- description = trimExtraSpace ( htmlToMarkdown . translate ( p . toString ( ) ) ) ;
177- if ( description ) {
178- break ;
179- }
180- }
181-
182- linksContent += `* [${ pageTitle } ](${ pageUrl } ): ${ description || `Reference for ${ pageTitle } ` } ` ;
183-
184- // Remove noindex and no-llm elements
185- const contentElements = main . querySelectorAll ( "*" ) ;
186- for ( const element of contentElements ) {
187- if (
188- element . getAttribute ( "data-noindex" ) === "true" ||
189- element . getAttribute ( "data-no-llm" ) === "true"
190- ) {
191- element . remove ( ) ;
192- }
193- }
194-
195- // Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.)
196- const headings = main . querySelectorAll ( "h1, h2, h3, h4, h5, h6" ) ;
197- for ( const heading of headings ) {
198- const headingLevel = Number . parseInt ( heading . tagName . replace ( "H" , "" ) ) ;
199- const newLevel = Math . min ( headingLevel + 1 , 6 ) ;
200- heading . tagName = `H${ newLevel } ` ;
201- }
202-
203- // prefix all the relative links with the `https://portal.thirdweb.com`
204- const links = main . querySelectorAll ( "a" ) ;
205- for ( const link of links ) {
206- const href = link . getAttribute ( "href" ) ;
207- if ( href ?. startsWith ( "/" ) ) {
208- link . setAttribute ( "href" , `https://portal.thirdweb.com${ href } ` ) ;
209- }
210- }
211-
212- // prefix all relative image links with the `https://portal.thirdweb.com`
213- const images = main . querySelectorAll ( "img" ) ;
214- for ( const image of images ) {
215- const src = image . getAttribute ( "src" ) ;
216- if ( src ?. startsWith ( "/" ) ) {
217- image . setAttribute ( "src" , `https://portal.thirdweb.com${ src } ` ) ;
218- }
219- }
220-
221- // for code blocks inside pre tags -> make them direct descendants of the pre tag
222- // so they are parsed as blocks by node-html-markdown + add language class
223- const preTags = main . querySelectorAll ( "pre" ) ;
224- for ( const preTag of preTags ) {
225- const codeBlock = parse ( preTag . innerHTML . toString ( ) , {
226- comment : false ,
227- blockTextElements : {
228- pre : true ,
229- } ,
230- } ) . querySelector ( "code" ) ;
231-
232- if ( codeBlock ) {
233- const code = codeBlock
234- . querySelectorAll ( "div > div > div > div" )
235- . map ( ( x ) => x . textContent )
236- . join ( "\n" )
237- . trim ( ) ;
238-
239- const lang = codeBlock . getAttribute ( "lang" ) ;
240- codeBlock . textContent = code ;
241-
242- const newCodePreBlock = parse (
243- `<pre><code class=${ lang ? `language-${ lang } ` : "" } >${ he . encode ( code ) } </code></pre>` ,
244- ) ;
245-
246- preTag . replaceWith ( newCodePreBlock ) ;
247- }
248- }
249-
250- // Convert the cleaned HTML to markdown
251- fullContent += `${ htmlToMarkdown . translate ( main . toString ( ) ) } ` ;
252-
253- return {
254- links : linksContent ,
255- full : fullContent ,
256- } ;
257- }
258-
259111function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
260112 const sectionData : PageSectionData [ ] = [ ] ;
261113
0 commit comments