11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Everything produced by a single pass over the built HTML pages.
 */
interface ExtractedContent {
  /** Per-page records collected for the search index. */
  searchData: PageData[];
  /** Text of the link-list document (header followed by one entry per page). */
  llmContent: string;
  /** Text of the full-content document (header followed by every page's markdown). */
  llmFullContent: string;
}
21+
// Preamble of the link-list document: title, one-line summary, then the
// "Docs" heading under which the per-page entries are appended.
const llmsContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
  "## Docs",
  "", // keeps the trailing newline of the header
].join("\n");
29+
// Preamble of the full-content document: title and one-line summary only;
// the markdown of every page is appended after it.
const llmsFullContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "", // keeps the trailing newline of the header
].join("\n");
35+
36+ export async function extractContent (
37+ rootDir : string ,
38+ ) : Promise < ExtractedContent > {
1539 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1640 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1741
1842 const pages : PageData [ ] = [ ] ;
43+ let llmContent = "" ;
44+ let llmFullContent = "" ;
1945
2046 const noMainFound : string [ ] = [ ] ;
2147 const noH1Found : string [ ] = [ ] ;
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2652 const mainEl = parse ( htmlContent , {
2753 comment : false ,
2854 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
55+ pre : true ,
3056 } ,
3157 } ) . querySelector ( "main" ) ;
3258
@@ -38,24 +64,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3864 }
3965
4066 const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
67+ if ( noIndex === "true" ) {
4368 return ;
4469 }
4570
4671 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4872 if ( ! pageTitle ) {
4973 noH1Found . push (
5074 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5175 ) ;
5276 }
5377
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
78+ // Important: do the search index collection first - we will modify the main element in the next step
79+ // Extract search data
80+ const pageData = extractPageSearchData (
81+ mainEl ,
82+ filePath ,
83+ nextOutputDir ,
84+ pageTitle ,
85+ ) ;
86+ if ( pageData ) {
87+ pages . push ( pageData ) ;
88+ }
89+
90+ // Extract LLM content
91+ const { links, full } = extractPageLLMContent (
92+ mainEl ,
93+ pageTitle ,
94+ filePath ,
95+ nextOutputDir ,
96+ ) ;
97+ llmContent += links ? `${ links } \n` : "" ;
98+ llmFullContent += full ? `${ full } \n` : "" ;
5999 } ) ,
60100 ) ;
61101
@@ -77,13 +117,142 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77117 console . warn ( "\n" ) ;
78118 }
79119
80- return pages ;
120+ return {
121+ searchData : pages ,
122+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
123+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
124+ } ;
125+ }
126+
/**
 * Builds the search-index record for one page.
 *
 * @param main - the page's `<main>` element
 * @param filePath - path of the HTML file the page was read from
 * @param nextOutputDir - prefix removed from `filePath` to form the page href
 * @param pageTitle - text of the page's `<h1>`, or `undefined` when none was found
 * @returns the page record, or `null` when `<main>` carries `data-noindex="true"`
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const optedOut = main.getAttribute("data-noindex") === "true";
  if (optedOut) {
    return null;
  }

  // href is the file path relative to the output dir, without the ".html" part
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");

  let title = "";
  if (pageTitle) {
    title = trimExtraSpace(pageTitle);
  }

  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
82143
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
/**
 * Produces the two LLM-facing representations of a single page:
 * a one-line markdown link entry and the page body converted to markdown.
 *
 * NOTE: this function mutates `main` (it removes opted-out elements and rewrites
 * headings, links and code blocks). Anything that needs the untouched DOM must
 * run before this is called.
 *
 * @param main - the page's `<main>` element
 * @param pageTitle - text of the page's `<h1>`, or `undefined` when none was found
 * @param filePath - path of the HTML file the page was read from
 * @param nextOutputDir - prefix removed from `filePath` to form the page URL
 * @returns `links` - the `* [title](url): description` entry, and `full` - the
 *   markdown of the cleaned page. Both are empty strings when `<main>` carries
 *   `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  // An element is opted out of LLM output by either attribute.
  const isExcluded = (el: X_HTMLElement): boolean =>
    el.getAttribute("data-noindex") === "true" ||
    el.getAttribute("data-no-llm") === "true";

  if (isExcluded(main)) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Drop opted-out elements first, so that neither the description below nor the
  // full content can pick up text from them (including paragraphs nested inside
  // an opted-out container).
  for (const element of main.querySelectorAll("*")) {
    if (isExcluded(element)) {
      element.remove();
    }
  }

  // Description = first paragraph that still has text after markdown conversion.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) {
      break;
    }
  }

  // Pages without an <h1> would otherwise render as "[undefined]"; fall back to the URL.
  const linkTitle = pageTitle ? trimExtraSpace(pageTitle) : pageUrl;
  const linksContent = description
    ? `* [${linkTitle}](${pageUrl}): ${description}`
    : `* [${linkTitle}](${pageUrl})`;

  // Shift all heading elements to 1 step down (h1 > h2, h2 > h3, etc.), capped at h6
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const headingLevel = Number.parseInt(heading.tagName.slice(1), 10);
    const newLevel = Math.min(headingLevel + 1, 6);
    heading.tagName = `H${newLevel}`;
  }

  // Prefix root-relative links with the portal origin. The href is kept whole
  // (query and fragment included); protocol-relative "//host" links are already
  // absolute and are left untouched.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // for code blocks inside pre tags -> make them direct descendants of the pre tag
  // so they are parsed as blocks by node-html-markdown + add language class
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Each rendered code line is expected at this nesting depth inside <code>.
    const code = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((line) => line.textContent)
      .join("\n")
      .trim();

    const lang = codeBlock.getAttribute("lang");
    // Quote and escape the attribute; omit it entirely when there is no language.
    const classAttribute = lang ? ` class="language-${he.escape(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttribute}>${he.encode(code)}</code></pre>`),
    );
  }

  // Convert the cleaned HTML to markdown
  return {
    links: linksContent,
    full: htmlToMarkdown.translate(main.toString()),
  };
}
249+
250+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84251 const sectionData : PageSectionData [ ] = [ ] ;
85252
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
253+ const ignoreTags = new Set (
254+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
255+ ) ;
87256
88257 function collector ( node : X_Node ) {
89258 if ( node instanceof X_CommentNode ) {
@@ -94,9 +263,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94263 return ;
95264 }
96265
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
266+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100267 return ;
101268 }
102269
0 commit comments