11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Everything produced by a single pass over the rendered HTML pages.
 */
type ExtractedContent = {
  /** Search-index entries, one per indexed page. */
  searchData: PageData[];
  /** Index document: header followed by one link line per page. */
  llmContent: string;
  /** Full document: header followed by each page converted to markdown. */
  llmFullContent: string;
};
21+
/**
 * Header of the link-index document: title, one-line summary, and the
 * "Docs" section heading under which the per-page link lines are appended.
 */
const llmsContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
  "## Docs",
  "",
].join("\n");

/**
 * Header of the full-content document: title and one-line summary only.
 */
const llmsFullContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
].join("\n");
35+
36+ export async function extractContent (
37+ rootDir : string ,
38+ ) : Promise < ExtractedContent > {
1539 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1640 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1741
1842 const pages : PageData [ ] = [ ] ;
43+ let llmContent = "" ;
44+ let llmFullContent = "" ;
1945
2046 const noMainFound : string [ ] = [ ] ;
2147 const noH1Found : string [ ] = [ ] ;
@@ -26,7 +52,7 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2652 const mainEl = parse ( htmlContent , {
2753 comment : false ,
2854 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
55+ pre : true ,
3056 } ,
3157 } ) . querySelector ( "main" ) ;
3258
@@ -37,25 +63,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3763 return ;
3864 }
3965
40- const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
66+ if ( mainEl . getAttribute ( "data-noindex" ) === "true" ) {
4367 return ;
4468 }
4569
4670 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4871 if ( ! pageTitle ) {
4972 noH1Found . push (
5073 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5174 ) ;
5275 }
5376
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
77+ // Important: do the search index collection first - we will modify the main element in the next step
78+ // Extract search data
79+ const pageData = extractPageSearchData (
80+ mainEl ,
81+ filePath ,
82+ nextOutputDir ,
83+ pageTitle ,
84+ ) ;
85+ if ( pageData ) {
86+ pages . push ( pageData ) ;
87+ }
88+
89+ // Extract LLM content
90+ const { links, full } = extractPageLLMContent (
91+ mainEl ,
92+ pageTitle ,
93+ filePath ,
94+ nextOutputDir ,
95+ ) ;
96+ llmContent += links ? `${ links } \n` : "" ;
97+ llmFullContent += full ? `${ full } \n` : "" ;
5998 } ) ,
6099 ) ;
61100
@@ -77,13 +116,147 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77116 console . warn ( "\n" ) ;
78117 }
79118
80- return pages ;
119+ return {
120+ searchData : pages ,
121+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
122+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
123+ } ;
81124}
82125
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
/**
 * Builds the search-index entry for one rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Directory prefix stripped from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, or `undefined` when none was found.
 * @returns The page entry, or `null` when `<main>` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcluded = main.getAttribute("data-noindex") === "true";
  if (isExcluded) {
    return null;
  }

  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  // Pages without an <h1> are still indexed, just with an empty title.
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
142+
/**
 * Produces the two LLM-facing representations of one rendered page: a single
 * markdown link line for the index document and the page body converted to
 * markdown for the full document.
 *
 * NOTE: this function mutates `main` (removes hidden elements, retags
 * headings, rewrites links and `<pre>` blocks). Anything that needs the
 * untouched DOM must run before it.
 *
 * @param main - The page's `<main>` element; modified in place.
 * @param pageTitle - Text of the page's `<h1>`, or `undefined` when none was found.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Directory prefix stripped from `filePath` to form the page URL.
 * @returns `links` (one markdown list item) and `full` (markdown body). Both
 *   are empty strings when `<main>` is marked `data-noindex="true"` or
 *   `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  const isHiddenFromLLM = (el: X_HTMLElement): boolean =>
    el.getAttribute("data-noindex") === "true" ||
    el.getAttribute("data-no-llm") === "true";

  if (isHiddenFromLLM(main)) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Drop hidden subtrees BEFORE choosing the description, so a paragraph that
  // merely sits inside a noindex / no-llm ancestor can never be picked.
  for (const element of main.querySelectorAll("*")) {
    if (isHiddenFromLLM(element)) {
      element.remove();
    }
  }

  // Description = first remaining paragraph that is non-empty as markdown.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) {
      break;
    }
  }

  // A page without an <h1> must not render as "[undefined]"; fall back to its URL.
  const linkTitle = (pageTitle && trimExtraSpace(pageTitle)) || pageUrl;
  const linksContent = `* [${linkTitle}](${pageUrl})${description ? `: ${description}` : ""}`;

  // Shift every heading one level down (h1 > h2, h2 > h3, ...), capped at h6.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const level = Number.parseInt(heading.tagName.replace("H", ""), 10);
    heading.tagName = `H${Math.min(level + 1, 6)}`;
  }

  // Prefix root-relative links with the portal origin. The href is kept whole
  // so a fragment containing another "#" survives; protocol-relative
  // ("//host/...") links already name a host and are left alone.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // Rebuild each <pre> as a plain <pre><code class="language-x">…</code></pre>
  // so node-html-markdown emits it as a fenced block with a language tag.
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Highlighted blocks hold one nested <div> per source line. If that
    // structure is absent, keep the raw text instead of emitting an empty block.
    const lineTexts = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((lineEl) => lineEl.textContent);
    const code = (
      lineTexts.length > 0 ? lineTexts.join("\n") : codeBlock.textContent
    ).trim();

    // Quote the attribute and omit it entirely when no language is declared.
    const lang = codeBlock.getAttribute("lang");
    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
    );
  }

  return {
    links: linksContent,
    // Convert the cleaned HTML to markdown.
    full: htmlToMarkdown.translate(main.toString()),
  };
}
253+
254+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84255 const sectionData : PageSectionData [ ] = [ ] ;
85256
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
257+ const ignoreTags = new Set (
258+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
259+ ) ;
87260
88261 function collector ( node : X_Node ) {
89262 if ( node instanceof X_CommentNode ) {
@@ -94,9 +267,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94267 return ;
95268 }
96269
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
270+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100271 return ;
101272 }
102273
0 commit comments