11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Aggregate result returned by `extractContent`.
 */
interface ExtractedContent {
  /** One search-index entry per page that was collected. */
  searchData: PageData[];
  /** Link-list header followed by the concatenated per-page link lines. */
  llmContent: string;
  /** Full-content header followed by the concatenated per-page markdown. */
  llmFullContent: string;
}
21+
/**
 * Markdown preamble placed in front of the aggregated per-page link list
 * (`llmContent`) returned by `extractContent`. Ends with a newline after the
 * "## Docs" heading.
 */
const llmsContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
  "## Docs",
  "",
].join("\n");
29+
/**
 * Markdown preamble placed in front of the aggregated full-page markdown
 * (`llmFullContent`) returned by `extractContent`. Ends with a single newline
 * after the tagline.
 */
const llmsFullContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
].join("\n");
35+
36+ export async function extractContent (
37+ rootDir : string ,
38+ ) : Promise < ExtractedContent > {
1539 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1640 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1741
1842 const pages : PageData [ ] = [ ] ;
43+ let llmContent = "" ;
44+ let llmFullContent = "" ;
1945
2046 const noMainFound : string [ ] = [ ] ;
2147 const noH1Found : string [ ] = [ ] ;
@@ -25,8 +51,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2551 const htmlContent = await readFile ( filePath , "utf-8" ) ;
2652 const mainEl = parse ( htmlContent , {
2753 comment : false ,
54+ // fixNestedATags: true,
2855 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
56+ pre : true ,
3057 } ,
3158 } ) . querySelector ( "main" ) ;
3259
@@ -38,24 +65,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3865 }
3966
4067 const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
68+ if ( noIndex === "true" ) {
4369 return ;
4470 }
4571
4672 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4873 if ( ! pageTitle ) {
4974 noH1Found . push (
5075 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5176 ) ;
5277 }
5378
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
79+ // Important: do the search index collection first - we will modify the main element in the next step
80+ // Extract search data
81+ const pageData = extractPageSearchData (
82+ mainEl ,
83+ filePath ,
84+ nextOutputDir ,
85+ pageTitle ,
86+ ) ;
87+ if ( pageData ) {
88+ pages . push ( pageData ) ;
89+ }
90+
91+ // Extract LLM content
92+ const { links, full } = extractPageLLMContent (
93+ mainEl ,
94+ pageTitle ,
95+ filePath ,
96+ nextOutputDir ,
97+ ) ;
98+ llmContent += links ? `${ links } \n` : "" ;
99+ llmFullContent += full ? `${ full } \n` : "" ;
59100 } ) ,
60101 ) ;
61102
@@ -77,13 +118,144 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77118 console . warn ( "\n" ) ;
78119 }
79120
80- return pages ;
121+ return {
122+ searchData : pages ,
123+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
124+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
125+ } ;
126+ }
127+
/**
 * Builds the search-index entry for a single rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Output directory prefix stripped from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page entry, or `null` when `<main>` has `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const excludedFromIndex = main.getAttribute("data-noindex") === "true";
  if (excludedFromIndex) {
    return null;
  }

  // Strip the output-dir prefix and the first ".html" occurrence to get the route.
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  // A page without an <h1> is still indexed, with an empty title.
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
144+
/**
 * Produces the LLM-facing output for one rendered page: a single markdown
 * link line (title, URL, short description) and the page body converted to
 * markdown.
 *
 * NOTE: this function mutates `main` in place (removes excluded elements,
 * retags headings, rewrites links, replaces `<pre>` blocks). Anything that
 * needs the untouched DOM must run before it.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Output directory prefix stripped from `filePath` to form the page URL.
 * @returns `links` (one markdown list item) and `full` (markdown of the page);
 *          both are empty strings when `<main>` is marked `data-noindex="true"`
 *          or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  // An element is excluded from LLM output if EITHER opt-out attribute is set.
  const isExcluded = (el: X_HTMLElement): boolean =>
    el.getAttribute("data-noindex") === "true" ||
    el.getAttribute("data-no-llm") === "true";

  if (isExcluded(main)) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove excluded elements FIRST so that neither the description nor the
  // full content can pick up text from an excluded element or its descendants.
  // querySelectorAll returns a static array, so removing while iterating is safe.
  for (const element of main.querySelectorAll("*")) {
    if (isExcluded(element)) {
      element.remove();
    }
  }

  // Description: the first paragraph that yields non-empty markdown.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) {
      break;
    }
  }

  // Fall back to the URL so a page without an <h1> never renders as "[undefined]".
  const linkTitle = pageTitle ? trimExtraSpace(pageTitle) : pageUrl;
  const linksContent = `* [${linkTitle}](${pageUrl})${description ? `: ${description}` : ""}`;

  // Shift every heading one level down (h1 -> h2, ..., h5 -> h6); h6 stays h6.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const level = Number.parseInt(heading.tagName.slice(1), 10);
    heading.tagName = `H${Math.min(level + 1, 6)}`;
  }

  // Make site-relative links absolute. Protocol-relative URLs ("//host/...")
  // already name a host and must be left alone. The href is kept whole so any
  // "#fragment" is preserved as-is.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // Rebuild each <pre> as <pre><code class="language-x">text</code></pre> so the
  // markdown converter emits a fenced block with a language tag.
  for (const preTag of main.querySelectorAll("pre")) {
    // The inner HTML is re-parsed to obtain element nodes for the <code> subtree.
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // One entry per element matched by the nested-div selector (presumably one
    // per rendered code line — verify against the code-block component markup).
    const lineText = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((line) => line.textContent)
      .join("\n")
      .trim();

    // If the markup has no such containers, keep the raw text rather than
    // replacing the block with an empty one.
    const code = lineText || codeBlock.textContent.trim();

    const lang = codeBlock.getAttribute("lang");
    // Quote and escape the attribute; omit it entirely when no language is known.
    const classAttr = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttr}>${he.encode(code)}</code></pre>`),
    );
  }

  // Convert the cleaned HTML to markdown.
  const fullContent = htmlToMarkdown.translate(main.toString());

  return {
    links: linksContent,
    full: fullContent,
  };
}
82252
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
253+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84254 const sectionData : PageSectionData [ ] = [ ] ;
85255
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
256+ const ignoreTags = new Set (
257+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
258+ ) ;
87259
88260 function collector ( node : X_Node ) {
89261 if ( node instanceof X_CommentNode ) {
@@ -94,9 +266,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94266 return ;
95267 }
96268
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
269+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100270 return ;
101271 }
102272
0 commit comments