11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Everything `extractContent` produces from one pass over the built HTML.
 */
interface ExtractedContent {
  /** Per-page entries collected for the search index. */
  searchData: PageData[];
  /** Link-list document: the link-list header followed by the per-page link lines. */
  llmContent: string;
  /** Full document: the full-content header followed by each page's markdown. */
  llmFullContent: string;
}
21+
/**
 * Preamble placed at the top of the link-list output (`llmContent`).
 * Ends with a newline after the "## Docs" heading.
 */
const llmsContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
  "## Docs",
  "",
].join("\n");
29+
/**
 * Preamble placed at the top of the full-content output (`llmFullContent`).
 * Same title and tagline as the link-list header, without the "## Docs" heading.
 */
const llmsFullContentHeader = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
  "",
].join("\n");
35+
36+ export async function extractContent (
37+ rootDir : string ,
38+ ) : Promise < ExtractedContent > {
1539 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1640 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1741
1842 const pages : PageData [ ] = [ ] ;
43+ let llmContent = "" ;
44+ let llmFullContent = "" ;
1945
2046 const noMainFound : string [ ] = [ ] ;
2147 const noH1Found : string [ ] = [ ] ;
@@ -25,8 +51,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2551 const htmlContent = await readFile ( filePath , "utf-8" ) ;
2652 const mainEl = parse ( htmlContent , {
2753 comment : false ,
54+ // fixNestedATags: true,
2855 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
56+ pre : true ,
3057 } ,
3158 } ) . querySelector ( "main" ) ;
3259
@@ -38,24 +65,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3865 }
3966
4067 const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
68+ if ( noIndex === "true" ) {
4369 return ;
4470 }
4571
4672 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4873 if ( ! pageTitle ) {
4974 noH1Found . push (
5075 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5176 ) ;
5277 }
5378
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
79+ // Important: do the search index collection first - we will modify the main element in the next step
80+ // Extract search data
81+ const pageData = extractPageSearchData (
82+ mainEl ,
83+ filePath ,
84+ nextOutputDir ,
85+ pageTitle ,
86+ ) ;
87+ if ( pageData ) {
88+ pages . push ( pageData ) ;
89+ }
90+
91+ // Extract LLM content
92+ const { links, full } = extractPageLLMContent (
93+ mainEl ,
94+ pageTitle ,
95+ filePath ,
96+ nextOutputDir ,
97+ ) ;
98+ llmContent += links ? `${ links } \n` : "" ;
99+ llmFullContent += full ? `${ full } \n` : "" ;
59100 } ) ,
60101 ) ;
61102
@@ -77,13 +118,136 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77118 console . warn ( "\n" ) ;
78119 }
79120
80- return pages ;
121+ return {
122+ searchData : pages ,
123+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
124+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
125+ } ;
126+ }
127+
/**
 * Builds the search-index entry for one rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Build output directory; stripped from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page entry, or `null` when `<main>` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcluded = main.getAttribute("data-noindex") === "true";
  if (isExcluded) {
    return null;
  }

  // Route path: file path without the output-dir prefix and without ".html".
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
144+
/**
 * Produces the LLM-oriented output for one rendered page: a single markdown
 * link line (title, URL, short description) and the page body as markdown.
 *
 * NOTE: this function mutates `main` in place (removes noindex subtrees,
 * renames headings, rewrites links, replaces `<pre>` blocks). Anything that
 * needs the untouched DOM must run before it.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the HTML file the page was read from.
 * @param nextOutputDir - Build output directory; stripped from `filePath` to form the page URL.
 * @returns `links` (the link line) and `full` (the markdown body); both are
 *   empty strings when `<main>` has `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove noindex subtrees up front so that neither the description nor the
  // full content can pick up text nested inside an excluded container.
  for (const element of main.querySelectorAll("*")) {
    if (element.getAttribute("data-noindex") === "true") {
      element.remove();
    }
  }

  // Description: the first paragraph that yields non-empty markdown.
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) break;
  }

  // Pages without an <h1> fall back to their URL instead of rendering "[undefined]".
  const linkTitle = (pageTitle ? trimExtraSpace(pageTitle) : "") || pageUrl;
  // NOTE(review): pageUrl is site-relative here while in-body links are made
  // absolute below — confirm whether the link list should be absolute too.
  const linksContent = description
    ? `* [${linkTitle}](${pageUrl}): ${description}`
    : `* [${linkTitle}](${pageUrl})`;

  // Shift every heading one level down (h1 > h2, h2 > h3, ...). h6 stays h6:
  // there is no h7, and an unknown tag would not be rendered as a heading.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const headingLevel = Number.parseInt(heading.tagName.replace("H", ""), 10);
    heading.tagName = `H${Math.min(headingLevel + 1, 6)}`;
  }

  // Prefix site-relative links with the portal origin. Protocol-relative
  // ("//host/...") hrefs already name a host and are left untouched.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // Rebuild each <pre> as <pre><code class="language-x">text</code></pre> so
  // node-html-markdown emits it as a fenced block with a language tag.
  for (const preTag of main.querySelectorAll("pre")) {
    // <pre> content is kept as raw text by the parser options used for the
    // page, so its inner HTML has to be parsed again to reach the <code> node.
    const codeBlock = parse(preTag.innerHTML.toString(), {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // One matched <div> per code line; join their text with newlines.
    const code = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((x) => x.textContent)
      .join("\n")
      .trim();

    const lang = codeBlock.getAttribute("lang");
    // Quote and escape the attribute, and omit it entirely when no language is set.
    const classAttribute = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttribute}>${he.encode(code)}</code></pre>`),
    );
  }

  // Convert the cleaned HTML to markdown
  const fullContent = htmlToMarkdown.translate(main.toString());

  return {
    links: linksContent,
    full: fullContent,
  };
}
82244
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
245+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84246 const sectionData : PageSectionData [ ] = [ ] ;
85247
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
248+ const ignoreTags = new Set (
249+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
250+ ) ;
87251
88252 function collector ( node : X_Node ) {
89253 if ( node instanceof X_CommentNode ) {
@@ -94,9 +258,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94258 return ;
95259 }
96260
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
261+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100262 return ;
101263 }
102264
0 commit comments