11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Result of a single extraction pass over the built HTML pages.
 */
type ExtractedContent = {
  /** One record per indexed page, consumed by the search index. */
  searchData: PageData[];
  /** Link-list document: the header followed by one markdown link line per page. */
  llmContent: string;
  /** Full-text document: the header followed by each page converted to markdown. */
  llmFullContent: string;
};
21+
// Title and tagline lines shared by both generated LLM documents.
const llmsIntroLines = [
  "# thirdweb",
  "",
  "> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.",
];

/** Header of the link-list document: the intro plus the "## Docs" heading the page links are appended under. */
const llmsContentHeader = [...llmsIntroLines, "", "## Docs", ""].join("\n");

/** Header of the full-content document: just the intro, ending with a newline. */
const llmsFullContentHeader = [...llmsIntroLines, ""].join("\n");
35+
36+ export async function extractContent (
37+ rootDir : string ,
38+ ) : Promise < ExtractedContent > {
1539 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1640 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1741
1842 const pages : PageData [ ] = [ ] ;
43+ let llmContent = "" ;
44+ let llmFullContent = "" ;
1945
2046 const noMainFound : string [ ] = [ ] ;
2147 const noH1Found : string [ ] = [ ] ;
@@ -25,8 +51,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2551 const htmlContent = await readFile ( filePath , "utf-8" ) ;
2652 const mainEl = parse ( htmlContent , {
2753 comment : false ,
54+ // fixNestedATags: true,
2855 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
56+ pre : true ,
3057 } ,
3158 } ) . querySelector ( "main" ) ;
3259
@@ -38,24 +65,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3865 }
3966
4067 const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
68+ if ( noIndex === "true" ) {
4369 return ;
4470 }
4571
4672 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4873 if ( ! pageTitle ) {
4974 noH1Found . push (
5075 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5176 ) ;
5277 }
5378
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
79+ // Important: do the search index collection first - we will modify the main element in the next step
80+ // Extract search data
81+ const pageData = extractPageSearchData (
82+ mainEl ,
83+ filePath ,
84+ nextOutputDir ,
85+ pageTitle ,
86+ ) ;
87+ if ( pageData ) {
88+ pages . push ( pageData ) ;
89+ }
90+
91+ // Extract LLM content
92+ const { links, full } = extractPageLLMContent (
93+ mainEl ,
94+ pageTitle ,
95+ filePath ,
96+ nextOutputDir ,
97+ ) ;
98+ llmContent += links ? `${ links } \n` : "" ;
99+ llmFullContent += full ? `${ full } \n` : "" ;
59100 } ) ,
60101 ) ;
61102
@@ -77,13 +118,134 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77118 console . warn ( "\n" ) ;
78119 }
79120
80- return pages ;
121+ return {
122+ searchData : pages ,
123+ llmContent : `${ llmsContentHeader } \n${ llmContent } ` ,
124+ llmFullContent : `${ llmsFullContentHeader } \n${ llmFullContent } ` ,
125+ } ;
126+ }
127+
/**
 * Builds the search-index record for one rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the element was parsed from.
 * @param nextOutputDir - Build output directory; stripped from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page record, or `null` when `<main>` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isExcluded = main.getAttribute("data-noindex") === "true";
  if (isExcluded) {
    return null;
  }

  // Route path: drop the output-dir prefix and the ".html" extension.
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
144+
// Origin prepended to root-relative links so they still resolve outside the docs site.
const llmDocsBaseUrl = "https://portal.thirdweb.com";

/**
 * Builds the LLM-facing output for one rendered page.
 *
 * NOTE: this mutates `main` in place (removes excluded elements, retags
 * headings, rewrites link hrefs, replaces `<pre>` blocks), so anything that
 * needs the untouched DOM must run before this function.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the HTML file the element was parsed from.
 * @param nextOutputDir - Build output directory; stripped from `filePath` to form the page URL.
 * @returns `links`: a single markdown list item linking to the page;
 *   `full`: the page content converted to markdown. Both are empty strings
 *   when `<main>` carries `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({});
  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove excluded subtrees before anything is read from the page, so a
  // paragraph nested inside a noindex container can never become the description.
  for (const element of main.querySelectorAll("*")) {
    if (element.getAttribute("data-noindex") === "true") {
      element.remove();
    }
  }

  // Description = first paragraph that yields non-empty markdown.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) {
      break;
    }
  }

  // Fall back to the URL as link text so a page without an <h1> does not
  // render as "[undefined]"; omit the ": " separator when there is no description.
  const linkText = (pageTitle ? trimExtraSpace(pageTitle) : "") || pageUrl;
  const linksContent = `* [${linkText}](${pageUrl})${description ? `: ${description}` : ""}`;

  // Shift every heading one level down (h1 > h2, h2 > h3, ...). Clamp at h6
  // because there is no h7 element.
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const level = Number.parseInt(heading.tagName.replace("H", ""), 10);
    heading.tagName = `H${Math.min(level + 1, 6)}`;
  }

  // Make root-relative links absolute. Protocol-relative ("//host/...") hrefs
  // already name their host and are left untouched.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href && href.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `${llmDocsBaseUrl}${href}`);
    }
  }

  // Rebuild each code block as a plain <pre><code class="language-x"> pair so
  // node-html-markdown parses it as a block with the language attached.
  for (const preTag of main.querySelectorAll("pre")) {
    // <pre> content is held as raw text, so re-parse it to reach the <code> element.
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // One nested <div> per rendered line; empty ones are dropped.
    const codeLines = codeBlock
      .querySelectorAll("div > div > div")
      .map((line) => line.textContent)
      .filter((line) => line !== "");

    // When the block has no per-line <div> structure, keep its text instead
    // of emitting an empty code block.
    const code =
      codeLines.length > 0 ? codeLines.join("\n") : codeBlock.textContent;

    // Quote and escape the class value; omit the attribute when no language is set.
    const lang = codeBlock.getAttribute("lang");
    const classAttribute = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttribute}>${he.encode(code)}</code></pre>`),
    );
  }

  // Convert the cleaned HTML to markdown.
  return {
    links: linksContent,
    full: htmlToMarkdown.translate(main.toString()),
  };
}
82242
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
243+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84244 const sectionData : PageSectionData [ ] = [ ] ;
85245
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
246+ const ignoreTags = new Set (
247+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
248+ ) ;
87249
88250 function collector ( node : X_Node ) {
89251 if ( node instanceof X_CommentNode ) {
@@ -94,9 +256,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94256 return ;
95257 }
96258
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
259+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100260 return ;
101261 }
102262
0 commit comments