11import { readFile } from "node:fs/promises" ;
2+ import he from "he" ;
3+ import { NodeHtmlMarkdown } from "node-html-markdown" ;
24import {
35 CommentNode as X_CommentNode ,
46 HTMLElement as X_HTMLElement ,
@@ -11,11 +13,21 @@ import { getFilesRecursive } from "./getFilesRecursive";
1113import { ignoreHeadings } from "./settings" ;
1214import { trimExtraSpace } from "./trimExtraSpace" ;
1315
14- export async function extractSearchData ( rootDir : string ) : Promise < PageData [ ] > {
/**
 * Everything `extractContent` produces from the rendered HTML pages.
 */
interface ExtractedContent {
  /** One search-index entry per page that was not skipped. */
  searchData: PageData[];
  /** Newline-separated per-page link lines produced by the LLM extraction step. */
  llmContent: string;
  /** Newline-separated per-page markdown produced by the LLM extraction step. */
  llmFullContent: string;
}
21+
22+ export async function extractContent (
23+ rootDir : string ,
24+ ) : Promise < ExtractedContent > {
1525 const nextOutputDir = `${ rootDir } /.next/server/app` ;
1626 const htmlFiles = getFilesRecursive ( nextOutputDir , "html" ) ;
1727
1828 const pages : PageData [ ] = [ ] ;
29+ let llmContent = "" ;
30+ let llmFullContent = "" ;
1931
2032 const noMainFound : string [ ] = [ ] ;
2133 const noH1Found : string [ ] = [ ] ;
@@ -25,8 +37,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
2537 const htmlContent = await readFile ( filePath , "utf-8" ) ;
2638 const mainEl = parse ( htmlContent , {
2739 comment : false ,
40+ // fixNestedATags: true,
2841 blockTextElements : {
29- pre : false , // parse text inside <pre> elements instead of treating it as text
42+ pre : true ,
3043 } ,
3144 } ) . querySelector ( "main" ) ;
3245
@@ -38,24 +51,37 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
3851 }
3952
4053 const noIndex = mainEl . getAttribute ( "data-noindex" ) ;
41-
42- if ( noIndex ) {
54+ if ( noIndex === "true" ) {
4355 return ;
4456 }
4557
4658 const pageTitle = mainEl . querySelector ( "h1" ) ?. text ;
47-
4859 if ( ! pageTitle ) {
4960 noH1Found . push (
5061 filePath . split ( ".next/server/app" ) [ 1 ] ?. replace ( ".html" , "" ) || "" ,
5162 ) ;
5263 }
5364
54- pages . push ( {
55- href : filePath . replace ( nextOutputDir , "" ) . replace ( ".html" , "" ) ,
56- title : pageTitle ? trimExtraSpace ( pageTitle ) : "" ,
57- sections : getPageSections ( mainEl ) ,
58- } ) ;
65+ // Extract search data
66+ const pageData = extractPageSearchData (
67+ mainEl ,
68+ filePath ,
69+ nextOutputDir ,
70+ pageTitle ,
71+ ) ;
72+ if ( pageData ) {
73+ pages . push ( pageData ) ;
74+ }
75+
76+ // Extract LLM content
77+ const { links, full } = extractPageLLMContent (
78+ mainEl ,
79+ pageTitle ,
80+ filePath ,
81+ nextOutputDir ,
82+ ) ;
83+ llmContent += links ? `${ links } \n` : "" ;
84+ llmFullContent += full ? `${ full } \n` : "" ;
5985 } ) ,
6086 ) ;
6187
@@ -77,13 +103,127 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
77103 console . warn ( "\n" ) ;
78104 }
79105
80- return pages ;
106+ return {
107+ searchData : pages ,
108+ llmContent,
109+ llmFullContent,
110+ } ;
81111}
82112
83- function getPageSections ( main : X_HTMLElement ) : PageSectionData [ ] {
/**
 * Builds the search-index entry for a single page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the HTML file the element came from.
 * @param nextOutputDir - Directory prefix removed from `filePath` to form the href.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @returns The page entry, or `null` when `<main>` has `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const isIndexable = main.getAttribute("data-noindex") !== "true";
  if (!isIndexable) {
    return null;
  }

  // Strip the output-dir prefix, then the first ".html", to get the route.
  const withoutOutputDir = filePath.replace(nextOutputDir, "");
  const href = withoutOutputDir.replace(".html", "");

  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}
129+
/**
 * Builds the LLM-facing content for a single page.
 *
 * NOTE: `main` is mutated in place (noindex elements are removed, relative
 * hrefs are rewritten, `<pre>` elements are replaced), so anything that needs
 * the untouched tree must read it before calling this function.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, if one was found.
 * @param filePath - Path of the HTML file the element came from.
 * @param nextOutputDir - Directory prefix removed from `filePath` to form the page URL.
 * @returns `links`: one markdown list item `* [title](url): description`;
 *   `full`: the markdown translation of the cleaned `main`. Both are empty
 *   strings when `main` has `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({});
  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Remove noindex elements BEFORE anything reads the tree, so a paragraph
  // nested inside a noindex container can never be picked as the description.
  for (const element of main.querySelectorAll("*")) {
    if (element.getAttribute("data-noindex") === "true") {
      element.remove();
    }
  }

  // Description = first paragraph that still has content after translation.
  let description = "";
  for (const paragraph of main.querySelectorAll("p")) {
    description = trimExtraSpace(htmlToMarkdown.translate(paragraph.toString()));
    if (description) break;
  }

  // A page without an <h1> has no title; fall back to the URL instead of
  // emitting the literal text "undefined". `||` is deliberate: an empty
  // title after trimming should fall back as well. The title goes through
  // the same trimExtraSpace call the search index uses for titles.
  const linkTitle = (pageTitle && trimExtraSpace(pageTitle)) || pageUrl;
  const linksContent = `* [${linkTitle}](${pageUrl}): ${description}`;

  // Prefix site-relative links with `https://portal.thirdweb.com`.
  // Protocol-relative URLs (`//host/path`) also start with "/" but already
  // name their own host, so they are left untouched.
  for (const link of main.querySelectorAll("a")) {
    const href = link.getAttribute("href");
    if (href?.startsWith("/") && !href.startsWith("//")) {
      link.setAttribute("href", `https://portal.thirdweb.com${href}`);
    }
  }

  // For code blocks inside pre tags -> make them direct descendants of the
  // pre tag so they are parsed as blocks by node-html-markdown, and add the
  // language class.
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML, {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) continue;

    // Each matched div contributes one line of code; empty ones are dropped.
    const code = codeBlock
      .querySelectorAll("div > div > div")
      .map((line) => line.textContent)
      .filter((line) => line !== "")
      .join("\n");

    // Only emit a class attribute when a language is present, and quote and
    // escape it so an unusual `lang` value cannot break the generated markup.
    const lang = codeBlock.getAttribute("lang");
    const classAttribute = lang ? ` class="language-${he.encode(lang)}"` : "";

    preTag.replaceWith(
      parse(`<pre><code${classAttribute}>${he.encode(code)}</code></pre>`),
    );
  }

  return {
    links: linksContent,
    // Convert the cleaned HTML to markdown.
    full: htmlToMarkdown.translate(main.toString()),
  };
}
220+
221+ function getPageSectionsForSearchIndex ( main : X_HTMLElement ) : PageSectionData [ ] {
84222 const sectionData : PageSectionData [ ] = [ ] ;
85223
86- const ignoreTags = new Set ( [ "code" , "nav" ] . map ( ( t ) => t . toUpperCase ( ) ) ) ;
224+ const ignoreTags = new Set (
225+ [ "code" , "nav" , "pre" ] . map ( ( t ) => t . toUpperCase ( ) ) ,
226+ ) ;
87227
88228 function collector ( node : X_Node ) {
89229 if ( node instanceof X_CommentNode ) {
@@ -94,9 +234,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
94234 return ;
95235 }
96236
97- const noIndexAttribute = node . getAttribute ( "data-noindex" ) ;
98-
99- if ( noIndexAttribute === "true" ) {
237+ if ( node . getAttribute ( "data-noindex" ) === "true" ) {
100238 return ;
101239 }
102240
0 commit comments