@@ -2,12 +2,20 @@ import path from 'path'
22import fs from 'fs'
33import { SdkReferenceData } from '../src/templates/sdk/SdkReference'
44import { getLanguageFromSdkId } from '../src/components/SdkReferences/utils'
5+ import {
6+ createTurndownService ,
7+ extractTitleFromHtml ,
8+ extractMainContent ,
9+ postProcessMarkdown ,
10+ preprocessHtmlForTabs ,
11+ } from './turndownService'
12+
13+ export const generateRawMarkdownPages = async (
14+ docsNodes : Array < { fields : { slug : string } ; frontmatter : { title : string } } > ,
15+ reporter ?: any
16+ ) => {
17+ const publicPath = path . resolve ( __dirname , '../public' )
518
6- // Function to generate raw markdown files
7- export const generateRawMarkdownPages = async ( pages ) => {
8- console . log ( 'Generating markdown files for LLMs...' )
9-
10- // Filter out any pages with certain slugs
1119 const excludeTerms = [
1220 '/_snippets' ,
1321 '/snippets/' ,
@@ -23,53 +31,63 @@ export const generateRawMarkdownPages = async (pages) => {
2331 '/startups' ,
2432 '/example-components' ,
2533 ]
26- const filteredPages = pages . filter ( ( doc ) => ! excludeTerms . some ( ( term ) => doc . fields . slug . includes ( term ) ) )
2734
28- console . log ( `Found ${ filteredPages . length } docs to generate markdown for (filtered from ${ pages . length } total)` )
35+ const filteredNodes = docsNodes . filter ( ( node ) => {
36+ return ! excludeTerms . some ( ( term ) => node . fields . slug . includes ( term ) )
37+ } )
38+
39+ const processedPages : Array < { slug : string ; title : string } > = [ ]
2940
30- for ( const doc of filteredPages ) {
41+ for ( const node of filteredNodes ) {
3142 try {
32- const { slug, contentWithSnippets } = doc . fields
33- const { title } = doc . frontmatter
34- const body = contentWithSnippets || doc . rawBody
35-
36- // Create the frontmatter, so it always has the page title
37- let markdownContent = `---\ntitle: ${ title } \nslug: ${ slug } \n---\n`
38-
39- // Add the content
40- if ( body ) {
41- // Process internal links to point to .md equivalents
42- let processedBody = body . replace ( / \[ ( [ ^ \] ] + ) \] \( \/ ( [ ^ ) ] + ) \) / g, ( match , text , path ) => {
43- // Only convert if the path doesn't already end with .md
44- if ( ! path . endsWith ( '.md' ) ) {
45- return `[${ text } ](/${ path } .md)`
46- }
47- return match
48- } )
43+ const { slug } = node . fields
44+ const htmlFilePath = path . join ( publicPath , slug , 'index.html' )
4945
50- markdownContent += processedBody
46+ if ( ! fs . existsSync ( htmlFilePath ) ) {
47+ if ( reporter ) {
48+ reporter . warn ( `HTML file not found: ${ htmlFilePath } ` )
49+ }
50+ continue
5151 }
5252
53- // Create the directory structure
54- const publicPath = path . resolve ( __dirname , '../public' )
55- const filePath = path . join ( publicPath , `${ slug } .md` )
56- const dirPath = path . dirname ( filePath )
53+ const html = fs . readFileSync ( htmlFilePath , 'utf8' )
54+ const title = extractTitleFromHtml ( html ) || node . frontmatter . title || 'Untitled'
55+ const mainContent = extractMainContent ( html )
56+ const preprocessedContent = preprocessHtmlForTabs ( mainContent )
57+
58+ const turndownService = createTurndownService ( title )
59+ let markdown = turndownService . turndown ( preprocessedContent )
60+ markdown = postProcessMarkdown ( markdown , title )
61+
62+ const outputPath = path . join ( publicPath , `${ slug } .md` )
63+ const dirPath = path . dirname ( outputPath )
5764
58- // Ensure directory exists
5965 if ( ! fs . existsSync ( dirPath ) ) {
6066 fs . mkdirSync ( dirPath , { recursive : true } )
6167 }
6268
63- // Write the file
64- fs . writeFileSync ( filePath , markdownContent , 'utf8' )
65- console . log ( `Generated: ${ slug } .md` )
69+ fs . writeFileSync ( outputPath , markdown , 'utf8' )
70+ processedPages . push ( { slug, title } )
71+
72+ if ( reporter ) {
73+ reporter . info ( `Generated: ${ slug } .md` )
74+ }
6675 } catch ( error ) {
67- console . error ( `Error generating markdown for ${ doc . fields . slug } :` , error )
76+ const errorMsg = `Error generating markdown for ${ node . fields . slug } : ${ error } `
77+ if ( reporter ) {
78+ reporter . error ( errorMsg )
79+ }
6880 }
6981 }
7082
71- // Return filtered pages for use in generateLlmsTxt
72- return filteredPages
83+ if ( reporter ) {
84+ reporter . info ( `Generated ${ processedPages . length } markdown files` )
85+ }
86+
87+ return processedPages . map ( ( page ) => ( {
88+ fields : { slug : page . slug } ,
89+ frontmatter : { title : page . title } ,
90+ } ) )
7391}
7492
7593// Function to generate individual API endpoint markdown files from the OpenAPI spec
0 commit comments