Skip to content

Commit 441a6fe

Browse files
committed
Fix snippet-final-v5-(2).jpg
1 parent 737d8e2 commit 441a6fe

File tree

9 files changed

+642
-301
lines changed

9 files changed

+642
-301
lines changed

gatsby/onCreateNode.ts

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@ import slugify from 'slugify'
66
import { JSDOM } from 'jsdom'
77
import { GatsbyNode } from 'gatsby'
88
import { PAGEVIEW_CACHE_KEY } from './onPreBootstrap'
9-
import { resolveSnippets } from './snippetUtils'
109

1110
require('dotenv').config({
1211
path: `.env.${process.env.NODE_ENV}`,
@@ -271,16 +270,15 @@ export const onCreateNode: GatsbyNode['onCreateNode'] = async ({
271270
}
272271

273272
const contentWithoutFrontmatter = stripFrontmatter(node.rawBody)
274-
const contentWithSnippets = resolveSnippets(contentWithoutFrontmatter, node.fileAbsolutePath)
275273

276274
// Prepend title as H1 if it exists
277275
const title = node.frontmatter?.title
278-
const contentWithSnippetsAndTitle = title ? `# ${title}\n\n${contentWithSnippets}` : contentWithSnippets
276+
const contentWithTitle = title ? `# ${title}\n\n${contentWithoutFrontmatter}` : contentWithoutFrontmatter
279277

280278
createNodeField({
281279
node,
282280
name: `contentWithSnippets`,
283-
value: contentWithSnippetsAndTitle,
281+
value: contentWithTitle,
284282
})
285283
}
286284

gatsby/onPostBuild.ts

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -482,7 +482,7 @@ const createOrUpdateStrapiPosts = async (posts, roadmaps) => {
482482
)
483483
}
484484

485-
export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql }) => {
485+
export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql, reporter }) => {
486486
// Generate API spec markdown files first
487487
try {
488488
const openApiSpecUrl = process.env.POSTHOG_OPEN_API_SPEC_URL || 'https://app.posthog.com/api/schema/'
@@ -555,26 +555,24 @@ export const onPostBuild: GatsbyNode['onPostBuild'] = async ({ graphql }) => {
555555
generateSdkReferencesMarkdown(node)
556556
})
557557

558-
// Generate markdown files for llms.txt file and LLM ingestion (after API spec files exist)
559-
const markdownQuery = await graphql(`
560-
query pagesForMarkdown {
561-
allMdx {
558+
// Generate markdown files for llms.txt file and LLM ingestion (after pages are built)
559+
// Convert HTML files to markdown using turndown
560+
const docsQuery = (await graphql(`
561+
query {
562+
allMdx(filter: { fields: { slug: { regex: "/^/docs/" } } }) {
562563
nodes {
563-
frontmatter {
564-
title
565-
date
566-
}
567-
rawBody
568564
fields {
569565
slug
570-
contentWithSnippets
566+
}
567+
frontmatter {
568+
title
571569
}
572570
}
573571
}
574572
}
575-
`)
573+
`)) as { data: { allMdx: { nodes: Array<{ fields: { slug: string }; frontmatter: { title: string } }> } } }
576574

577-
const filteredPages = await generateRawMarkdownPages(markdownQuery.data.allMdx.nodes)
575+
const filteredPages = await generateRawMarkdownPages(docsQuery.data.allMdx.nodes, reporter)
578576
generateLlmsTxt(filteredPages)
579577

580578
if (process.env.AWS_CODEPIPELINE !== 'true') {

gatsby/rawMarkdownUtils.ts

Lines changed: 55 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,20 @@ import path from 'path'
22
import fs from 'fs'
33
import { SdkReferenceData } from '../src/templates/sdk/SdkReference'
44
import { getLanguageFromSdkId } from '../src/components/SdkReferences/utils'
5+
import {
6+
createTurndownService,
7+
extractTitleFromHtml,
8+
extractMainContent,
9+
postProcessMarkdown,
10+
preprocessHtmlForTabs,
11+
} from './turndownService'
12+
13+
export const generateRawMarkdownPages = async (
14+
docsNodes: Array<{ fields: { slug: string }; frontmatter: { title: string } }>,
15+
reporter?: any
16+
) => {
17+
const publicPath = path.resolve(__dirname, '../public')
518

6-
// Function to generate raw markdown files
7-
export const generateRawMarkdownPages = async (pages) => {
8-
console.log('Generating markdown files for LLMs...')
9-
10-
// Filter out any pages with certain slugs
1119
const excludeTerms = [
1220
'/_snippets',
1321
'/snippets/',
@@ -23,53 +31,63 @@ export const generateRawMarkdownPages = async (pages) => {
2331
'/startups',
2432
'/example-components',
2533
]
26-
const filteredPages = pages.filter((doc) => !excludeTerms.some((term) => doc.fields.slug.includes(term)))
2734

28-
console.log(`Found ${filteredPages.length} docs to generate markdown for (filtered from ${pages.length} total)`)
35+
const filteredNodes = docsNodes.filter((node) => {
36+
return !excludeTerms.some((term) => node.fields.slug.includes(term))
37+
})
38+
39+
const processedPages: Array<{ slug: string; title: string }> = []
2940

30-
for (const doc of filteredPages) {
41+
for (const node of filteredNodes) {
3142
try {
32-
const { slug, contentWithSnippets } = doc.fields
33-
const { title } = doc.frontmatter
34-
const body = contentWithSnippets || doc.rawBody
35-
36-
// Create the frontmatter, so it always has the page title
37-
let markdownContent = `---\ntitle: ${title}\nslug: ${slug}\n---\n`
38-
39-
// Add the content
40-
if (body) {
41-
// Process internal links to point to .md equivalents
42-
let processedBody = body.replace(/\[([^\]]+)\]\(\/([^)]+)\)/g, (match, text, path) => {
43-
// Only convert if the path doesn't already end with .md
44-
if (!path.endsWith('.md')) {
45-
return `[${text}](/${path}.md)`
46-
}
47-
return match
48-
})
43+
const { slug } = node.fields
44+
const htmlFilePath = path.join(publicPath, slug, 'index.html')
4945

50-
markdownContent += processedBody
46+
if (!fs.existsSync(htmlFilePath)) {
47+
if (reporter) {
48+
reporter.warn(`HTML file not found: ${htmlFilePath}`)
49+
}
50+
continue
5151
}
5252

53-
// Create the directory structure
54-
const publicPath = path.resolve(__dirname, '../public')
55-
const filePath = path.join(publicPath, `${slug}.md`)
56-
const dirPath = path.dirname(filePath)
53+
const html = fs.readFileSync(htmlFilePath, 'utf8')
54+
const title = extractTitleFromHtml(html) || node.frontmatter.title || 'Untitled'
55+
const mainContent = extractMainContent(html)
56+
const preprocessedContent = preprocessHtmlForTabs(mainContent)
57+
58+
const turndownService = createTurndownService(title)
59+
let markdown = turndownService.turndown(preprocessedContent)
60+
markdown = postProcessMarkdown(markdown, title)
61+
62+
const outputPath = path.join(publicPath, `${slug}.md`)
63+
const dirPath = path.dirname(outputPath)
5764

58-
// Ensure directory exists
5965
if (!fs.existsSync(dirPath)) {
6066
fs.mkdirSync(dirPath, { recursive: true })
6167
}
6268

63-
// Write the file
64-
fs.writeFileSync(filePath, markdownContent, 'utf8')
65-
console.log(`Generated: ${slug}.md`)
69+
fs.writeFileSync(outputPath, markdown, 'utf8')
70+
processedPages.push({ slug, title })
71+
72+
if (reporter) {
73+
reporter.info(`Generated: ${slug}.md`)
74+
}
6675
} catch (error) {
67-
console.error(`Error generating markdown for ${doc.fields.slug}:`, error)
76+
const errorMsg = `Error generating markdown for ${node.fields.slug}: ${error}`
77+
if (reporter) {
78+
reporter.error(errorMsg)
79+
}
6880
}
6981
}
7082

71-
// Return filtered pages for use in generateLlmsTxt
72-
return filteredPages
83+
if (reporter) {
84+
reporter.info(`Generated ${processedPages.length} markdown files`)
85+
}
86+
87+
return processedPages.map((page) => ({
88+
fields: { slug: page.slug },
89+
frontmatter: { title: page.title },
90+
}))
7391
}
7492

7593
// Function to generate individual API endpoint markdown files from the OpenAPI spec

gatsby/snippetUtils.ts

Lines changed: 0 additions & 66 deletions
This file was deleted.

0 commit comments

Comments
 (0)