Skip to content

Commit 41d680d

Browse files
committed
[Docs site] Add util to generate descriptions and port into mdx files
1 parent ac3c367 commit 41d680d

File tree

3 files changed

+415
-0
lines changed

3 files changed

+415
-0
lines changed

bin/generate-descriptions.ts

Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
#!/usr/bin/env tsx
2+
3+
/**
 * This script generates descriptions for MDX files in the docs directory
 * that don't have a description field in their frontmatter.
 *
 * It uses the rendered HTML from the dist directory to generate descriptions
 * using the generateDescription function from src/util/props.ts, so the site
 * must be built before this script is run.
 *
 * Usage:
 *   npm run generate-descriptions [-- --pcx-content-type <type>]
 *
 * Options:
 *   --pcx-content-type <type>  Filter MDX files by pcx_content_type (e.g., overview, tutorial, navigation)
 *   --help, -h                 Show the usage message and exit
 */
16+
17+
import fs from "fs/promises";
18+
import path from "path";
19+
import globby from "fast-glob";
20+
import { parse as parseHTML } from "node-html-parser";
21+
import { generateDescription } from "../src/util/props";
22+
import matter from "gray-matter";
23+
24+
/** Directory containing the MDX source files, resolved against the current working directory. */
const DOCS_DIR = path.join(process.cwd(), "src/content/docs");

/** Directory containing the rendered site output, resolved against the current working directory. */
const DIST_DIR = path.join(process.cwd(), "dist");

/** Maximum length for descriptions, in characters. */
const MAX_DESCRIPTION_LENGTH = 160;
29+
30+
/**
 * Extracts the text of the first non-empty paragraph from HTML content.
 *
 * Paragraphs whose text is empty after trimming are skipped, so a leading
 * empty `<p>` does not hide the text of a later paragraph.
 *
 * @param html - HTML markup to search.
 * @returns The trimmed text of the first `<p>` element that contains text,
 * or `undefined` when no paragraph has any text.
 */
function extractFirstParagraph(html: string): string | undefined {
  const dom = parseHTML(html);

  for (const paragraph of dom.querySelectorAll("p")) {
    const text = paragraph.textContent.trim();
    if (text) {
      return text;
    }
  }

  return undefined;
}
43+
44+
/**
 * Truncates a description so that the result, including the trailing
 * ellipsis, never exceeds `maxLength` characters.
 *
 * The cut is made at the last space that keeps the text within the limit, so
 * words are not split unless the text has no usable space. Whitespace and
 * trailing `.,;:` punctuation are removed before the ellipsis is appended.
 *
 * @param description - Text to shorten.
 * @param maxLength - Upper bound for the returned length; defaults to
 * `MAX_DESCRIPTION_LENGTH`.
 * @returns `description` unchanged when it already fits, otherwise the
 * shortened text followed by `"..."`.
 */
function truncateDescription(
  description: string,
  maxLength: number = MAX_DESCRIPTION_LENGTH,
): string {
  if (description.length <= maxLength) {
    return description;
  }

  const ellipsis = "...";
  // Reserve room for the ellipsis so the final string respects maxLength.
  const budget = maxLength - ellipsis.length;
  if (budget <= 0) {
    // No room for an ellipsis at all; fall back to a plain hard cut.
    return description.substring(0, Math.max(maxLength, 0));
  }

  // Look one character past the budget: a space exactly at that position
  // means the first `budget` characters already end on a word boundary.
  const lookahead = description.substring(0, budget + 1);
  const lastSpace = lookahead.lastIndexOf(" ");

  const hardCut = description.substring(0, budget);
  const wordCut = lastSpace > 0 ? description.substring(0, lastSpace) : hardCut;

  const tidy = (text: string): string => text.replace(/[\s.,;:]+$/, "");
  const clipped = tidy(wordCut) || tidy(hardCut) || hardCut;

  return clipped + ellipsis;
}
62+
63+
/**
 * Gets the rendered HTML path for a docs file.
 *
 * Examples (relative to the project root):
 * - `src/content/docs/product/path/file.mdx`  -> `dist/product/path/file/index.html`
 * - `src/content/docs/product/path/index.mdx` -> `dist/product/path/index.html`
 *
 * @param docPath - Path of an `.mdx` file inside `DOCS_DIR`.
 * @returns Path of the corresponding `index.html` inside `DIST_DIR`.
 */
function getRenderedPath(docPath: string): string {
  const relativePath = path.relative(DOCS_DIR, docPath);
  const pathWithoutExt = relativePath.replace(/\.mdx$/, "");

  // An `index.mdx` file is the page for its containing directory, so it must
  // not contribute an extra `index/` segment to the output path.
  const routeDir =
    path.basename(pathWithoutExt) === "index"
      ? path.dirname(pathWithoutExt)
      : pathWithoutExt;

  return path.join(DIST_DIR, routeDir, "index.html");
}
72+
73+
/**
 * Updates the frontmatter of an MDX file with a description.
 *
 * The frontmatter text is patched in place instead of being re-serialized
 * with `matter.stringify`, which might change the formatting of other fields
 * (for example dates). Only the `description` key and its indented
 * continuation lines are touched; every other line is kept as it was.
 *
 * @param filePath - Path of the MDX file to update.
 * @param description - Description text to store as a folded (`>-`) scalar.
 * @returns `true` when the file was rewritten, `false` when it was skipped
 * because the description is already identical or no frontmatter block was
 * found.
 */
async function updateFrontmatter(
  filePath: string,
  description: string,
): Promise<boolean> {
  const displayPath = path.relative(process.cwd(), filePath);

  // Read the original file content to preserve exact formatting
  const originalContent = await fs.readFile(filePath, "utf-8");

  // Parse the frontmatter only to compare against the current description
  const { data: frontmatter } = matter(originalContent);

  if (frontmatter.description === description) {
    console.log(`⏭️ Skipped ${displayPath} (description unchanged)`);
    return false;
  }

  // Extract the frontmatter section (between the first two --- markers)
  const frontmatterPattern = /^---\r?\n([\s\S]*?)\r?\n---/;
  const frontmatterMatch = frontmatterPattern.exec(originalContent);
  if (!frontmatterMatch) {
    console.error(`Could not extract frontmatter from ${filePath}`);
    return false;
  }

  // Keep the line-ending style the file already uses.
  const eol = frontmatterMatch[0].includes("\r\n") ? "\r\n" : "\n";
  const frontmatterLines = (frontmatterMatch[1] ?? "").split(/\r?\n/);

  const descriptionLines = [
    "description: >-",
    ...description.split(/\r?\n/).map((line) => ` ${line}`),
  ];

  const keyIndex = frontmatterLines.findIndex((line) =>
    line.startsWith("description:"),
  );

  if (keyIndex !== -1) {
    // Replace the existing key together with all of its continuation lines:
    // indented lines, plus blank lines that are followed by further indented
    // lines. Trailing blank lines and the next key are left untouched.
    let endIndex = keyIndex + 1;
    for (let scan = endIndex; scan < frontmatterLines.length; scan++) {
      const line = frontmatterLines[scan] ?? "";
      if (/^[ \t]+\S/.test(line)) {
        endIndex = scan + 1;
      } else if (line.trim() !== "") {
        break;
      }
    }
    frontmatterLines.splice(keyIndex, endIndex - keyIndex, ...descriptionLines);
  } else {
    // Add the description at the end of the frontmatter, dropping blank
    // lines at either end so no stray empty lines surround the block.
    while (
      frontmatterLines.length > 0 &&
      (frontmatterLines[frontmatterLines.length - 1] ?? "").trim() === ""
    ) {
      frontmatterLines.pop();
    }
    while (
      frontmatterLines.length > 0 &&
      (frontmatterLines[0] ?? "").trim() === ""
    ) {
      frontmatterLines.shift();
    }
    frontmatterLines.push(...descriptionLines);
  }

  const newFrontmatter = frontmatterLines.join(eol);

  // A replacer function is used so that `$` sequences inside the description
  // (e.g. "$&" or "$1") are written literally instead of being expanded.
  const updatedContent = originalContent.replace(
    frontmatterPattern,
    () => `---${eol}${newFrontmatter}${eol}---`,
  );

  // Write updated content back to file
  await fs.writeFile(filePath, updatedContent, "utf-8");

  console.log(`✅ Updated ${displayPath}`);

  return true;
}
136+
137+
/**
 * Parses the command line arguments found in `process.argv`.
 *
 * Recognized arguments:
 * - `--pcx-content-type <type>` or `--pcx-content-type=<type>`
 * - `--help` / `-h`
 *
 * A `--pcx-content-type` flag without a value (missing, empty, or followed by
 * another flag) produces a warning and is ignored; a following flag is still
 * processed as a flag. Unrecognized arguments are ignored.
 *
 * @returns The requested content type filter (or `undefined`) and whether
 * help was requested.
 */
function parseArgs(): { pcxContentType: string | undefined; showHelp: boolean } {
  const contentTypeFlag = "--pcx-content-type";
  const args = process.argv.slice(2);
  let pcxContentType: string | undefined;
  let showHelp = false;

  for (let i = 0; i < args.length; i++) {
    const arg = args[i] ?? "";

    if (arg === "--help" || arg === "-h") {
      showHelp = true;
      continue;
    }

    let value: string | undefined;
    if (arg === contentTypeFlag) {
      const next = args[i + 1];
      if (next !== undefined && !next.startsWith("-")) {
        value = next;
        i++; // Skip the next argument as it's the value
      }
    } else if (arg.startsWith(`${contentTypeFlag}=`)) {
      value = arg.slice(contentTypeFlag.length + 1);
    } else {
      continue;
    }

    if (value) {
      pcxContentType = value;
    } else {
      console.warn(`⚠️ ${contentTypeFlag} requires a value; ignoring it`);
    }
  }

  return { pcxContentType, showHelp };
}
156+
157+
/**
 * Prints the command-line usage message to standard output.
 */
function showUsage(): void {
  console.log(`
Usage: npx tsx bin/generate-descriptions.ts [options]

Options:
  --pcx-content-type <type>  Filter MDX files by pcx_content_type (e.g., overview, tutorial, navigation)
  --help, -h                 Show this help message
`);
}
169+
170+
/**
 * Keeps only the MDX files whose frontmatter `pcx_content_type` equals
 * `pcxContentType`. Files that cannot be read or parsed are logged and
 * left out of the result.
 */
async function filterByContentType(
  mdxFiles: string[],
  pcxContentType: string,
): Promise<string[]> {
  const matching: string[] = [];

  for (const mdxFile of mdxFiles) {
    try {
      const content = await fs.readFile(mdxFile, "utf-8");
      const { data: frontmatter } = matter(content);
      if (frontmatter.pcx_content_type === pcxContentType) {
        matching.push(mdxFile);
      }
    } catch (error) {
      console.error(`Error reading ${mdxFile}:`, error);
    }
  }

  return matching;
}

/**
 * Main function.
 *
 * Finds the MDX files under `DOCS_DIR` (optionally filtered by
 * `pcx_content_type`), and for every file without a description reads the
 * rendered page from `DIST_DIR`, generates a description from its `<main>`
 * content, truncates it and writes it into the file's frontmatter. A summary
 * of the counters is printed at the end. An unexpected failure outside the
 * per-file handling is logged and ends the process with exit code 1.
 */
async function main(): Promise<void> {
  // Parse command line arguments
  const { pcxContentType, showHelp } = parseArgs();

  if (showHelp) {
    showUsage();
    return;
  }

  if (pcxContentType) {
    console.log(`Filtering by pcx_content_type: ${pcxContentType}`);
  }

  try {
    // Find all MDX files in the docs directory
    const mdxFiles = await globby("**/*.mdx", {
      cwd: DOCS_DIR,
      absolute: true,
    });
    console.log(`Found ${mdxFiles.length} MDX files in the docs directory`);

    // Filter files by pcx_content_type if specified
    let filteredMdxFiles = mdxFiles;
    if (pcxContentType) {
      filteredMdxFiles = await filterByContentType(mdxFiles, pcxContentType);
      console.log(
        `Filtered to ${filteredMdxFiles.length} MDX files with pcx_content_type: ${pcxContentType}`,
      );
    }

    let updatedCount = 0;
    let skippedExistingCount = 0;
    let skippedUnchangedCount = 0;
    let errorCount = 0;

    for (const mdxFile of filteredMdxFiles) {
      // Path shown in log messages, relative to where the script was started.
      const displayPath = path.relative(process.cwd(), mdxFile);

      try {
        // Parse frontmatter
        const content = await fs.readFile(mdxFile, "utf-8");
        const { data: frontmatter } = matter(content);

        // Skip if description already exists
        if (frontmatter.description) {
          skippedExistingCount++;
          continue;
        }

        // Get the rendered HTML path
        const renderedPath = getRenderedPath(mdxFile);

        // Check if rendered HTML exists
        try {
          await fs.access(renderedPath);
        } catch {
          console.warn(`⚠️ Rendered HTML not found for ${displayPath}`);
          errorCount++;
          continue;
        }

        // Read rendered HTML
        const html = await fs.readFile(renderedPath, "utf-8");

        // Extract main content from HTML
        const dom = parseHTML(html);
        const mainContent = dom.querySelector("main")?.innerHTML || "";

        if (!mainContent) {
          console.warn(
            `⚠️ No main content found in rendered HTML for ${displayPath}`,
          );
          errorCount++;
          continue;
        }

        // Generate description
        let description = await generateDescription({ html: mainContent });

        // If no description was generated, try extracting the first paragraph
        if (!description) {
          description = extractFirstParagraph(mainContent);
        }

        // Skip if no description could be generated
        if (!description) {
          console.warn(`⚠️ Could not generate description for ${displayPath}`);
          errorCount++;
          continue;
        }

        // Truncate description if needed
        description = truncateDescription(description);

        // Update frontmatter
        const wasUpdated = await updateFrontmatter(mdxFile, description);
        if (wasUpdated) {
          updatedCount++;
        } else {
          skippedUnchangedCount++;
        }
      } catch (error) {
        console.error(`❌ Error processing ${displayPath}:`, error);
        errorCount++;
      }
    }

    console.log("\n--- Summary ---");
    console.log(`Total MDX files: ${mdxFiles.length}`);
    console.log(`Updated: ${updatedCount}`);
    console.log(`Skipped (already had description): ${skippedExistingCount}`);
    console.log(`Skipped (description unchanged): ${skippedUnchangedCount}`);
    console.log(`Errors: ${errorCount}`);
  } catch (error) {
    console.error("Error:", error);
    process.exit(1);
  }
}
303+
304+
// Entry point: start the script as soon as the module is loaded.
main();

0 commit comments

Comments
 (0)