|
| 1 | +import fs from "node:fs/promises"; |
| 2 | +import path, { dirname } from "node:path"; |
| 3 | +import { fileURLToPath } from "node:url"; |
| 4 | +import glob from "fast-glob"; |
| 5 | +import { createIndex } from "pagefind"; |
| 6 | +import rehypeStringify from "rehype-stringify"; |
| 7 | +import { remark } from "remark"; |
| 8 | +import remarkRehype from "remark-rehype"; |
| 9 | + |
// ES modules do not provide `__filename` / `__dirname`, so both are derived
// from this module's own URL.
// Absolute filesystem path of this script.
const __filename = fileURLToPath(import.meta.url);
// Directory containing this script; the `app` and `public` paths used below
// are resolved relative to it.
const __dirname = dirname(__filename);
| 12 | + |
// Regex patterns for cleaning MDX content.
// All of them are applied with `String.prototype.replace(pattern, "")`.

// YAML frontmatter. Deliberately has NO `m` flag: `^` must only match the very
// start of the document, otherwise the text between two `---` horizontal rules
// further down the page would be deleted. Accepts LF and CRLF line endings and
// an empty frontmatter block.
const FRONTMATTER_REGEX =
  /^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/;

// ES import statements at the start of a line:
//   import X from "m" / import X, { a } from "m" / import * as X from "m"
//   import type { T } from "m" / import "side-effect"
// The `{ ... }` clause may span several lines.
const IMPORT_REGEX =
  /^import\s+(?:(?:type\s+)?(?:[\w$]+\s*,?\s*)?(?:\*\s*as\s+[\w$]+|\{[^}]*\})?\s*from\s*)?['"][^'"\n]*['"];?[ \t]*\r?\n?/gm;

// Export statements: removes the whole line the statement starts on. The greedy
// `.*` is required; a lazy `.*?` followed only by optional tokens would match
// just the `export const` prefix and leave the rest of the line behind.
const EXPORT_REGEX =
  /^export\s+(?:const|function|class|default|\{).*\r?\n?/gm;

// Self-closing components, including dotted names such as <Tabs.Tab />.
const JSX_SELF_CLOSING_REGEX = /<[A-Z][\w.]*(?:\s+[^>]*)?\/>/g;

// Components with children. The backreference makes the closing tag match the
// opening tag's name, so nested components are removed as one unit and a stray
// `<Name>` (for example the generic in `Promise<Result>`) does not swallow text
// up to some unrelated closing tag.
const JSX_COMPONENT_REGEX = /<([A-Z]\w*)(?:\s+[^>]*)?>[\s\S]*?<\/\1>/g;

// Same as above, but also accepts dotted component names (<Tabs.Tab>...</Tabs.Tab>).
const JSX_CUSTOM_COMPONENT_REGEX =
  /<([A-Z][\w.]*)(?:\s+[^>]*)?>[\s\S]*?<\/\1>/g;
| 21 | + |
/**
 * Removes MDX-specific syntax from a document, leaving (mostly) plain markdown.
 *
 * The patterns are applied in a fixed order: frontmatter, import statements,
 * export statements (including `export default`), self-closing components,
 * and finally components with children (plain names first, then the variant
 * that also accepts dotted names).
 *
 * @param mdxContent Raw contents of an `.mdx` file.
 * @returns The content with every pattern match replaced by an empty string.
 */
function stripMdxSyntax(mdxContent: string): string {
  const patterns = [
    FRONTMATTER_REGEX,
    IMPORT_REGEX,
    EXPORT_REGEX,
    JSX_SELF_CLOSING_REGEX,
    JSX_COMPONENT_REGEX,
    JSX_CUSTOM_COMPONENT_REGEX,
  ];
  return patterns.reduce(
    (content, pattern) => content.replace(pattern, ""),
    mdxContent
  );
}

/**
 * Converts MDX content to simple HTML by stripping MDX-specific syntax
 * and converting the remaining markdown to HTML. Skips what can't be rendered.
 *
 * @param mdxContent Raw contents of an `.mdx` file.
 * @returns The rendered HTML. If the markdown pipeline throws, a warning is
 *   logged and the stripped (unrendered) text is returned instead, so the page
 *   is still indexed.
 */
async function markdownToHtml(mdxContent: string): Promise<string> {
  // Clean once up front so the success path and the fallback share the result.
  const cleaned = stripMdxSyntax(mdxContent);

  try {
    // Convert markdown to HTML using remark/rehype (same ecosystem as Nextra)
    const result = await remark()
      .use(remarkRehype)
      .use(rehypeStringify)
      .process(cleaned);

    return String(result);
  } catch (error) {
    // If markdown parsing fails, fall back to the cleaned content as plain
    // text. This ensures we still index the content even if HTML conversion
    // fails.
    console.warn(
      `Warning: Failed to convert markdown to HTML, using plain text: ${error}`
    );
    return cleaned;
  }
}
| 71 | + |
// Create the Pagefind index; nothing below can work without it.
const { index } = await createIndex();
if (!index) {
  throw new Error("Failed to create index");
}

console.log("\r\n🔍 BUILDING SEARCH INDEX\r\n");

// Valid languages are the directories in `app/` that do not start with an
// underscore and are not "api".
const appDir = path.join(__dirname, "..", "app");
const entries = await fs.readdir(appDir);
const languageCandidates = await Promise.all(
  entries.map(async (dir: string) => {
    if (dir.startsWith("_") || dir === "api") {
      return null;
    }
    // `stat` (rather than a Dirent check) so symlinked directories count too.
    const stats = await fs.stat(path.join(appDir, dir));
    return stats.isDirectory() ? dir : null;
  })
);
const languages = languageCandidates.filter(
  (dir): dir is string => dir !== null
);

// Number of pages successfully added to the index.
let page_count = 0;

console.log("Building search index for languages: ", languages.join(", "));

for (const language of languages) {
  const searchPath = path.join(appDir, language);

  console.log(`Adding directory: ${searchPath}`);

  // fast-glob always returns forward-slash separated paths relative to `cwd`.
  const mdxFiles = await glob("**/*.mdx", { cwd: searchPath });

  for (const entry of mdxFiles) {
    const filePath = path.join(searchPath, entry);

    // Map `<route>/page.mdx` to `/<language>/<route>`. The language's root
    // `page.mdx` has an empty route and maps to `/<language>`, not
    // `/<language>/page.mdx`.
    const route = entry.replace(/(?:^|\/)page\.mdx$/, "");
    const url = route ? `/${language}/${route}` : `/${language}`;

    const mdxContent = await fs.readFile(filePath, "utf-8");
    const htmlContent = await markdownToHtml(mdxContent);

    const { errors, file } = await index.addHTMLFile({
      url,
      content: `<html lang='${language}'><body>${htmlContent}</body></html>`,
    });

    const fileInfo = file
      ? ` (${file.uniqueWords} words${file.meta?.title ? `, title: ${file.meta.title}` : ""})`
      : "";
    console.log(`Adding page: ${url}${fileInfo}`);

    if (errors.length > 0) {
      console.error(`Error adding page: ${url}`);
      console.error(errors);
      // Keep going with the remaining pages, but do not count this one.
      continue;
    }

    page_count += 1;
  }
}

console.log(`Added ${page_count} pages`);

// Write the index bundle to the public directory, and fail loudly if Pagefind
// reports write errors instead of silently shipping a missing or partial index.
const { errors: writeErrors } = await index.writeFiles({
  outputPath: path.join(__dirname, "..", "public", "_pagefind"),
});
if (writeErrors.length > 0) {
  console.error(writeErrors);
  throw new Error("Failed to write search index files");
}
0 commit comments