Skip to content

Commit 3b50219

Browse files
authored
Fix pagefind (#557)
* fix pagefind for nextjs v16 * working * better rendering
1 parent a7314b8 commit 3b50219

File tree

3 files changed

+154
-1
lines changed

3 files changed

+154
-1
lines changed

package.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,11 @@
1010
"lint": "pnpm dlx ultracite check",
1111
"format": "pnpm dlx ultracite fix",
1212
"prepare": "husky install",
13-
"postbuild": "pagefind --site .next/server/app --output-path public/_pagefind && next-sitemap",
13+
"postbuild": "pnpm run custompagefind && next-sitemap",
1414
"translate": "pnpm dlx tsx scripts/i18n-sync/index.ts && pnpm format",
1515
"sync:metas": "pnpm dlx tsx scripts/sync-metas.ts app/en",
1616
"llmstxt": "pnpm dlx tsx scripts/generate-llmstxt.ts",
17+
"custompagefind": "pnpm dlx tsx scripts/pagefind.ts",
1718
"test": "vitest --run",
1819
"test:watch": "vitest --watch"
1920
},
@@ -74,6 +75,9 @@
7475
"pagefind": "1.4.0",
7576
"picocolors": "1.1.1",
7677
"postcss": "8.5.6",
78+
"rehype-stringify": "^10.0.1",
79+
"remark": "^15.0.1",
80+
"remark-rehype": "^11.1.2",
7781
"tailwindcss": "4.1.16",
7882
"typescript": "5.9.3",
7983
"ultracite": "6.1.0",

pnpm-lock.yaml

Lines changed: 18 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/pagefind.ts

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import fs from "node:fs/promises";
2+
import path, { dirname } from "node:path";
3+
import { fileURLToPath } from "node:url";
4+
import glob from "fast-glob";
5+
import { createIndex } from "pagefind";
6+
import rehypeStringify from "rehype-stringify";
7+
import { remark } from "remark";
8+
import remarkRehype from "remark-rehype";
9+
10+
// ES modules do not provide __filename/__dirname, so derive both from this
// module's own URL. Paths used later in the script are resolved against
// __dirname.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
12+
13+
// Regex patterns for cleaning MDX content

// Frontmatter is only meaningful at the very start of a file, so this pattern
// is anchored to the start of the input (no `m` flag). With `m`, a pair of
// `---` horizontal rules further down the page would be treated as
// frontmatter and everything between them would be deleted.
// An optional BOM and CRLF line endings are tolerated.
const FRONTMATTER_REGEX =
  /^\uFEFF?---[ \t]*\r?\n[\s\S]*?\r?\n---[ \t]*(?:\r?\n|$)/;
const IMPORT_REGEX = /^import\s+.*?from\s+['"].*?['"];?\n?/gm;
// Removes the whole line that starts an export statement. The tail must be
// greedy (`.*`): a lazy `.*?` followed only by optional tokens matches the
// empty string, which strips just the `export const` keywords and leaves the
// rest of the statement behind. Continuation lines of a multi-line export are
// not matched by this pattern.
const EXPORT_REGEX =
  /^export\s+(?:const|function|class|default|\{).*(?:\r?\n)?/gm;
const JSX_SELF_CLOSING_REGEX = /<[A-Z]\w*(?:\s+[^>]*)?\/>/g;
const JSX_COMPONENT_REGEX = /<[A-Z]\w*(?:\s+[^>]*)?>[\s\S]*?<\/[A-Z]\w*>/g;
const JSX_CUSTOM_COMPONENT_REGEX =
  /<[A-Z][\w.]*(?:\s+[^>]*)?>[\s\S]*?<\/[A-Z][\w.]*>/g;

/**
 * Strips MDX-only syntax from a document so the remainder can be treated as
 * plain markdown.
 *
 * Removal order: leading frontmatter, `import ... from "..."` lines, export
 * lines, self-closing capitalised JSX tags, capitalised JSX elements together
 * with their children, and finally dotted component names (`<Tabs.Tab>`).
 */
function stripMdxSyntax(mdxContent: string): string {
  return mdxContent
    .replace(FRONTMATTER_REGEX, "")
    .replace(IMPORT_REGEX, "")
    .replace(EXPORT_REGEX, "")
    .replace(JSX_SELF_CLOSING_REGEX, "")
    .replace(JSX_COMPONENT_REGEX, "")
    .replace(JSX_CUSTOM_COMPONENT_REGEX, "");
}

/**
 * Converts MDX content to simple HTML by stripping MDX-specific syntax
 * and converting the remaining markdown to HTML.
 *
 * @param mdxContent - Raw contents of an `.mdx` file.
 * @returns The HTML produced by the remark/rehype pipeline. If the conversion
 *   throws, the cleaned (MDX-stripped) text is returned instead so the page
 *   can still be indexed; a warning is logged in that case.
 */
async function markdownToHtml(mdxContent: string): Promise<string> {
  // Cleaning is pure string work, so do it once up front and reuse the result
  // for both the happy path and the fallback.
  const cleaned = stripMdxSyntax(mdxContent);

  try {
    // Convert markdown to HTML using remark/rehype (same ecosystem as Nextra)
    const result = await remark()
      .use(remarkRehype)
      .use(rehypeStringify)
      .process(cleaned);

    return String(result);
  } catch (error) {
    // If markdown parsing fails, fall back to the cleaned content as plain
    // text so the page is still indexed.
    console.warn(
      `Warning: Failed to convert markdown to HTML, using plain text: ${error}`
    );
    return cleaned;
  }
}
71+
72+
// Matches the trailing `page.mdx` segment of a route file, including the case
// where the file sits directly in the language directory (no leading slash).
const PAGE_FILE_SUFFIX = /(?:^|\/)page\.mdx$/;

/**
 * Builds the site URL for an MDX file found under `app/<language>/`.
 *
 * `guides/intro/page.mdx` becomes `/<language>/guides/intro`, and the language
 * root `page.mdx` becomes `/<language>` rather than `/<language>/page.mdx`.
 * Entries that do not end in `page.mdx` keep their relative path unchanged.
 */
function toPageUrl(language: string, entry: string): string {
  const route = entry.replace(PAGE_FILE_SUFFIX, "");
  return route === "" ? `/${language}` : `/${language}/${route}`;
}

// Create the Pagefind index; nothing below can work without it.
const { index } = await createIndex();
if (!index) {
  throw new Error("Failed to create index");
}

console.log("\r\n🔍 BUILDING SEARCH INDEX\r\n");

// valid languages are those in the app directory that do not start with an underscore and are not "api"
const appDir = path.join(__dirname, "..", "app");
const entries = await fs.readdir(appDir);
const languageCandidates = await Promise.all(
  entries.map(async (dir: string): Promise<string | null> => {
    if (dir.startsWith("_") || dir === "api") {
      return null;
    }
    // Only directories count as languages; plain files in app/ are skipped.
    const stats = await fs.stat(path.join(appDir, dir));
    return stats.isDirectory() ? dir : null;
  })
);
const languages = languageCandidates.filter(
  (dir): dir is string => dir !== null
);

// Number of pages Pagefind accepted without reporting errors.
let pageCount = 0;

console.log("Building search index for languages: ", languages.join(", "));

for (const language of languages) {
  const searchPath = path.join(appDir, language);

  console.log(`Adding directory: ${searchPath}`);

  for (const entry of glob.sync("**/*.mdx", { cwd: searchPath })) {
    const filePath = path.join(searchPath, entry);
    const url = toPageUrl(language, entry);
    const mdxContent = await fs.readFile(filePath, "utf-8");
    const htmlContent = await markdownToHtml(mdxContent);

    // Wrap the fragment in a minimal document that carries the language.
    const { errors, file } = await index.addHTMLFile({
      url,
      content: `<html lang='${language}'><body>${htmlContent}</body></html>`,
    });

    const fileInfo = file
      ? ` (${file.uniqueWords} words${file.meta?.title ? `, title: ${file.meta.title}` : ""})`
      : "";
    console.log(`Adding page: ${url}${fileInfo}`);

    if (errors.length > 0) {
      // Keep going: one bad page should not prevent indexing the rest.
      console.error(`Error adding page: ${url}`);
      console.error(errors);
    } else {
      pageCount += 1;
    }
  }
}

console.log(`Added ${pageCount} pages`);

// Write the index into public/_pagefind. A failed write is treated as fatal,
// in line with the createIndex failure above, because the build would
// otherwise finish without a usable search index.
const { errors: writeErrors } = await index.writeFiles({
  outputPath: path.join(__dirname, "..", "public", "_pagefind"),
});
if (writeErrors.length > 0) {
  console.error(writeErrors);
  throw new Error("Failed to write search index");
}

0 commit comments

Comments
 (0)