Skip to content

Commit 4536db8

Browse files
committed
parallelize
1 parent 693c4f4 commit 4536db8

File tree

1 file changed

+99
-39
lines changed

1 file changed

+99
-39
lines changed

scripts/generate-md-exports.mjs

Lines changed: 99 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,101 @@
11
#!/usr/bin/env node
22

3+
import {fileURLToPath} from 'url';
4+
35
import {selectAll} from 'hast-util-select';
46
import {existsSync} from 'node:fs';
57
import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
8+
import {cpus} from 'node:os';
69
import * as path from 'node:path';
10+
import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads';
711
import rehypeParse from 'rehype-parse';
812
import rehypeRemark from 'rehype-remark';
913
import remarkGfm from 'remark-gfm';
1014
import remarkStringify from 'remark-stringify';
1115
import {unified} from 'unified';
1216
import {remove} from 'unist-util-remove';
1317

14-
let root = process.cwd();
15-
while (!existsSync(path.join(root, 'package.json'))) {
16-
const parent = path.dirname(root);
17-
if (parent === root) {
18-
throw new Error('Could not find package.json in parent directories');
18+
/**
 * Main-thread entry point: discovers every built `.html` page, distributes the
 * conversions round-robin over a pool of worker threads (plus the main thread
 * itself), and waits for all of them to finish.
 *
 * The output directory is wiped and recreated on every run.
 *
 * @returns {Promise<void>} Resolves once every file has been converted.
 * @throws {Error} If no `package.json` is found in any parent directory, if a
 *   worker crashes or exits with a non-zero code, or if any conversion fails
 *   (on a worker thread or on the main thread).
 */
async function createWork() {
  // Walk up from the cwd until the project root (the dir with package.json).
  let root = process.cwd();
  while (!existsSync(path.join(root, 'package.json'))) {
    const parent = path.dirname(root);
    if (parent === root) {
      throw new Error('Could not find package.json in parent directories');
    }
    root = parent;
  }
  const INPUT_DIR = path.join(root, '.next', 'server', 'app');
  const OUTPUT_DIR = path.join(root, 'public', 'md-exports');

  console.log(`🚀 Starting markdown generation from: ${INPUT_DIR}`);
  console.log(`📁 Output directory: ${OUTPUT_DIR}`);

  // Clear output directory
  await rm(OUTPUT_DIR, {recursive: true, force: true});
  await mkdir(OUTPUT_DIR, {recursive: true});

  // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
  const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
  const workerTasks = Array.from({length: numWorkers}, () => []);

  console.log(`🔎 Discovering files to convert...`);

  let numFiles = 0;
  let workerIdx = 0;
  // Many pages share a directory; remember which ones already exist so we
  // issue one mkdir per directory instead of one per file.
  const createdDirs = new Set();
  // Need a high buffer size here otherwise Node skips some subdirectories!
  // See https://github.com/nodejs/node/issues/48820
  const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
  for await (const dirent of dir) {
    if (dirent.name.endsWith('.html') && dirent.isFile()) {
      // `parentPath` is the current name; `path` is the fallback for older Node.
      const parentDir = dirent.parentPath ?? dirent.path;
      const sourcePath = path.join(parentDir, dirent.name);
      const targetDir = path.join(OUTPUT_DIR, path.relative(INPUT_DIR, parentDir));
      if (!createdDirs.has(targetDir)) {
        await mkdir(targetDir, {recursive: true});
        createdDirs.add(targetDir);
      }
      // Swap the trailing ".html" (5 chars) for ".md".
      const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
      workerTasks[workerIdx].push({sourcePath, targetPath});
      workerIdx = (workerIdx + 1) % numWorkers;
      numFiles++;
    }
  }

  console.log(`📄 Converting ${numFiles} files with ${numWorkers} workers...`);

  // Workers re-run this same module; the isMainThread check at the bottom of
  // the file routes them to the worker code path.
  const selfPath = fileURLToPath(import.meta.url);
  // The last task list is reserved for the main thread, hence numWorkers - 1.
  const workerPromises = Array.from({length: numWorkers - 1}, (_, idx) => {
    return new Promise((resolve, reject) => {
      const worker = new Worker(selfPath, {workerData: workerTasks[idx]});
      let hasErrors = false;
      worker.on('message', data => {
        if (data.failedTasks.length === 0) {
          console.log(`✅ Worker[${idx}]: ${data.success} files successfully.`);
        } else {
          hasErrors = true;
          console.error(`❌ Worker[${idx}]: ${data.failedTasks.length} files failed:`);
          console.error(data.failedTasks);
        }
      });
      worker.on('error', reject);
      worker.on('exit', code => {
        if (code !== 0) {
          reject(new Error(`Worker[${idx}] stopped with exit code ${code}`));
        } else if (hasErrors) {
          reject(new Error(`Worker[${idx}] had some errors.`));
        } else {
          resolve();
        }
      });
    });
  });

  // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
  // Its result must be checked the same way as a worker's message, otherwise
  // failures in the main thread's share of the work would go unnoticed.
  const mainThreadWork = async () => {
    const data = await processTaskList(workerTasks[workerTasks.length - 1]);
    if (data.failedTasks.length > 0) {
      console.error(`❌ Main thread: ${data.failedTasks.length} files failed:`);
      console.error(data.failedTasks);
      throw new Error('Main thread had some errors.');
    }
    console.log(`✅ Main thread: ${data.success} files successfully.`);
  };
  workerPromises.push(mainThreadWork());

  await Promise.all(workerPromises);

  console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
  console.log('✅ Markdown export generation complete!');
}
22-
const INPUT_DIR = path.join(root, '.next', 'server', 'app');
23-
const OUTPUT_DIR = path.join(root, 'public', 'md-exports');
2497

25-
export const genMDFromHTML = async (source, target) => {
98+
async function genMDFromHTML(source, target) {
2699
const text = await readFile(source, {encoding: 'utf8'});
27100
await writeFile(
28101
target,
@@ -52,39 +125,26 @@ export const genMDFromHTML = async (source, target) => {
52125
.process(text)
53126
)
54127
);
55-
};
56-
57-
async function main() {
58-
console.log(`🚀 Starting markdown generation from: ${INPUT_DIR}`);
59-
console.log(`📁 Output directory: ${OUTPUT_DIR}`);
128+
}
60129

61-
// Clear output directory
62-
await rm(OUTPUT_DIR, {recursive: true, force: true});
63-
await mkdir(OUTPUT_DIR, {recursive: true});
64-
let counter = 0;
65-
try {
66-
// Need a high buffer size here otherwise Node skips some subdirectories!
67-
// See https://github.com/nodejs/node/issues/48820
68-
const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
69-
for await (const dirent of dir) {
70-
if (dirent.name.endsWith('.html') && dirent.isFile()) {
71-
const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name);
72-
const targetDir = path.join(
73-
OUTPUT_DIR,
74-
path.relative(INPUT_DIR, dirent.parentPath || dirent.path)
75-
);
76-
await mkdir(targetDir, {recursive: true});
77-
const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
78-
await genMDFromHTML(sourcePath, targetPath);
79-
counter++;
80-
}
130+
/**
 * Converts a list of HTML files to markdown one after another, collecting
 * failures instead of aborting on the first one.
 *
 * @param {Array<{sourcePath: string, targetPath: string}>} tasks
 *   Source/target file pairs to convert.
 * @returns {Promise<{success: number, failedTasks: Array<{sourcePath: string, targetPath: string, error: unknown}>}>}
 *   The count of successful conversions and a record for each failed one.
 */
async function processTaskList(tasks) {
  const failedTasks = [];
  let success = 0;
  for (const task of tasks) {
    const {sourcePath, targetPath} = task;
    try {
      await genMDFromHTML(sourcePath, targetPath);
      success += 1;
    } catch (error) {
      // Keep going: one broken page should not stop the rest of the batch.
      failedTasks.push({sourcePath, targetPath, error});
    }
  }
  return {success, failedTasks};
}
85141

86-
console.log(`📄 Generated ${counter} markdown files from HTML.`);
87-
console.log('✅ Markdown export generation complete!');
142+
/**
 * Worker-thread entry point: converts the assigned tasks and posts the
 * resulting summary to the parent thread.
 *
 * @param {Array<{sourcePath: string, targetPath: string}>} tasks
 *   The task list handed to this worker via `workerData`.
 * @returns {Promise<void>}
 */
async function doWork(tasks) {
  const summary = await processTaskList(tasks);
  parentPort.postMessage(summary);
}
89145

90-
main().catch(console.error);
146+
// This module serves as both the coordinator and the worker script: the main
// thread plans and distributes the work, while worker threads convert the
// task list they were given through `workerData`.
await (isMainThread ? createWork() : doWork(workerData));

0 commit comments

Comments
 (0)