|
1 | 1 | #!/usr/bin/env node |
2 | 2 |
|
| 3 | +import {fileURLToPath} from 'url'; |
| 4 | + |
3 | 5 | import {selectAll} from 'hast-util-select'; |
4 | 6 | import {existsSync} from 'node:fs'; |
5 | 7 | import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises'; |
| 8 | +import {cpus} from 'node:os'; |
6 | 9 | import * as path from 'node:path'; |
| 10 | +import {isMainThread, parentPort, Worker, workerData} from 'node:worker_threads'; |
7 | 11 | import rehypeParse from 'rehype-parse'; |
8 | 12 | import rehypeRemark from 'rehype-remark'; |
9 | 13 | import remarkGfm from 'remark-gfm'; |
10 | 14 | import remarkStringify from 'remark-stringify'; |
11 | 15 | import {unified} from 'unified'; |
12 | 16 | import {remove} from 'unist-util-remove'; |
13 | 17 |
|
/**
 * Main-thread entry point: locates the repo root, clears/recreates the output
 * directory, discovers all pre-rendered .html files under .next/server/app,
 * and distributes their conversion round-robin across worker threads (the
 * main thread itself processes the last task list).
 *
 * @returns {Promise<void>} resolves when every file has been converted
 * @throws {Error} if no package.json is found in any parent directory, or if
 *   any worker (or the main thread's own batch) reports failed conversions
 */
async function createWork() {
  // Walk up from cwd until we find package.json — that directory is the root.
  let root = process.cwd();
  while (!existsSync(path.join(root, 'package.json'))) {
    const parent = path.dirname(root);
    if (parent === root) {
      throw new Error('Could not find package.json in parent directories');
    }
    root = parent;
  }
  const INPUT_DIR = path.join(root, '.next', 'server', 'app');
  const OUTPUT_DIR = path.join(root, 'public', 'md-exports');

  console.log(`🚀 Starting markdown generation from: ${INPUT_DIR}`);
  console.log(`📁 Output directory: ${OUTPUT_DIR}`);

  // Clear output directory so stale exports never survive a rebuild.
  await rm(OUTPUT_DIR, {recursive: true, force: true});
  await mkdir(OUTPUT_DIR, {recursive: true});

  // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
  const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
  const workerTasks = new Array(numWorkers).fill(null).map(() => []);

  console.log(`🔎 Discovering files to convert...`);

  let numFiles = 0;
  let workerIdx = 0;
  // Need a high buffer size here otherwise Node skips some subdirectories!
  // See https://github.com/nodejs/node/issues/48820
  const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024});
  for await (const dirent of dir) {
    if (dirent.name.endsWith('.html') && dirent.isFile()) {
      // dirent.parentPath is the current API; dirent.path is the older fallback.
      const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name);
      const targetDir = path.join(
        OUTPUT_DIR,
        path.relative(INPUT_DIR, dirent.parentPath || dirent.path)
      );
      await mkdir(targetDir, {recursive: true});
      const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md');
      // Round-robin assignment keeps the per-worker lists balanced.
      workerTasks[workerIdx].push({sourcePath, targetPath});
      workerIdx = (workerIdx + 1) % numWorkers;
      numFiles++;
    }
  }

  console.log(`📄 Converting ${numFiles} files with ${numWorkers} workers...`);

  const selfPath = fileURLToPath(import.meta.url);
  // Spawn numWorkers - 1 threads; this thread takes the final task list below.
  const workerPromises = new Array(numWorkers - 1).fill(null).map((_, idx) => {
    return new Promise((resolve, reject) => {
      const worker = new Worker(selfPath, {workerData: workerTasks[idx]});
      let hasErrors = false;
      worker.on('message', data => {
        if (data.failedTasks.length === 0) {
          console.log(`✅ Worker[${idx}]: ${data.success} files successfully.`);
        } else {
          hasErrors = true;
          console.error(`❌ Worker[${idx}]: ${data.failedTasks.length} files failed:`);
          console.error(data.failedTasks);
        }
      });
      worker.on('error', reject);
      worker.on('exit', code => {
        if (code !== 0) {
          reject(new Error(`Worker[${idx}] stopped with exit code ${code}`));
        } else if (hasErrors) {
          reject(new Error(`Worker[${idx}] had some errors.`));
        } else {
          resolve();
        }
      });
    });
  });
  // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
  // Unlike the original discard of this result, surface main-thread failures the
  // same way worker failures are surfaced, so they cannot pass silently.
  workerPromises.push(
    processTaskList(workerTasks[workerTasks.length - 1]).then(data => {
      if (data.failedTasks.length > 0) {
        console.error(`❌ Main thread: ${data.failedTasks.length} files failed:`);
        console.error(data.failedTasks);
        throw new Error('Main thread had some errors.');
      }
      console.log(`✅ Main thread: ${data.success} files successfully.`);
    })
  );

  await Promise.all(workerPromises);

  console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
  console.log('✅ Markdown export generation complete!');
}
24 | 97 |
|
25 | | -export const genMDFromHTML = async (source, target) => { |
| 98 | +async function genMDFromHTML(source, target) { |
26 | 99 | const text = await readFile(source, {encoding: 'utf8'}); |
27 | 100 | await writeFile( |
28 | 101 | target, |
@@ -52,39 +125,26 @@ export const genMDFromHTML = async (source, target) => { |
52 | 125 | .process(text) |
53 | 126 | ) |
54 | 127 | ); |
55 | | -}; |
56 | | - |
57 | | -async function main() { |
58 | | - console.log(`🚀 Starting markdown generation from: ${INPUT_DIR}`); |
59 | | - console.log(`📁 Output directory: ${OUTPUT_DIR}`); |
| 128 | +} |
60 | 129 |
|
61 | | - // Clear output directory |
62 | | - await rm(OUTPUT_DIR, {recursive: true, force: true}); |
63 | | - await mkdir(OUTPUT_DIR, {recursive: true}); |
64 | | - let counter = 0; |
65 | | - try { |
66 | | - // Need a high buffer size here otherwise Node skips some subdirectories! |
67 | | - // See https://github.com/nodejs/node/issues/48820 |
68 | | - const dir = await opendir(INPUT_DIR, {recursive: true, bufferSize: 1024}); |
69 | | - for await (const dirent of dir) { |
70 | | - if (dirent.name.endsWith('.html') && dirent.isFile()) { |
71 | | - const sourcePath = path.join(dirent.parentPath || dirent.path, dirent.name); |
72 | | - const targetDir = path.join( |
73 | | - OUTPUT_DIR, |
74 | | - path.relative(INPUT_DIR, dirent.parentPath || dirent.path) |
75 | | - ); |
76 | | - await mkdir(targetDir, {recursive: true}); |
77 | | - const targetPath = path.join(targetDir, dirent.name.slice(0, -5) + '.md'); |
78 | | - await genMDFromHTML(sourcePath, targetPath); |
79 | | - counter++; |
80 | | - } |
/**
 * Runs a list of {sourcePath, targetPath} conversion tasks one after another.
 * Individual failures are collected rather than aborting the whole batch.
 *
 * @param {{sourcePath: string, targetPath: string}[]} tasks - files to convert
 * @returns {Promise<{success: number, failedTasks: Array}>} count of
 *   successful conversions plus details for every failed one
 */
async function processTaskList(tasks) {
  const failedTasks = [];
  let converted = 0;
  for (const task of tasks) {
    try {
      await genMDFromHTML(task.sourcePath, task.targetPath);
      converted += 1;
    } catch (error) {
      failedTasks.push({sourcePath: task.sourcePath, targetPath: task.targetPath, error});
    }
  }
  return {success: converted, failedTasks};
}
85 | 141 |
|
86 | | - console.log(`📄 Generated ${counter} markdown files from HTML.`); |
87 | | - console.log('✅ Markdown export generation complete!'); |
/**
 * Worker-thread entry point: converts the assigned task list, then reports
 * the outcome back to the main thread via the parent port.
 *
 * @param {{sourcePath: string, targetPath: string}[]} tasks - files to convert
 * @returns {Promise<void>}
 */
async function doWork(tasks) {
  const result = await processTaskList(tasks);
  parentPort.postMessage(result);
}
89 | 145 |
|
90 | | -main().catch(console.error); |
// Dispatch on thread role: the main thread orchestrates the whole run,
// worker threads convert the task list handed to them via workerData.
await (isMainThread ? createWork() : doWork(workerData));
0 commit comments