Skip to content

Commit 4cc99d1

Browse files
committed
delete if not used
1 parent 4011531 commit 4cc99d1

File tree

2 files changed

+45
-39
lines changed

2 files changed

+45
-39
lines changed

scripts/generate-md-exports.mjs

Lines changed: 42 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
55
import {selectAll} from 'hast-util-select';
66
import {createHash} from 'node:crypto';
77
import {createReadStream, createWriteStream, existsSync} from 'node:fs';
8-
import {mkdir, opendir, readdir, readFile, rm, stat, writeFile} from 'node:fs/promises';
8+
import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
99
import {cpus} from 'node:os';
1010
import * as path from 'node:path';
1111
import {compose, Readable} from 'node:stream';
@@ -58,7 +58,12 @@ async function uploadToCFR2(s3Client, relativePath, data) {
5858
return;
5959
}
6060

61-
function taskFinishHandler({id, success, failedTasks}) {
61+
function taskFinishHandler({id, success, failedTasks, usedCacheFiles}, allUsedCacheFiles) {
62+
// Collect cache files used by this worker
63+
if (usedCacheFiles) {
64+
usedCacheFiles.forEach(file => allUsedCacheFiles.add(file));
65+
}
66+
6267
if (failedTasks.length === 0) {
6368
console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
6469
return false;
@@ -93,39 +98,11 @@ async function createWork() {
9398
if (noCache) {
9499
console.log(`ℹ️ No cache directory found, this will take a while...`);
95100
await mkdir(CACHE_DIR, {recursive: true});
96-
} else {
97-
// Clean up old cache files to prevent unbounded growth
98-
// Keep files accessed within last 7 days only
99-
const MAX_CACHE_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
100-
const now = Date.now();
101-
let cleanedCount = 0;
102-
103-
try {
104-
const files = await readdir(CACHE_DIR);
105-
106-
for (const file of files) {
107-
const filePath = path.join(CACHE_DIR, file);
108-
try {
109-
const stats = await stat(filePath);
110-
const age = now - stats.atimeMs; // Time since last access
111-
112-
if (age > MAX_CACHE_AGE_MS) {
113-
await rm(filePath, {force: true});
114-
cleanedCount++;
115-
}
116-
} catch (err) {
117-
// Skip files we can't stat/delete
118-
}
119-
}
120-
121-
if (cleanedCount > 0) {
122-
console.log(`🧹 Cleaned up ${cleanedCount} old cache files (>7 days)`);
123-
}
124-
} catch (err) {
125-
console.warn('Failed to clean cache:', err);
126-
}
127101
}
128102

103+
// Track which cache files are used during this build
104+
const usedCacheFiles = new Set();
105+
129106
// On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
130107
const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
131108
const workerTasks = new Array(numWorkers).fill(null).map(() => []);
@@ -194,7 +171,7 @@ async function createWork() {
194171
},
195172
});
196173
let hasErrors = false;
197-
worker.on('message', data => (hasErrors = taskFinishHandler(data)));
174+
worker.on('message', data => (hasErrors = taskFinishHandler(data, usedCacheFiles)));
198175
worker.on('error', reject);
199176
worker.on('exit', code => {
200177
if (code !== 0) {
@@ -206,28 +183,51 @@ async function createWork() {
206183
});
207184
});
208185
// The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
186+
const mainThreadUsedFiles = new Set();
209187
workerPromises.push(
210188
processTaskList({
211189
id: workerTasks.length - 1,
212190
tasks: workerTasks[workerTasks.length - 1],
213191
cacheDir: CACHE_DIR,
214192
noCache,
193+
usedCacheFiles: mainThreadUsedFiles,
215194
}).then(data => {
216-
if (taskFinishHandler(data)) {
195+
if (taskFinishHandler(data, usedCacheFiles)) {
217196
throw new Error(`Worker[${data.id}] had some errors.`);
218197
}
219198
})
220199
);
221200

222201
await Promise.all(workerPromises);
223202

203+
// Delete cache files that no task reported as used during this build, to prevent unbounded growth
204+
if (!noCache) {
205+
try {
206+
const allFiles = await readdir(CACHE_DIR);
207+
let cleanedCount = 0;
208+
209+
for (const file of allFiles) {
210+
if (!usedCacheFiles.has(file)) {
211+
await rm(path.join(CACHE_DIR, file), {force: true});
212+
cleanedCount++;
213+
}
214+
}
215+
216+
if (cleanedCount > 0) {
217+
console.log(`🧹 Cleaned up ${cleanedCount} unused cache files`);
218+
}
219+
} catch (err) {
220+
console.warn('Failed to clean unused cache files:', err);
221+
}
222+
}
223+
224224
console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
225225
console.log('✅ Markdown export generation complete!');
226226
}
227227

228228
const md5 = data => createHash('md5').update(data).digest('hex');
229229

230-
async function genMDFromHTML(source, target, {cacheDir, noCache}) {
230+
async function genMDFromHTML(source, target, {cacheDir, noCache, usedCacheFiles}) {
231231
const leanHTML = (await readFile(source, {encoding: 'utf8'}))
232232
// Remove all script tags, as they are not needed in markdown
233233
// and they are not stable across builds, causing cache misses
@@ -241,6 +241,9 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
241241
);
242242
await writeFile(target, data, {encoding: 'utf8'});
243243

244+
// Track that we used this cache file
245+
usedCacheFiles.add(cacheKey);
246+
244247
return {cacheHit: true, data};
245248
} catch (err) {
246249
if (err.code !== 'ENOENT') {
@@ -338,7 +341,7 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
338341
return {cacheHit: false, data};
339342
}
340343

341-
async function processTaskList({id, tasks, cacheDir, noCache}) {
344+
async function processTaskList({id, tasks, cacheDir, noCache, usedCacheFiles}) {
342345
const s3Client = getS3Client();
343346
const failedTasks = [];
344347
let cacheMisses = [];
@@ -349,6 +352,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
349352
const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
350353
cacheDir,
351354
noCache,
355+
usedCacheFiles,
352356
});
353357
if (!cacheHit) {
354358
cacheMisses.push(relativePath);
@@ -388,6 +392,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
388392
id,
389393
success,
390394
failedTasks,
395+
usedCacheFiles: Array.from(usedCacheFiles),
391396
};
392397
}
393398

src/mdx.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,11 @@ if (process.env.CI) {
6666
mkdirSync(CACHE_DIR, {recursive: true});
6767

6868
// Clean up old cache files in background to prevent unbounded growth
69+
// Delete any file not accessed in the last 24 hours (meaning it wasn't used in recent builds)
6970
// This runs once per worker process and doesn't block the build
7071
(async () => {
7172
try {
72-
const MAX_CACHE_AGE_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
73+
const MAX_CACHE_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
7374
const now = Date.now();
7475
let cleanedCount = 0;
7576

@@ -96,7 +97,7 @@ if (process.env.CI) {
9697

9798
if (cleanedCount > 0) {
9899
// eslint-disable-next-line no-console
99-
console.log(`🧹 MDX cache: Cleaned up ${cleanedCount} old items (>7 days)`);
100+
console.log(`🧹 MDX cache: Cleaned up ${cleanedCount} unused items (>24h)`);
100101
}
101102
} catch (err) {
102103
// Silently fail - cache cleanup is not critical

0 commit comments

Comments
 (0)