@@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
55import { selectAll } from 'hast-util-select' ;
66import { createHash } from 'node:crypto' ;
77import { createReadStream , createWriteStream , existsSync } from 'node:fs' ;
8- import { mkdir , opendir , readdir , readFile , rm , stat , writeFile } from 'node:fs/promises' ;
8+ import { mkdir , opendir , readdir , readFile , rm , writeFile } from 'node:fs/promises' ;
99import { cpus } from 'node:os' ;
1010import * as path from 'node:path' ;
1111import { compose , Readable } from 'node:stream' ;
@@ -58,7 +58,12 @@ async function uploadToCFR2(s3Client, relativePath, data) {
5858 return ;
5959}
6060
61- function taskFinishHandler ( { id, success, failedTasks} ) {
61+ function taskFinishHandler ( { id, success, failedTasks, usedCacheFiles} , allUsedCacheFiles ) {
62+ // Collect cache files used by this worker
63+ if ( usedCacheFiles ) {
64+ usedCacheFiles . forEach ( file => allUsedCacheFiles . add ( file ) ) ;
65+ }
66+
6267 if ( failedTasks . length === 0 ) {
6368 console . log ( `✅ Worker[${ id } ]: converted ${ success } files successfully.` ) ;
6469 return false ;
@@ -93,39 +98,11 @@ async function createWork() {
9398 if ( noCache ) {
9499 console . log ( `ℹ️ No cache directory found, this will take a while...` ) ;
95100 await mkdir ( CACHE_DIR , { recursive : true } ) ;
96- } else {
97- // Clean up old cache files to prevent unbounded growth
98- // Keep files accessed within last 7 days only
99- const MAX_CACHE_AGE_MS = 7 * 24 * 60 * 60 * 1000 ; // 7 days
100- const now = Date . now ( ) ;
101- let cleanedCount = 0 ;
102-
103- try {
104- const files = await readdir ( CACHE_DIR ) ;
105-
106- for ( const file of files ) {
107- const filePath = path . join ( CACHE_DIR , file ) ;
108- try {
109- const stats = await stat ( filePath ) ;
110- const age = now - stats . atimeMs ; // Time since last access
111-
112- if ( age > MAX_CACHE_AGE_MS ) {
113- await rm ( filePath , { force : true } ) ;
114- cleanedCount ++ ;
115- }
116- } catch ( err ) {
117- // Skip files we can't stat/delete
118- }
119- }
120-
121- if ( cleanedCount > 0 ) {
122- console . log ( `🧹 Cleaned up ${ cleanedCount } old cache files (>7 days)` ) ;
123- }
124- } catch ( err ) {
125- console . warn ( 'Failed to clean cache:' , err ) ;
126- }
127101 }
128102
103+ // Track which cache files are used during this build
104+ const usedCacheFiles = new Set ( ) ;
105+
129106 // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
130107 const numWorkers = Math . max ( Math . floor ( cpus ( ) . length / 2 ) , 2 ) ;
131108 const workerTasks = new Array ( numWorkers ) . fill ( null ) . map ( ( ) => [ ] ) ;
@@ -194,7 +171,7 @@ async function createWork() {
194171 } ,
195172 } ) ;
196173 let hasErrors = false ;
197- worker . on ( 'message' , data => ( hasErrors = taskFinishHandler ( data ) ) ) ;
174+ worker . on ( 'message' , data => ( hasErrors = taskFinishHandler ( data , usedCacheFiles ) ) ) ;
198175 worker . on ( 'error' , reject ) ;
199176 worker . on ( 'exit' , code => {
200177 if ( code !== 0 ) {
@@ -206,28 +183,51 @@ async function createWork() {
206183 } ) ;
207184 } ) ;
208185 // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
186+ const mainThreadUsedFiles = new Set ( ) ;
209187 workerPromises . push (
210188 processTaskList ( {
211189 id : workerTasks . length - 1 ,
212190 tasks : workerTasks [ workerTasks . length - 1 ] ,
213191 cacheDir : CACHE_DIR ,
214192 noCache,
193+ usedCacheFiles : mainThreadUsedFiles ,
215194 } ) . then ( data => {
216- if ( taskFinishHandler ( data ) ) {
195+ if ( taskFinishHandler ( data , usedCacheFiles ) ) {
217196 throw new Error ( `Worker[${ data . id } ] had some errors.` ) ;
218197 }
219198 } )
220199 ) ;
221200
222201 await Promise . all ( workerPromises ) ;
223202
203+ // Clean up unused cache files to prevent unbounded growth
204+ if ( ! noCache ) {
205+ try {
206+ const allFiles = await readdir ( CACHE_DIR ) ;
207+ let cleanedCount = 0 ;
208+
209+ for ( const file of allFiles ) {
210+ if ( ! usedCacheFiles . has ( file ) ) {
211+ await rm ( path . join ( CACHE_DIR , file ) , { force : true } ) ;
212+ cleanedCount ++ ;
213+ }
214+ }
215+
216+ if ( cleanedCount > 0 ) {
217+ console . log ( `🧹 Cleaned up ${ cleanedCount } unused cache files` ) ;
218+ }
219+ } catch ( err ) {
220+ console . warn ( 'Failed to clean unused cache files:' , err ) ;
221+ }
222+ }
223+
224224 console . log ( `📄 Generated ${ numFiles } markdown files from HTML.` ) ;
225225 console . log ( '✅ Markdown export generation complete!' ) ;
226226}
227227
/**
 * Computes the MD5 digest of `data` and returns it as a lowercase hex string.
 *
 * MD5 is used here purely as a fast content fingerprint; it must not be relied
 * on for anything security-sensitive.
 *
 * @param {string | Buffer | TypedArray | DataView} data - Content to hash.
 * @returns {string} 32-character hexadecimal digest.
 */
const md5 = data => createHash('md5').update(data).digest('hex');
229229
230- async function genMDFromHTML ( source , target , { cacheDir, noCache} ) {
230+ async function genMDFromHTML ( source , target , { cacheDir, noCache, usedCacheFiles } ) {
231231 const leanHTML = ( await readFile ( source , { encoding : 'utf8' } ) )
232232 // Remove all script tags, as they are not needed in markdown
233233 // and they are not stable across builds, causing cache misses
@@ -241,6 +241,9 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
241241 ) ;
242242 await writeFile ( target , data , { encoding : 'utf8' } ) ;
243243
244+ // Track that we used this cache file
245+ usedCacheFiles . add ( cacheKey ) ;
246+
244247 return { cacheHit : true , data} ;
245248 } catch ( err ) {
246249 if ( err . code !== 'ENOENT' ) {
@@ -338,7 +341,7 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
338341 return { cacheHit : false , data} ;
339342}
340343
341- async function processTaskList ( { id, tasks, cacheDir, noCache} ) {
344+ async function processTaskList ( { id, tasks, cacheDir, noCache, usedCacheFiles } ) {
342345 const s3Client = getS3Client ( ) ;
343346 const failedTasks = [ ] ;
344347 let cacheMisses = [ ] ;
@@ -349,6 +352,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
349352 const { data, cacheHit} = await genMDFromHTML ( sourcePath , targetPath , {
350353 cacheDir,
351354 noCache,
355+ usedCacheFiles,
352356 } ) ;
353357 if ( ! cacheHit ) {
354358 cacheMisses . push ( relativePath ) ;
@@ -388,6 +392,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
388392 id,
389393 success,
390394 failedTasks,
395+ usedCacheFiles : Array . from ( usedCacheFiles ) ,
391396 } ;
392397}
393398
0 commit comments