diff --git a/scripts/generate-md-exports.mjs b/scripts/generate-md-exports.mjs
index 229a700691e29..35bbfb55c442e 100644
--- a/scripts/generate-md-exports.mjs
+++ b/scripts/generate-md-exports.mjs
@@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
 import {selectAll} from 'hast-util-select';
 import {createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
+import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
 import {cpus} from 'node:os';
 import * as path from 'node:path';
 import {compose, Readable} from 'node:stream';
@@ -58,7 +58,20 @@ async function uploadToCFR2(s3Client, relativePath, data) {
   return;
 }
 
-function taskFinishHandler({id, success, failedTasks}) {
+// Global set to track which cache files are used across all workers
+let globalUsedCacheFiles = null;
+
+function taskFinishHandler({id, success, failedTasks, usedCacheFiles}) {
+  // Collect cache files used by this worker into the global set
+  if (usedCacheFiles && globalUsedCacheFiles) {
+    console.log(`🔍 Worker[${id}]: returned ${usedCacheFiles.length} cache files.`);
+    usedCacheFiles.forEach(file => globalUsedCacheFiles.add(file));
+  } else {
+    console.warn(
+      `⚠️ Worker[${id}]: usedCacheFiles=${!!usedCacheFiles}, globalUsedCacheFiles=${!!globalUsedCacheFiles}`
+    );
+  }
+
   if (failedTasks.length === 0) {
     console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
     return false;
@@ -93,8 +106,16 @@ async function createWork() {
   if (noCache) {
     console.log(`ℹ️ No cache directory found, this will take a while...`);
     await mkdir(CACHE_DIR, {recursive: true});
+  } else {
+    const initialCacheFiles = await readdir(CACHE_DIR);
+    console.log(
+      `📦 Cache directory has ${initialCacheFiles.length} files from previous build`
+    );
   }
 
+  // Track which cache files are used during this build
+  globalUsedCacheFiles = new Set();
+
   // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
   const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
   const workerTasks = new Array(numWorkers).fill(null).map(() => []);
@@ -175,12 +196,14 @@ async function createWork() {
     });
   });
   // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
+  const mainThreadUsedFiles = new Set();
   workerPromises.push(
     processTaskList({
       id: workerTasks.length - 1,
       tasks: workerTasks[workerTasks.length - 1],
       cacheDir: CACHE_DIR,
       noCache,
+      usedCacheFiles: mainThreadUsedFiles,
     }).then(data => {
       if (taskFinishHandler(data)) {
         throw new Error(`Worker[${data.id}] had some errors.`);
@@ -190,13 +213,42 @@ async function createWork() {
 
   await Promise.all(workerPromises);
 
+  // Clean up unused cache files to prevent unbounded growth
+  if (!noCache) {
+    try {
+      const allFiles = await readdir(CACHE_DIR);
+      const filesToDelete = allFiles.filter(file => !globalUsedCacheFiles.has(file));
+      const overlaps = allFiles.filter(file => globalUsedCacheFiles.has(file));
+
+      console.log(`📊 Cache tracking stats:`);
+      console.log(` - Files in cache dir (after build): ${allFiles.length}`);
+      console.log(` - Files tracked as used: ${globalUsedCacheFiles.size}`);
+      console.log(` - Files that existed and were used: ${overlaps.length}`);
+      console.log(` - Files to delete (old/unused): ${filesToDelete.length}`);
+      console.log(` - Expected after cleanup: ${overlaps.length} files`);
+
+      if (filesToDelete.length > 0) {
+        await Promise.all(
+          filesToDelete.map(file => rm(path.join(CACHE_DIR, file), {force: true}))
+        );
+        console.log(`🧹 Cleaned up ${filesToDelete.length} unused cache files`);
+      }
+
+      // Verify cleanup worked
+      const remainingFiles = await readdir(CACHE_DIR);
+      console.log(`✅ Cache directory now has ${remainingFiles.length} files`);
+    } catch (err) {
+      console.warn('Failed to clean unused cache files:', err);
+    }
+  }
+
   console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
   console.log('✅ Markdown export generation complete!');
 }
 
 const md5 = data => createHash('md5').update(data).digest('hex');
 
-async function genMDFromHTML(source, target, {cacheDir, noCache}) {
+async function genMDFromHTML(source, target, {cacheDir, noCache, usedCacheFiles}) {
   const leanHTML = (await readFile(source, {encoding: 'utf8'}))
     // Remove all script tags, as they are not needed in markdown
     // and they are not stable across builds, causing cache misses
@@ -210,6 +262,11 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     );
     await writeFile(target, data, {encoding: 'utf8'});
 
+    // Track that we used this cache file
+    if (usedCacheFiles) {
+      usedCacheFiles.add(cacheKey);
+    }
+
     return {cacheHit: true, data};
   } catch (err) {
     if (err.code !== 'ENOENT') {
@@ -304,10 +361,20 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     ).catch(err => console.warn('Error writing cache file:', err)),
   ]);
 
+  // Track that we created this cache file
+  if (usedCacheFiles) {
+    usedCacheFiles.add(cacheKey);
+  }
+
   return {cacheHit: false, data};
 }
 
-async function processTaskList({id, tasks, cacheDir, noCache}) {
+async function processTaskList({id, tasks, cacheDir, noCache, usedCacheFiles}) {
+  // Workers don't receive usedCacheFiles in workerData, so create a new Set
+  if (!usedCacheFiles) {
+    usedCacheFiles = new Set();
+  }
+
   const s3Client = getS3Client();
   const failedTasks = [];
   let cacheMisses = [];
@@ -318,6 +385,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
       const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
         cacheDir,
         noCache,
+        usedCacheFiles,
       });
       if (!cacheHit) {
        cacheMisses.push(relativePath);
@@ -345,6 +413,11 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
      `📤 Worker[${id}]: Updated the following files on R2: \n${r2CacheMisses.map(n => ` - ${n}`).join('\n')}`
    );
  }
+  const cacheHits = success - cacheMisses.length;
+  console.log(
+    `📈 Worker[${id}]: Cache stats: ${cacheHits} hits, ${cacheMisses.length} misses (${((cacheMisses.length / success) * 100).toFixed(1)}% miss rate)`
+  );
+
   if (cacheMisses.length / tasks.length > 0.1) {
     console.warn(`⚠️ Worker[${id}]: More than 10% cache miss rate during build.`);
   } else if (cacheMisses.length > 0) {
@@ -357,6 +430,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
     id,
     success,
     failedTasks,
+    usedCacheFiles: Array.from(usedCacheFiles),
   };
 }
diff --git a/src/mdx.ts b/src/mdx.ts
index 459caea86d37f..1fc5189fad55e 100644
--- a/src/mdx.ts
+++ b/src/mdx.ts
@@ -68,6 +68,58 @@ if (process.env.CI) {
 
 const md5 = (data: BinaryLike) => createHash('md5').update(data).digest('hex');
 
+// Worker-level registry cache to avoid fetching multiple times per worker
+let cachedRegistryHash: Promise<string> | null = null;
+
+/**
+ * Fetch registry data and compute its hash, with retry logic and exponential backoff.
+ * Retries up to maxRetries times with exponential backoff starting at initialDelayMs.
+ */
+async function getRegistryHashWithRetry(
+  maxRetries = 3,
+  initialDelayMs = 1000
+): Promise<string> {
+  let lastError: Error | null = null;
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      const [apps, packages] = await Promise.all([
+        getAppRegistry(),
+        getPackageRegistry(),
+      ]);
+      return md5(JSON.stringify({apps, packages}));
+    } catch (err) {
+      lastError = err as Error;
+
+      if (attempt < maxRetries) {
+        const delay = initialDelayMs * Math.pow(2, attempt);
+        // eslint-disable-next-line no-console
+        console.warn(
+          `Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${delay}ms...`,
+          err
+        );
+        await new Promise(resolve => setTimeout(resolve, delay));
+      }
+    }
+  }
+
+  throw lastError || new Error('Failed to fetch registry after all retries');
+}
+
+/**
+ * Get the registry hash, using cached value if available.
+ * This ensures we only fetch the registry once per worker process.
+ * If the fetch fails, the error is cached so subsequent calls fail fast.
+ */
+function getRegistryHash(): Promise<string> {
+  if (!cachedRegistryHash) {
+    // eslint-disable-next-line no-console
+    console.info('Fetching registry hash for the first time in this worker');
+    cachedRegistryHash = getRegistryHashWithRetry();
+  }
+  return cachedRegistryHash;
+}
+
 async function readCacheFile<T>(file: string): Promise<T> {
   const reader = createReadStream(file);
   const decompressor = createBrotliDecompress();
@@ -541,23 +593,41 @@ export async function getFileBySlug(slug: string): Promise {
     // continue anyway - images should already exist from build time
   }
 
-  // If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
-  // This is because the content from the registry might have changed since the last time the file was cached.
-  // If a new component that injects content from the registry is introduced, it should be added to the patterns below.
-  const skipCache =
+  // Detect if file contains content that depends on the Release Registry
+  // If it does, we include the registry hash in the cache key so the cache
+  // is invalidated when the registry changes.
+  const dependsOnRegistry = source.includes('@inject') || source.includes(' { }, };
 
-  if (assetsCacheDir && cacheFile && !skipCache) {
+  if (assetsCacheDir && cacheFile && cacheKey) {
     try {
       await cp(assetsCacheDir, outdir, {recursive: true});
     } catch (e) {
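
The hunk above is cut off where the new cache key is actually computed. As a rough illustration only (not lines from this patch), the approach the replacement comments describe, folding the registry hash into the cache key for registry-dependent files, could look like the sketch below. The helper name buildCacheKey and the exact key composition are assumptions; md5, getRegistryHash, and dependsOnRegistry come from the diff above.

// Illustrative sketch, not part of the patch: assumes the md5() helper,
// getRegistryHash(), and the dependsOnRegistry flag introduced above;
// buildCacheKey is a made-up name.
async function buildCacheKey(
  source: string,
  dependsOnRegistry: boolean
): Promise<string | null> {
  try {
    // Registry-dependent files mix the registry hash into the key, so any
    // registry change yields a new key and forces a rebuild of those files.
    const registryHash = dependsOnRegistry ? await getRegistryHash() : '';
    return md5(source + registryHash);
  } catch {
    // If the registry cannot be fetched even after retries, return null so the
    // file skips the cache (matching the `cacheFile && cacheKey` guard above).
    return null;
  }
}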