Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0c2799d
feat(caching) try with 2 workers on enhanced build
sergical Oct 29, 2025
79ac993
testing with more workers
sergical Oct 29, 2025
6bc7aae
75% workers
sergical Oct 29, 2025
52dee11
test 1 worker
sergical Oct 29, 2025
865b223
caching release registry
sergical Oct 29, 2025
8565372
bring back 75% cpu
sergical Oct 29, 2025
e968968
back to half
sergical Oct 29, 2025
98d867c
registry cache logs
sergical Oct 29, 2025
c163037
lets test file changes
sergical Oct 29, 2025
4f15275
revert content changes
sergical Oct 30, 2025
5e97f5a
cleanup old cache files
sergical Oct 30, 2025
4011531
lint
sergical Oct 30, 2025
4cc99d1
delete if not used
sergical Oct 30, 2025
95385be
worker cleanup
sergical Oct 30, 2025
bef41c2
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 30, 2025
6e8ad91
address byk's comments
sergical Oct 31, 2025
e803adc
Fix merge conflict resolution: use cacheKey condition instead of skip…
sergical Oct 31, 2025
aa5dc16
Remove error cache reset - cache failures so worker fails fast
sergical Oct 31, 2025
5116b55
Add debug logging for cache tracking
sergical Oct 31, 2025
64ed4c0
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 31, 2025
9e33c13
Add detailed debug logging for cache cleanup
sergical Oct 31, 2025
95af6ef
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 31, 2025
6a793f2
Add overlap detection and cache miss debugging
sergical Oct 31, 2025
529e692
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 31, 2025
577e9d4
Add detailed cache hit/miss stats and initial cache size
sergical Oct 31, 2025
4683bcf
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 31, 2025
b041698
Add detection for non-deterministic HTML patterns
sergical Oct 31, 2025
cefcb46
[getsentry/action-github-commit] Auto commit
getsantry[bot] Oct 31, 2025
bd569ac
Fix cache by normalizing timestamps and Next.js asset hashes
sergical Oct 31, 2025
eba9346
Revert hasty fix and add proper debugging
sergical Oct 31, 2025
3b93727
Output debug info to build logs instead of file
sergical Oct 31, 2025
28b6431
Save leanHTML samples locally for debugging
sergical Oct 31, 2025
77e937b
keep testing
sergical Oct 31, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 78 additions & 4 deletions scripts/generate-md-exports.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
import {selectAll} from 'hast-util-select';
import {createHash} from 'node:crypto';
import {createReadStream, createWriteStream, existsSync} from 'node:fs';
import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
import {cpus} from 'node:os';
import * as path from 'node:path';
import {compose, Readable} from 'node:stream';
Expand Down Expand Up @@ -58,7 +58,20 @@ async function uploadToCFR2(s3Client, relativePath, data) {
return;
}

function taskFinishHandler({id, success, failedTasks}) {
// Global set to track which cache files are used across all workers
let globalUsedCacheFiles = null;

function taskFinishHandler({id, success, failedTasks, usedCacheFiles}) {
// Collect cache files used by this worker into the global set
if (usedCacheFiles && globalUsedCacheFiles) {
console.log(`🔍 Worker[${id}]: returned ${usedCacheFiles.length} cache files.`);
usedCacheFiles.forEach(file => globalUsedCacheFiles.add(file));
} else {
console.warn(
`⚠️ Worker[${id}]: usedCacheFiles=${!!usedCacheFiles}, globalUsedCacheFiles=${!!globalUsedCacheFiles}`
);
}

if (failedTasks.length === 0) {
console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
return false;
Expand Down Expand Up @@ -93,8 +106,16 @@ async function createWork() {
if (noCache) {
console.log(`ℹ️ No cache directory found, this will take a while...`);
await mkdir(CACHE_DIR, {recursive: true});
} else {
const initialCacheFiles = await readdir(CACHE_DIR);
console.log(
`📦 Cache directory has ${initialCacheFiles.length} files from previous build`
);
}

// Track which cache files are used during this build
globalUsedCacheFiles = new Set();

// On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
const workerTasks = new Array(numWorkers).fill(null).map(() => []);
Expand Down Expand Up @@ -175,12 +196,14 @@ async function createWork() {
});
});
// The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
const mainThreadUsedFiles = new Set();
workerPromises.push(
processTaskList({
id: workerTasks.length - 1,
tasks: workerTasks[workerTasks.length - 1],
cacheDir: CACHE_DIR,
noCache,
usedCacheFiles: mainThreadUsedFiles,
}).then(data => {
if (taskFinishHandler(data)) {
throw new Error(`Worker[${data.id}] had some errors.`);
Expand All @@ -190,13 +213,42 @@ async function createWork() {

await Promise.all(workerPromises);

// Clean up unused cache files to prevent unbounded growth
if (!noCache) {
try {
const allFiles = await readdir(CACHE_DIR);
const filesToDelete = allFiles.filter(file => !globalUsedCacheFiles.has(file));
const overlaps = allFiles.filter(file => globalUsedCacheFiles.has(file));

console.log(`📊 Cache tracking stats:`);
console.log(` - Files in cache dir (after build): ${allFiles.length}`);
console.log(` - Files tracked as used: ${globalUsedCacheFiles.size}`);
console.log(` - Files that existed and were used: ${overlaps.length}`);
console.log(` - Files to delete (old/unused): ${filesToDelete.length}`);
console.log(` - Expected after cleanup: ${overlaps.length} files`);

if (filesToDelete.length > 0) {
await Promise.all(
filesToDelete.map(file => rm(path.join(CACHE_DIR, file), {force: true}))
);
console.log(`🧹 Cleaned up ${filesToDelete.length} unused cache files`);
}

// Verify cleanup worked
const remainingFiles = await readdir(CACHE_DIR);
console.log(`✅ Cache directory now has ${remainingFiles.length} files`);
} catch (err) {
console.warn('Failed to clean unused cache files:', err);
}
}

console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
console.log('✅ Markdown export generation complete!');
}

// Hex-encoded MD5 digest of `data`; used to derive content-addressed cache keys.
const md5 = data => {
  const hasher = createHash('md5');
  hasher.update(data);
  return hasher.digest('hex');
};

async function genMDFromHTML(source, target, {cacheDir, noCache}) {
async function genMDFromHTML(source, target, {cacheDir, noCache, usedCacheFiles}) {
const leanHTML = (await readFile(source, {encoding: 'utf8'}))
// Remove all script tags, as they are not needed in markdown
// and they are not stable across builds, causing cache misses
Expand All @@ -210,6 +262,11 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
);
await writeFile(target, data, {encoding: 'utf8'});

// Track that we used this cache file
if (usedCacheFiles) {
usedCacheFiles.add(cacheKey);
}

return {cacheHit: true, data};
} catch (err) {
if (err.code !== 'ENOENT') {
Expand Down Expand Up @@ -304,10 +361,20 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
).catch(err => console.warn('Error writing cache file:', err)),
]);

// Track that we created this cache file
if (usedCacheFiles) {
usedCacheFiles.add(cacheKey);
}

return {cacheHit: false, data};
}

async function processTaskList({id, tasks, cacheDir, noCache}) {
async function processTaskList({id, tasks, cacheDir, noCache, usedCacheFiles}) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Cache Miss Handling Fails

The cache cleanup logic immediately deletes newly created cache files. When genMDFromHTML generates a cache file due to a miss, it doesn't add the file's key to the usedCacheFiles set. This prevents the cache from being effective for those files in subsequent builds.

Additional Locations (1)

Fix in Cursor Fix in Web

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks legit?

// Workers don't receive usedCacheFiles in workerData, so create a new Set
if (!usedCacheFiles) {
usedCacheFiles = new Set();
}

const s3Client = getS3Client();
const failedTasks = [];
let cacheMisses = [];
Expand All @@ -318,6 +385,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
cacheDir,
noCache,
usedCacheFiles,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Worker Initialization Missing Cache Set

Worker threads are initialized without a usedCacheFiles set in their workerData. This causes usedCacheFiles to be undefined within the worker's processTaskList context, leading to a TypeError when genMDFromHTML attempts to call usedCacheFiles.add() on a cache hit.

Additional Locations (1)

Fix in Cursor Fix in Web

});
if (!cacheHit) {
cacheMisses.push(relativePath);
Expand Down Expand Up @@ -345,6 +413,11 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
`📤 Worker[${id}]: Updated the following files on R2: \n${r2CacheMisses.map(n => ` - ${n}`).join('\n')}`
);
}
const cacheHits = success - cacheMisses.length;
console.log(
`📈 Worker[${id}]: Cache stats: ${cacheHits} hits, ${cacheMisses.length} misses (${((cacheMisses.length / success) * 100).toFixed(1)}% miss rate)`
);

if (cacheMisses.length / tasks.length > 0.1) {
console.warn(`⚠️ Worker[${id}]: More than 10% cache miss rate during build.`);
} else if (cacheMisses.length > 0) {
Expand All @@ -357,6 +430,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
id,
success,
failedTasks,
usedCacheFiles: Array.from(usedCacheFiles),
};
}

Expand Down
92 changes: 81 additions & 11 deletions src/mdx.ts
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,58 @@ if (process.env.CI) {

// Hex-encoded MD5 digest of `data`; used to build cache keys for compiled MDX.
const md5 = (data: BinaryLike): string => {
  return createHash('md5').update(data).digest('hex');
};

// Worker-level registry cache to avoid fetching multiple times per worker
let cachedRegistryHash: Promise<string> | null = null;

/**
 * Fetch the app and package registries and return an MD5 hash of their
 * combined JSON, retrying transient failures with exponential backoff.
 *
 * @param maxRetries - number of retries after the first attempt
 *   (so maxRetries + 1 total attempts).
 * @param initialDelayMs - backoff delay before the first retry; doubles
 *   after each failed attempt.
 * @returns hash of the current registry contents.
 * @throws the last fetch error once every attempt has failed.
 */
async function getRegistryHashWithRetry(
  maxRetries = 3,
  initialDelayMs = 1000
): Promise<string> {
  let lastError: Error | null = null;
  let attempt = 0;

  while (attempt <= maxRetries) {
    try {
      // Both registries are fetched concurrently; either failure triggers a retry.
      const [apps, packages] = await Promise.all([
        getAppRegistry(),
        getPackageRegistry(),
      ]);
      return md5(JSON.stringify({apps, packages}));
    } catch (err) {
      lastError = err as Error;
      const isFinalAttempt = attempt >= maxRetries;
      if (!isFinalAttempt) {
        const delay = initialDelayMs * 2 ** attempt;
        // eslint-disable-next-line no-console
        console.warn(
          `Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${delay}ms...`,
          err
        );
        await new Promise(resolve => setTimeout(resolve, delay));
      }
    }
    attempt += 1;
  }

  throw lastError || new Error('Failed to fetch registry after all retries');
}

/**
 * Return the registry hash, memoized per worker process.
 *
 * The Promise itself is cached, so concurrent callers share a single
 * in-flight fetch. A rejected Promise stays cached on purpose: once the
 * registry fetch has failed, later callers fail fast instead of retrying.
 */
function getRegistryHash(): Promise<string> {
  if (cachedRegistryHash !== null) {
    return cachedRegistryHash;
  }
  // eslint-disable-next-line no-console
  console.info('Fetching registry hash for the first time in this worker');
  cachedRegistryHash = getRegistryHashWithRetry();
  return cachedRegistryHash;
}

async function readCacheFile<T>(file: string): Promise<T> {
const reader = createReadStream(file);
const decompressor = createBrotliDecompress();
Expand Down Expand Up @@ -541,23 +593,41 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
// continue anyway - images should already exist from build time
}

// If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
// This is because the content from the registry might have changed since the last time the file was cached.
// If a new component that injects content from the registry is introduced, it should be added to the patterns below.
const skipCache =
// Detect if file contains content that depends on the Release Registry
// If it does, we include the registry hash in the cache key so the cache
// is invalidated when the registry changes.
const dependsOnRegistry =
source.includes('@inject') ||
source.includes('<PlatformSDKPackageName') ||
source.includes('<LambdaLayerDetail');

// Check cache in CI environments
if (process.env.CI) {
if (skipCache) {
// eslint-disable-next-line no-console
console.info(
`Not using cached version of ${sourcePath}, as its content depends on the Release Registry`
);
const sourceHash = md5(source);

// Include registry hash in cache key for registry-dependent files
if (dependsOnRegistry) {
try {
const registryHash = await getRegistryHash();
cacheKey = `${sourceHash}-${registryHash}`;
// eslint-disable-next-line no-console
console.info(
`Using registry-aware cache for ${sourcePath} (registry hash: ${registryHash.slice(0, 8)}...)`
);
} catch (err) {
// If we can't get registry hash, skip cache for this file
// eslint-disable-next-line no-console
console.warn(
`Failed to get registry hash for ${sourcePath}, skipping cache:`,
err
);
cacheKey = null;
}
} else {
cacheKey = md5(source);
cacheKey = sourceHash;
}

if (cacheKey) {
cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
assetsCacheDir = path.join(CACHE_DIR, cacheKey);

Expand Down Expand Up @@ -711,7 +781,7 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
},
};

if (assetsCacheDir && cacheFile && !skipCache) {
if (assetsCacheDir && cacheFile && cacheKey) {
try {
await cp(assetsCacheDir, outdir, {recursive: true});
} catch (e) {
Expand Down
Loading