Skip to content

Commit 1821766

Browse files
Martinclaude
andcommitted
perf(cache): add P2 medium-priority performance and robustness improvements
Address five P2 (medium priority) issues from code review: **P2-1: Stale Lock Detection** - Add PID and timestamp to lock files - Detect and remove stale locks (dead process or >60s old) - Prevents permanent lockouts from crashed processes - Files: src/utils/diskCache.ts **P2-2: Configurable Cache Directory** - Support PDF_READER_CACHE_DIR environment variable - Uses MD5 hash suffix to avoid filename collisions - Falls back to PDF directory (current behavior) if not set - Useful for read-only filesystems and centralized caching - Files: src/utils/diskCache.ts **P2-3: Decision Cache Race Fix** - Remove problematic delete-before-set pattern - Rely on Map.set() overwrite semantics (preserves insertion order) - Use while loop for consistent eviction behavior - Files: src/handlers/pdfOcr.ts **P2-4: Batched Image Extraction** - Process images in batches of 6 to prevent memory pressure - Replaces unbounded Promise.all with batched processing - Maintains parallelism within batches for performance - Files: src/pdf/extractor.ts **P2-5: Smart OCR Optimization** - Skip text extraction when cached decision says OCR needed - Avoids redundant work on subsequent smart_ocr calls - Improves performance for scanned documents - Files: src/handlers/pdfOcr.ts All improvements maintain backward compatibility. No breaking changes. Tests: 144 passing. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent 90b55ca commit 1821766

File tree

3 files changed

+98
-28
lines changed

3 files changed

+98
-28
lines changed

src/handlers/pdfOcr.ts

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,18 @@ const getCachedDecision = (fingerprint: string, page: number): OcrDecision | und
4040

4141
const setCachedDecision = (fingerprint: string, page: number, decision: OcrDecision): void => {
4242
const key = buildDecisionCacheKey(fingerprint, page);
43-
if (decisionCache.has(key)) {
44-
decisionCache.delete(key);
45-
}
43+
44+
// Just set - Map.set() overwrites existing entries without changing insertion order
45+
// for existing keys (ES2015+ Map preserves insertion order)
4646
decisionCache.set(key, decision);
47-
if (decisionCache.size > DECISION_CACHE_MAX_ENTRIES) {
47+
48+
// Evict oldest entries if over limit
49+
while (decisionCache.size > DECISION_CACHE_MAX_ENTRIES) {
4850
const oldestKey = decisionCache.keys().next().value;
4951
if (oldestKey) {
5052
decisionCache.delete(oldestKey);
53+
} else {
54+
break;
5155
}
5256
}
5357
};
@@ -231,18 +235,20 @@ const handleSmartOcrDecision = async (
231235
if (!smartOcr) return undefined;
232236

233237
const cached = getCachedDecision(fingerprint, page);
234-
let decision = cached;
235-
let pageText = '';
236238

239+
// If cached decision says OCR is needed, skip text extraction entirely
240+
if (cached?.needsOcr) {
241+
return undefined; // Proceed to OCR
242+
}
243+
244+
// Need to extract text for decision or result
245+
const pdfPage = await pdfDocument.getPage(page);
246+
const pageText = await extractTextFromPage(pdfPage);
247+
248+
let decision = cached;
237249
if (!decision) {
238-
const pdfPage = await pdfDocument.getPage(page);
239-
pageText = await extractTextFromPage(pdfPage);
240250
decision = await decideNeedsOcr(pdfPage, pageText);
241251
setCachedDecision(fingerprint, page, decision);
242-
} else {
243-
// Decision was cached, extract text now
244-
const pdfPage = await pdfDocument.getPage(page);
245-
pageText = await extractTextFromPage(pdfPage);
246252
}
247253

248254
if (!decision.needsOcr) {

src/pdf/extractor.ts

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -320,23 +320,29 @@ const extractImagesFromPage = async (
320320
}
321321
}
322322

323-
// Extract each image using shared helper functions
324-
const imagePromises = imageIndices.map(async (imgIndex, arrayIndex) => {
325-
const argsArray = operatorList.argsArray[imgIndex];
326-
if (!argsArray || argsArray.length === 0) {
327-
return null;
328-
}
323+
// Extract images in batches to avoid memory pressure
324+
const BATCH_SIZE = 6;
325+
for (let batchStart = 0; batchStart < imageIndices.length; batchStart += BATCH_SIZE) {
326+
const batch = imageIndices.slice(batchStart, batchStart + BATCH_SIZE);
329327

330-
const imageName = argsArray[0] as string;
331-
const imageResult = await retrieveImageData(page, imageName, pageNum);
332-
if (imageResult.warning) {
333-
warnings.push(imageResult.warning);
334-
}
335-
return processImageData(imageResult.data, pageNum, arrayIndex);
336-
});
328+
const batchPromises = batch.map(async (imgIndex, localIdx) => {
329+
const globalIndex = batchStart + localIdx;
330+
const argsArray = operatorList.argsArray[imgIndex];
331+
if (!argsArray || argsArray.length === 0) {
332+
return null;
333+
}
337334

338-
const resolvedImages = await Promise.all(imagePromises);
339-
images.push(...resolvedImages.filter((img): img is ExtractedImage => img !== null));
335+
const imageName = argsArray[0] as string;
336+
const imageResult = await retrieveImageData(page, imageName, pageNum);
337+
if (imageResult.warning) {
338+
warnings.push(imageResult.warning);
339+
}
340+
return processImageData(imageResult.data, pageNum, globalIndex);
341+
});
342+
343+
const batchResults = await Promise.all(batchPromises);
344+
images.push(...batchResults.filter((img): img is ExtractedImage => img !== null));
345+
}
340346
} catch (error: unknown) {
341347
const message = error instanceof Error ? error.message : String(error);
342348
logger.warn('Error extracting images from page', { pageNum, error: message });

src/utils/diskCache.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* Format: {pdf_basename}_ocr.json
66
*/
77

8+
import crypto from 'node:crypto';
89
import fs from 'node:fs';
910
import fsPromises from 'node:fs/promises';
1011
import path from 'node:path';
@@ -15,13 +16,30 @@ const logger = createLogger('DiskCache');
1516
const LOCK_RETRY_MS = 25;
1617
const LOCK_TIMEOUT_MS = 5_000;
1718

19+
/**
20+
* Get cache directory from environment variable or default to PDF directory
21+
*/
22+
const getCacheDirectory = (): string | null => {
23+
return process.env.PDF_READER_CACHE_DIR ?? null;
24+
};
25+
1826
/**
1927
* Generate cache file path from PDF path
2028
* Example: /path/to/document.pdf → /path/to/document_ocr.json
29+
* If PDF_READER_CACHE_DIR is set, uses that directory with hash to avoid collisions
2130
*/
2231
export const getCacheFilePath = (pdfPath: string): string => {
23-
const dir = path.dirname(pdfPath);
32+
const cacheDir = getCacheDirectory();
2433
const basename = path.basename(pdfPath, path.extname(pdfPath));
34+
35+
if (cacheDir) {
36+
// Use configured cache directory with hash to avoid collisions
37+
const hash = crypto.createHash('md5').update(pdfPath).digest('hex').slice(0, 8);
38+
return path.join(cacheDir, `${basename}_${hash}_ocr.json`);
39+
}
40+
41+
// Default: alongside the PDF
42+
const dir = path.dirname(pdfPath);
2543
return path.join(dir, `${basename}_ocr.json`);
2644
};
2745

@@ -64,18 +82,58 @@ const sleep = (ms: number): Promise<void> => {
6482
return new Promise((resolve) => setTimeout(resolve, ms));
6583
};
6684

85+
const LOCK_STALE_MS = 60_000; // Consider lock stale after 60 seconds
86+
87+
/**
88+
* Check if a lock file is stale (process died or lock too old)
89+
*/
90+
const isLockStale = async (lockPath: string): Promise<boolean> => {
91+
try {
92+
const content = await fsPromises.readFile(lockPath, 'utf-8');
93+
const { pid, timestamp } = JSON.parse(content) as { pid: number; timestamp: number };
94+
95+
// Check if process is still running
96+
try {
97+
process.kill(pid, 0); // Signal 0 checks existence without killing
98+
} catch {
99+
logger.debug('Lock process no longer exists', { pid, lockPath });
100+
return true; // Process doesn't exist, lock is stale
101+
}
102+
103+
// Check if lock is too old
104+
if (Date.now() - timestamp > LOCK_STALE_MS) {
105+
logger.debug('Lock exceeded max age', { pid, age: Date.now() - timestamp, lockPath });
106+
return true;
107+
}
108+
109+
return false;
110+
} catch {
111+
// Can't read lock file, consider it stale
112+
return true;
113+
}
114+
};
115+
67116
const acquireCacheLock = async (lockPath: string): Promise<fsPromises.FileHandle | null> => {
68117
const start = Date.now();
69118

70119
// eslint-disable-next-line no-constant-condition
71120
while (true) {
72121
try {
73122
const handle = await fsPromises.open(lockPath, 'wx');
123+
// Write our PID and timestamp to the lock file
124+
await handle.write(JSON.stringify({ pid: process.pid, timestamp: Date.now() }));
74125
return handle;
75126
} catch (error: unknown) {
76127
const err = error as NodeJS.ErrnoException;
77128

78129
if (err.code === 'EEXIST') {
130+
// Check if lock is stale
131+
if (await isLockStale(lockPath)) {
132+
logger.warn('Removing stale cache lock', { lockPath });
133+
await fsPromises.rm(lockPath, { force: true });
134+
continue; // Retry acquisition
135+
}
136+
79137
if (Date.now() - start > LOCK_TIMEOUT_MS) {
80138
throw new Error(`Timed out waiting for cache lock at ${lockPath}`);
81139
}

0 commit comments

Comments
 (0)