Skip to content

Commit ce04c00

Browse files
authored
Merge pull request #23 from BadlyDrawnBoy/codex/implement-write-strategy-for-diskcache
Add locked atomic writes for disk OCR cache
2 parents 483e280 + e9cf9a1 commit ce04c00

File tree

2 files changed: +204 −12 lines changed

src/utils/diskCache.ts

Lines changed: 82 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import type { OcrDiskCache, OcrImageResult, OcrPageResult } from '../types/cache
1111
import { createLogger } from './logger.js';
1212

1313
// Logger created with the 'DiskCache' label; the functions in this file report through it.
const logger = createLogger('DiskCache');
14+
/** Delay, in milliseconds, between attempts to create the lock file while it already exists. */
const LOCK_RETRY_MS = 25;
/** How long, in milliseconds, lock acquisition keeps retrying before it gives up with an error. */
const LOCK_TIMEOUT_MS = 5_000;
1416

1517
/**
1618
* Generate cache file path from PDF path
@@ -57,34 +59,102 @@ export const loadOcrCache = (pdfPath: string): OcrDiskCache | null => {
5759
}
5860
};
5961

62+
/**
 * Block the current thread for roughly `ms` milliseconds.
 *
 * Implemented with `Atomics.wait` on a private buffer that nobody ever
 * notifies, so the wait always ends through its timeout.
 *
 * @param ms - Delay in milliseconds. Non-finite or non-positive values return
 *   immediately: `Atomics.wait` interprets a `NaN` timeout as "wait forever",
 *   which would hang the process because no other agent can wake this buffer.
 */
const sleepSync = (ms: number): void => {
  if (!Number.isFinite(ms) || ms <= 0) {
    return;
  }

  const cell = new Int32Array(new SharedArrayBuffer(4));
  Atomics.wait(cell, 0, 0, ms);
};
66+
67+
/**
 * Acquire an exclusive lock by creating `lockPath` with the `wx` flag, which
 * fails with `EEXIST` while the file is present.
 *
 * While the lock file exists, the call blocks and retries every
 * `LOCK_RETRY_MS` milliseconds until `LOCK_TIMEOUT_MS` has elapsed.
 *
 * NOTE(review): a lock file left behind by a crashed process is never
 * reclaimed here, so later callers time out until it is removed manually.
 *
 * @param lockPath - Path of the lock file to create.
 * @returns The file descriptor of the newly created lock file.
 * @throws Error when the lock is still held after the timeout; any error
 *   other than `EEXIST` is rethrown unchanged.
 */
const acquireCacheLock = (lockPath: string): number => {
  // The lock file sits next to the cache file, whose directory may not exist
  // on the first save; without it openSync would fail with ENOENT.
  fs.mkdirSync(path.dirname(lockPath), { recursive: true });

  const start = Date.now();

  for (;;) {
    try {
      return fs.openSync(lockPath, 'wx');
    } catch (error: unknown) {
      const code = error instanceof Error ? (error as NodeJS.ErrnoException).code : undefined;

      if (code !== 'EEXIST') {
        throw error;
      }

      if (Date.now() - start > LOCK_TIMEOUT_MS) {
        throw new Error(`Timed out waiting for cache lock at ${lockPath}`);
      }

      sleepSync(LOCK_RETRY_MS);
    }
  }
};
90+
91+
/**
 * Release a lock: close its descriptor and delete the lock file.
 *
 * @param lockPath - Path of the lock file to remove.
 * @param fd - Descriptor of the open lock file.
 * @throws Whatever `closeSync` or `rmSync` throws; the lock file is removed
 *   even when closing the descriptor fails.
 */
const releaseCacheLock = (lockPath: string, fd: number): void => {
  try {
    fs.closeSync(fd);
  } finally {
    // Always remove the file: a leftover lock would block every later writer.
    fs.rmSync(lockPath, { force: true });
  }
};
95+
96+
/**
 * Write `cache` to `cachePath` as pretty-printed JSON.
 *
 * The JSON is first written to a temporary sibling file and then renamed over
 * `cachePath`, so the final path is only ever replaced by a fully written file.
 *
 * @param cachePath - Destination path; its directory is created if missing.
 * @param cache - Cache to serialise. Its `updated_at` is set to the current
 *   time (the object is mutated).
 * @throws Whatever the underlying `fs` call throws; the temporary file is
 *   removed before the error is rethrown.
 */
const writeCacheFile = (cachePath: string, cache: OcrDiskCache): void => {
  cache.updated_at = new Date().toISOString();

  // A recursive mkdir succeeds when the directory already exists, so no prior check is needed.
  fs.mkdirSync(path.dirname(cachePath), { recursive: true });

  const tempPath = `${cachePath}.${process.pid}.${Date.now()}.tmp`;

  try {
    fs.writeFileSync(tempPath, JSON.stringify(cache, null, 2), 'utf-8');
    fs.renameSync(tempPath, cachePath);
  } catch (error: unknown) {
    try {
      fs.rmSync(tempPath, { force: true });
    } catch {
      // Best-effort cleanup: the write/rename failure is the error worth reporting.
    }
    throw error;
  }
};
109+
110+
/**
 * Combine the cache currently on disk with the cache about to be saved.
 *
 * - Same fingerprint: fields of `incoming` override those of `existing`, the
 *   `pages` and `images` maps are merged key by key (incoming entries win),
 *   and the original `created_at` is kept.
 * - No existing cache, or a different fingerprint: `incoming` is used on its
 *   own; none of the existing pages or images are carried over.
 *
 * `updated_at` is always set to the current time. Neither argument is mutated.
 *
 * @param existing - Cache loaded from disk, or `null` when there is none.
 * @param incoming - Cache passed to the save call.
 * @returns A new cache object to write to disk.
 */
const mergeCaches = (existing: OcrDiskCache | null, incoming: OcrDiskCache): OcrDiskCache => {
  const now = new Date().toISOString();

  if (existing && existing.fingerprint === incoming.fingerprint) {
    return {
      ...existing,
      ...incoming,
      created_at: existing.created_at,
      updated_at: now,
      pages: { ...existing.pages, ...incoming.pages },
      images: { ...existing.images, ...incoming.images },
    } satisfies OcrDiskCache;
  }

  return {
    ...incoming,
    created_at: incoming.created_at ?? existing?.created_at ?? now,
    updated_at: now,
    pages: incoming.pages ?? {},
    images: incoming.images ?? {},
  } satisfies OcrDiskCache;
};
132+
60133
/**
 * Save OCR cache to disk.
 *
 * The write is serialised through a `<cachePath>.lock` file. While the lock is
 * held, the cache currently on disk is re-read and merged with `cache` before
 * the result is written, so entries stored by another writer under the same
 * fingerprint are not lost.
 *
 * @param pdfPath - PDF path from which the cache file path is derived.
 * @param cache - Cache entries to persist.
 * @throws Error with a `Failed to save OCR cache: …` message when locking,
 *   loading, merging, writing or unlocking fails; the failure is also logged.
 */
export const saveOcrCache = (pdfPath: string, cache: OcrDiskCache): void => {
  const cachePath = getCacheFilePath(pdfPath);
  const lockPath = `${cachePath}.lock`;

  try {
    // Acquire inside the try so a lock timeout is logged and wrapped like any other save failure.
    const lockFd = acquireCacheLock(lockPath);

    try {
      const latest = loadOcrCache(pdfPath);
      const merged = mergeCaches(latest, cache);

      writeCacheFile(cachePath, merged);

      logger.debug('Saved OCR cache to disk', {
        cachePath,
        pageCount: Object.keys(merged.pages).length,
        imageCount: Object.keys(merged.images).length,
      });
    } finally {
      // Only reached once the lock is held, so there is always something to release.
      releaseCacheLock(lockPath, lockFd);
    }
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error('Failed to save OCR cache', { cachePath, error: message });
    throw new Error(`Failed to save OCR cache: ${message}`);
  }
};
90160

test/utils/diskCache.test.ts

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import fs from 'node:fs';
2+
import os from 'node:os';
3+
import path from 'node:path';
4+
import { afterEach, describe, expect, it } from 'vitest';
5+
import type { OcrDiskCache } from '../../src/types/cache.js';
6+
import { loadOcrCache, saveOcrCache } from '../../src/utils/diskCache.js';
7+
8+
/** Fingerprint shared by every cache built in this file. */
const fingerprint = 'disk-cache-fingerprint';
/** OCR provider name stored in every cache built in this file. */
const provider = 'test-provider';

/**
 * Build an empty cache (no pages, no images) for the given PDF path.
 *
 * @param pdfPath - Value stored in the cache's `pdf_path` field.
 * @returns A fresh cache whose `created_at` and `updated_at` hold the same timestamp.
 */
const createBaseCache = (pdfPath: string): OcrDiskCache => {
  // One timestamp for both fields; two separate `new Date()` calls could differ by a millisecond.
  const timestamp = new Date().toISOString();

  return {
    fingerprint,
    pdf_path: pdfPath,
    created_at: timestamp,
    updated_at: timestamp,
    ocr_provider: provider,
    pages: {},
    images: {},
  };
};
20+
21+
/**
 * Run `fn` in a zero-delay timer callback and report its outcome as a promise.
 *
 * @param fn - Synchronous work to run on a later turn of the event loop.
 * @returns A promise that resolves once `fn` has returned and rejects with
 *   whatever `fn` throws.
 */
const scheduleWrite = (fn: () => void): Promise<void> =>
  new Promise<void>((resolve, reject) => {
    setTimeout(() => {
      try {
        fn();
        resolve();
      } catch (error: unknown) {
        // Without this the promise would never settle and the throw would
        // surface as an uncaught exception outside the awaiting test.
        reject(error);
      }
    }, 0);
  });
28+
29+
/** Temporary directories created by the tests; each one is deleted after the test that made it. */
const createdDirs: string[] = [];

afterEach(() => {
  // splice(0) empties the list before any deletion runs, so a failing rmSync
  // cannot leave stale entries behind for the next test's cleanup.
  for (const dir of createdDirs.splice(0)) {
    fs.rmSync(dir, { recursive: true, force: true });
  }
});
38+
39+
describe('disk cache writes', () => {
  /** Create a temporary directory, register it for cleanup, and return a PDF path inside it. */
  const createTempPdfPath = (): string => {
    const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'disk-cache-'));
    createdDirs.push(tempDir);

    return path.join(tempDir, 'test.pdf');
  };

  /** Build a cache entry carrying `text`, stamped with the current time. */
  const createEntry = (text: string) => ({
    text,
    provider_hash: 'provider-hash',
    cached_at: new Date().toISOString(),
  });

  // NOTE(review): saveOcrCache is synchronous, so the two scheduled writes in
  // each test run back to back in separate timer callbacks. These tests pin
  // the merge-on-save behaviour; they do not create real lock contention.

  it('merges parallel page writes so no entries are lost', async () => {
    const pdfPath = createTempPdfPath();
    const baseCache = createBaseCache(pdfPath);

    const cacheA: OcrDiskCache = {
      ...baseCache,
      pages: { '1': createEntry('page-one') },
    };

    const cacheB: OcrDiskCache = {
      ...baseCache,
      pages: { '2': createEntry('page-two') },
    };

    await Promise.all([
      scheduleWrite(() => saveOcrCache(pdfPath, cacheA)),
      scheduleWrite(() => saveOcrCache(pdfPath, cacheB)),
    ]);

    const cache = loadOcrCache(pdfPath);

    expect(cache?.pages['1']?.text).toBe('page-one');
    expect(cache?.pages['2']?.text).toBe('page-two');
    expect(Object.keys(cache?.pages ?? {})).toHaveLength(2);
  });

  it('merges page and image entries from overlapping writes', async () => {
    const pdfPath = createTempPdfPath();
    const baseCache = createBaseCache(pdfPath);

    const pageCache: OcrDiskCache = {
      ...baseCache,
      pages: { '3': createEntry('page-three') },
    };

    const imageCache: OcrDiskCache = {
      ...baseCache,
      images: { '3/0': createEntry('image-text') },
    };

    await Promise.all([
      scheduleWrite(() => saveOcrCache(pdfPath, pageCache)),
      scheduleWrite(() => saveOcrCache(pdfPath, imageCache)),
    ]);

    const cache = loadOcrCache(pdfPath);

    expect(cache?.pages['3']?.text).toBe('page-three');
    expect(cache?.images['3/0']?.text).toBe('image-text');
    expect(Object.keys(cache?.pages ?? {})).toHaveLength(1);
    expect(Object.keys(cache?.images ?? {})).toHaveLength(1);
  });
});

0 commit comments

Comments
 (0)