Skip to content

Commit 5f5864c

Browse files
luandroclaude
andauthored
Add path normalization for consistent cache comparison (#112)
* fix(cache): add path normalization for consistent cache comparison This addresses issue #95 where content might not be updated correctly during incremental fetches due to path comparison inconsistencies. Changes: - Add normalizePath() helper that resolves all paths to absolute paths relative to PROJECT_ROOT for consistent comparison - Update hasMissingOutputs() to use normalizePath when checking file existence - Update updatePageInCache() to normalize paths before storage - Fix hasMissingOutputs() to return true for empty outputPaths arrays, ensuring pages with no outputs get regenerated - Add comprehensive test coverage in pathNormalization.test.ts - Update existing tests to expect normalized absolute paths * fix(cache): normalize existing paths during merge for proper deduplication When merging existing paths with new paths in updatePageInCache, existing paths from older caches (stored in non-normalized format) were being added without normalization. This caused duplicate entries when the same file was represented in different formats. Changes: - Normalize existing paths in updatePageInCache before merging - Add test case verifying old-format paths are properly deduplicated - Comment clarifies this handles migration from older cache formats * feat(cache): add PROJECT_ROOT import and integration tests for path handling Phase 5-6 of issue #95 fix: - Import PROJECT_ROOT in generateBlocks.ts for consistent path resolution - Replace process.cwd() with PROJECT_ROOT for S3 URL path detection - Add integration tests for path normalization edge cases: - Mixed path formats in cache (migration scenario) - Missing outputs detection regardless of format - Path deduplication across updates - Multi-language path consistency * docs(cache): update outputPaths comment to reflect absolute path storage The interface comment incorrectly stated paths were "relative to project root" but they are now stored as absolute paths for consistency. This documentation fix aligns the comment with the actual implementation. * fix(cache): normalize filePath before comparing with cached paths Ensures consistent path comparison in the needsProcessing check by normalizing filePath before comparing with cached outputPaths. This prevents edge cases where path format differences could cause incorrect skip/process decisions. * fix(cache): improve normalizePath to handle genuine system paths The normalizePath function now correctly distinguishes between: 1. Project-relative paths like /docs/intro.md (where /docs doesn't exist) 2. Genuine system paths like /tmp/foo or /etc/config (where parent exists) Uses filesystem check (fs.existsSync on parent directory) to determine if an absolute path outside PROJECT_ROOT should be preserved as-is or treated as project-relative. Fixes external feedback about /tmp/foo being incorrectly rewritten to PROJECT_ROOT/tmp/foo. * fix(cache): handle root-level paths correctly in normalizePath Paths like /a where the parent is "/" should be treated as project-relative, not system paths. The root directory always exists, so we can't use its existence as an indicator of a genuine system path. Added check: if parent directory IS the root (path.parse(parentDir).root === parentDir), don't trust fs.existsSync and treat as project-relative instead. Added tests for: - /a (single directory at root level) - /foo/bar/baz.txt (nested paths where only root exists) --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent 264e2dd commit 5f5864c

File tree

5 files changed

+955
-25
lines changed

5 files changed

+955
-25
lines changed

scripts/notion-fetch/__tests__/incrementalSync.test.ts

Lines changed: 162 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { describe, it, expect, vi } from "vitest";
22
import fs from "node:fs";
3+
import path from "node:path";
34
import { computeScriptHash, CRITICAL_SCRIPT_FILES } from "../scriptHasher";
45
import {
56
loadPageMetadataCache,
@@ -11,6 +12,7 @@ import {
1112
updatePageInCache,
1213
CACHE_VERSION,
1314
PAGE_METADATA_CACHE_PATH,
15+
PROJECT_ROOT,
1416
type PageMetadataCache,
1517
} from "../pageMetadataCache";
1618

@@ -94,7 +96,10 @@ describe("Incremental Sync Integration", () => {
9496

9597
expect(Object.keys(cache.pages)).toHaveLength(2);
9698
expect(cache.pages["page-1"].lastEdited).toBe("2024-01-01T00:00:00.000Z");
97-
expect(cache.pages["page-2"].outputPaths).toEqual(["/docs/page-2.md"]);
99+
// Paths are normalized to absolute paths
100+
expect(cache.pages["page-2"].outputPaths).toEqual([
101+
path.join(PROJECT_ROOT, "docs/page-2.md"),
102+
]);
98103
});
99104

100105
it("should handle version migration correctly", () => {
@@ -226,6 +231,162 @@ describe("Incremental Sync Integration", () => {
226231
});
227232
});
228233

234+
describe("Path normalization integration", () => {
235+
it("should handle cache with mixed path formats during filtering", () => {
236+
// Simulate a cache with paths in different formats (migration scenario)
237+
const cache: PageMetadataCache = {
238+
version: CACHE_VERSION,
239+
scriptHash: "test-hash",
240+
lastSync: "2024-01-01T00:00:00.000Z",
241+
pages: {
242+
"page-1": {
243+
lastEdited: "2024-01-01T00:00:00.000Z",
244+
outputPaths: ["docs/page-1.md"], // Old format: relative without leading slash
245+
processedAt: "2024-01-01T00:00:00.000Z",
246+
},
247+
"page-2": {
248+
lastEdited: "2024-01-01T00:00:00.000Z",
249+
outputPaths: ["/docs/page-2.md"], // Old format: with leading slash
250+
processedAt: "2024-01-01T00:00:00.000Z",
251+
},
252+
"page-3": {
253+
lastEdited: "2024-01-01T00:00:00.000Z",
254+
outputPaths: [path.join(PROJECT_ROOT, "docs/page-3.md")], // New format: absolute
255+
processedAt: "2024-01-01T00:00:00.000Z",
256+
},
257+
},
258+
};
259+
260+
// Mock that all output files exist
261+
vi.spyOn(fs, "existsSync").mockReturnValue(true);
262+
263+
const pages = [
264+
{ id: "page-1", last_edited_time: "2024-01-01T00:00:00.000Z" },
265+
{ id: "page-2", last_edited_time: "2024-01-01T00:00:00.000Z" },
266+
{ id: "page-3", last_edited_time: "2024-01-01T00:00:00.000Z" },
267+
];
268+
269+
// All paths should resolve correctly regardless of format
270+
const changedPages = filterChangedPages(pages, cache);
271+
272+
// No pages should be marked as changed (all unchanged, all outputs exist)
273+
expect(changedPages).toHaveLength(0);
274+
275+
vi.restoreAllMocks();
276+
});
277+
278+
it("should detect missing outputs regardless of path format", () => {
279+
const cache: PageMetadataCache = {
280+
version: CACHE_VERSION,
281+
scriptHash: "test-hash",
282+
lastSync: "2024-01-01T00:00:00.000Z",
283+
pages: {
284+
"page-1": {
285+
lastEdited: "2024-01-01T00:00:00.000Z",
286+
outputPaths: ["docs/page-1.md"], // Old format
287+
processedAt: "2024-01-01T00:00:00.000Z",
288+
},
289+
},
290+
};
291+
292+
// Mock that the file does NOT exist
293+
vi.spyOn(fs, "existsSync").mockReturnValue(false);
294+
295+
const pages = [
296+
{ id: "page-1", last_edited_time: "2024-01-01T00:00:00.000Z" },
297+
];
298+
299+
// Page should be marked as changed because output is missing
300+
const changedPages = filterChangedPages(pages, cache);
301+
expect(changedPages).toHaveLength(1);
302+
expect(changedPages[0].id).toBe("page-1");
303+
304+
vi.restoreAllMocks();
305+
});
306+
307+
it("should normalize paths when updating cache after processing", () => {
308+
const cache = createEmptyCache("test-hash");
309+
310+
// Simulate processing with different path formats
311+
updatePageInCache(
312+
cache,
313+
"page-1",
314+
"2024-01-01T00:00:00.000Z",
315+
["docs/intro.md"], // No leading slash
316+
false
317+
);
318+
319+
updatePageInCache(
320+
cache,
321+
"page-1",
322+
"2024-01-01T00:00:00.000Z",
323+
["/docs/intro.md"], // With leading slash
324+
false
325+
);
326+
327+
// Should be deduplicated to single normalized path
328+
expect(cache.pages["page-1"].outputPaths).toHaveLength(1);
329+
expect(cache.pages["page-1"].outputPaths[0]).toBe(
330+
path.join(PROJECT_ROOT, "docs/intro.md")
331+
);
332+
});
333+
334+
it("should maintain consistency across multiple updates with different formats", () => {
335+
const cache = createEmptyCache("test-hash");
336+
337+
// First update: English version
338+
updatePageInCache(
339+
cache,
340+
"page-1",
341+
"2024-01-01T00:00:00.000Z",
342+
["/docs/getting-started.md"],
343+
false
344+
);
345+
346+
// Second update: Portuguese version with different format
347+
updatePageInCache(
348+
cache,
349+
"page-1",
350+
"2024-01-01T00:00:00.000Z",
351+
["i18n/pt/docusaurus-plugin-content-docs/current/getting-started.md"],
352+
false
353+
);
354+
355+
// Third update: Spanish version with absolute path
356+
updatePageInCache(
357+
cache,
358+
"page-1",
359+
"2024-01-01T00:00:00.000Z",
360+
[
361+
path.join(
362+
PROJECT_ROOT,
363+
"i18n/es/docusaurus-plugin-content-docs/current/getting-started.md"
364+
),
365+
],
366+
false
367+
);
368+
369+
// All three paths should be stored, all normalized to absolute
370+
expect(cache.pages["page-1"].outputPaths).toHaveLength(3);
371+
372+
const expectedPaths = [
373+
path.join(PROJECT_ROOT, "docs/getting-started.md"),
374+
path.join(
375+
PROJECT_ROOT,
376+
"i18n/pt/docusaurus-plugin-content-docs/current/getting-started.md"
377+
),
378+
path.join(
379+
PROJECT_ROOT,
380+
"i18n/es/docusaurus-plugin-content-docs/current/getting-started.md"
381+
),
382+
];
383+
384+
for (const expectedPath of expectedPaths) {
385+
expect(cache.pages["page-1"].outputPaths).toContain(expectedPath);
386+
}
387+
});
388+
});
389+
229390
describe("Script hash stability", () => {
230391
it("should produce consistent hash for same files", async () => {
231392
const hash1 = await computeScriptHash();

scripts/notion-fetch/__tests__/pageMetadataCache.test.ts

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { describe, it, expect, vi, beforeEach } from "vitest";
22
import fs from "node:fs";
3+
import path from "node:path";
34

45
const {
56
existsSyncMock,
@@ -27,6 +28,7 @@ import {
2728
hasMissingOutputs,
2829
CACHE_VERSION,
2930
PAGE_METADATA_CACHE_PATH,
31+
PROJECT_ROOT,
3032
type PageMetadataCache,
3133
} from "../pageMetadataCache";
3234

@@ -228,6 +230,9 @@ describe("pageMetadataCache", () => {
228230
});
229231

230232
it("should filter unchanged pages", () => {
233+
// Mock files exist
234+
existsSyncMock.mockReturnValue(true);
235+
231236
const pages = [
232237
{ id: "1", last_edited_time: "2024-01-01T00:00:00.000Z" },
233238
{ id: "2", last_edited_time: "2024-01-03T00:00:00.000Z" },
@@ -239,12 +244,12 @@ describe("pageMetadataCache", () => {
239244
pages: {
240245
"1": {
241246
lastEdited: "2024-01-01T00:00:00.000Z",
242-
outputPaths: [],
247+
outputPaths: [path.join(PROJECT_ROOT, "docs/page-1.md")],
243248
processedAt: "",
244249
},
245250
"2": {
246251
lastEdited: "2024-01-02T00:00:00.000Z",
247-
outputPaths: [],
252+
outputPaths: [path.join(PROJECT_ROOT, "docs/page-2.md")],
248253
processedAt: "",
249254
},
250255
},
@@ -258,6 +263,9 @@ describe("pageMetadataCache", () => {
258263
});
259264

260265
it("should include new pages not in cache", () => {
266+
// Mock files exist
267+
existsSyncMock.mockReturnValue(true);
268+
261269
const pages = [
262270
{ id: "1", last_edited_time: "2024-01-01" },
263271
{ id: "new", last_edited_time: "2024-01-01" },
@@ -267,7 +275,11 @@ describe("pageMetadataCache", () => {
267275
scriptHash: "hash",
268276
lastSync: "2024-01-02",
269277
pages: {
270-
"1": { lastEdited: "2024-01-01", outputPaths: [], processedAt: "" },
278+
"1": {
279+
lastEdited: "2024-01-01",
280+
outputPaths: [path.join(PROJECT_ROOT, "docs/page-1.md")],
281+
processedAt: "",
282+
},
271283
},
272284
};
273285

@@ -325,6 +337,75 @@ describe("pageMetadataCache", () => {
325337

326338
expect(hasMissingOutputs(cache, "1")).toBe(true);
327339
});
340+
341+
it("returns false when page is not in cache", () => {
342+
const cache: PageMetadataCache = {
343+
version: CACHE_VERSION,
344+
scriptHash: "hash",
345+
lastSync: "2024-01-02",
346+
pages: {},
347+
};
348+
349+
expect(hasMissingOutputs(cache, "non-existent")).toBe(false);
350+
});
351+
352+
it("returns false when cache is null", () => {
353+
expect(hasMissingOutputs(null, "page-1")).toBe(false);
354+
});
355+
356+
it("returns false when outputPaths is undefined", () => {
357+
const cache: PageMetadataCache = {
358+
version: CACHE_VERSION,
359+
scriptHash: "hash",
360+
lastSync: "2024-01-02",
361+
pages: {
362+
"1": {
363+
lastEdited: "2024-01-01T00:00:00.000Z",
364+
outputPaths: undefined as unknown as string[],
365+
processedAt: "",
366+
},
367+
},
368+
};
369+
370+
expect(hasMissingOutputs(cache, "1")).toBe(false);
371+
});
372+
373+
it("returns true when outputPaths is empty array (Phase 3 fix)", () => {
374+
// Empty outputPaths means no files were written - treat as missing
375+
const cache: PageMetadataCache = {
376+
version: CACHE_VERSION,
377+
scriptHash: "hash",
378+
lastSync: "2024-01-02",
379+
pages: {
380+
"1": {
381+
lastEdited: "2024-01-01T00:00:00.000Z",
382+
outputPaths: [],
383+
processedAt: "",
384+
},
385+
},
386+
};
387+
388+
expect(hasMissingOutputs(cache, "1")).toBe(true);
389+
});
390+
391+
it("returns false when all output files exist", () => {
392+
const cache: PageMetadataCache = {
393+
version: CACHE_VERSION,
394+
scriptHash: "hash",
395+
lastSync: "2024-01-02",
396+
pages: {
397+
"1": {
398+
lastEdited: "2024-01-01T00:00:00.000Z",
399+
outputPaths: ["/docs/page-1.md", "/i18n/pt/docs/page-1.md"],
400+
processedAt: "",
401+
},
402+
},
403+
};
404+
405+
existsSyncMock.mockReturnValue(true);
406+
407+
expect(hasMissingOutputs(cache, "1")).toBe(false);
408+
});
328409
});
329410

330411
describe("findDeletedPages", () => {
@@ -379,24 +460,28 @@ describe("pageMetadataCache", () => {
379460

380461
expect(cache.pages["page-1"]).toBeDefined();
381462
expect(cache.pages["page-1"].lastEdited).toBe("2024-01-01");
382-
expect(cache.pages["page-1"].outputPaths).toEqual(["/docs/test.md"]);
463+
// Paths are normalized to absolute paths
464+
expect(cache.pages["page-1"].outputPaths).toEqual([
465+
path.join(PROJECT_ROOT, "docs/test.md"),
466+
]);
383467
});
384468

385469
it("should update existing page in cache", () => {
386470
const cache = createEmptyCache("hash");
471+
// Use normalized path for existing entry
472+
const oldPath = path.join(PROJECT_ROOT, "docs/old.md");
387473
cache.pages["page-1"] = {
388474
lastEdited: "2024-01-01",
389-
outputPaths: ["/docs/old.md"],
475+
outputPaths: [oldPath],
390476
processedAt: "2024-01-01",
391477
};
392478

393479
updatePageInCache(cache, "page-1", "2024-01-02", ["/docs/new.md"], false);
394480

395481
expect(cache.pages["page-1"].lastEdited).toBe("2024-01-02");
396-
expect(cache.pages["page-1"].outputPaths.sort()).toEqual([
397-
"/docs/new.md",
398-
"/docs/old.md",
399-
]);
482+
expect(cache.pages["page-1"].outputPaths.sort()).toEqual(
483+
[path.join(PROJECT_ROOT, "docs/new.md"), oldPath].sort()
484+
);
400485
});
401486

402487
it("should merge and deduplicate output paths across languages", () => {
@@ -417,11 +502,14 @@ describe("pageMetadataCache", () => {
417502
false
418503
);
419504

420-
expect(cache.pages["page-1"].outputPaths.sort()).toEqual([
421-
"/docs/fr/page-1.md",
422-
"/docs/page-1.md",
423-
"/docs/page-2.md",
424-
]);
505+
// Paths are normalized to absolute paths
506+
expect(cache.pages["page-1"].outputPaths.sort()).toEqual(
507+
[
508+
path.join(PROJECT_ROOT, "docs/fr/page-1.md"),
509+
path.join(PROJECT_ROOT, "docs/page-1.md"),
510+
path.join(PROJECT_ROOT, "docs/page-2.md"),
511+
].sort()
512+
);
425513
});
426514
});
427515

0 commit comments

Comments
 (0)