Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions lefthook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@
#
pre-commit:
parallel: true
jobs:
- run: npx eslint {staged_files}
commands:
eslint:
glob: "*.{js,ts,jsx,tsx}"

- name: notion-fetch-tests
run: npx vitest --run scripts/notion-fetch/__tests__/
run: bunx eslint --cache --cache-location ./node_modules/.cache/eslint --max-warnings=0 --fix {staged_files}
stage_fixed: true

notion-fetch-tests:
glob: "scripts/notion-fetch/**/__tests__/**/*.{ts,js}"
run: bunx vitest --run scripts/notion-fetch/__tests__/
stage_fixed: true

# - name: rubocop
# glob: "*.rb"
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"private": true,
"scripts": {
"docusaurus": "docusaurus",
"prepare": "lefthook install",
"dev": "docusaurus start",
"dev:es": "docusaurus start --locale es",
"dev:pt": "docusaurus start --locale pt",
Expand Down
76 changes: 76 additions & 0 deletions scripts/notion-fetch/__tests__/cacheValidation.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import { describe, it, expect, vi, beforeEach } from "vitest";
import { containsExpiringUrls } from "../cacheLoaders";
import { isUrlExpiringSoon } from "../imageReplacer";

// Replace the real expiry checker with a controllable stub.
// cacheLoaders imports only isUrlExpiringSoon from this module,
// so no other exports need to be stubbed.
vi.mock("../imageReplacer", () => ({
  isUrlExpiringSoon: vi.fn(),
}));

describe("containsExpiringUrls", () => {
  beforeEach(() => {
    const expiryCheck = vi.mocked(isUrlExpiringSoon);
    expiryCheck.mockReset();
    // Treat exactly the sentinel string as an expiring URL.
    expiryCheck.mockImplementation((candidate) => candidate === "EXPIRING_URL");
  });

  it("should return false for null/undefined", () => {
    expect(containsExpiringUrls(null)).toBe(false);
    expect(containsExpiringUrls(undefined)).toBe(false);
  });

  it("should return true for simple expiring string", () => {
    expect(containsExpiringUrls("EXPIRING_URL")).toBe(true);
  });

  it("should return false for safe string", () => {
    expect(containsExpiringUrls("SAFE_URL")).toBe(false);
  });

  it("should find expiring URL in array", () => {
    expect(containsExpiringUrls(["SAFE", "EXPIRING_URL"])).toBe(true);
  });

  it("should find expiring URL in object", () => {
    expect(containsExpiringUrls({ key: "EXPIRING_URL" })).toBe(true);
  });

  it("should find expiring URL in nested structure", () => {
    const nested = {
      level1: {
        level2: [{ target: "EXPIRING_URL" }],
      },
    };
    expect(containsExpiringUrls(nested)).toBe(true);
  });

  it("should find expiring URL inside Map", () => {
    expect(containsExpiringUrls(new Map([["key", "EXPIRING_URL"]]))).toBe(true);
  });

  it("should find expiring URL inside Set", () => {
    expect(containsExpiringUrls(new Set(["SAFE", "EXPIRING_URL"]))).toBe(true);
  });

  it("should handle circular references safely", () => {
    const first: any = { val: "SAFE" };
    const second: any = { val: "SAFE", ref: first };
    first.ref = second;

    expect(containsExpiringUrls(first)).toBe(false);

    // If we introduce expiry in circular ref
    first.val = "EXPIRING_URL";
    expect(containsExpiringUrls(second)).toBe(true);
  });
});
99 changes: 99 additions & 0 deletions scripts/notion-fetch/__tests__/urlExpiration.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { describe, it, expect } from "vitest";
import { isUrlExpiringSoon } from "../imageReplacer";

describe("isUrlExpiringSoon", () => {
  const NOW = 1700000000000; // Fixed time for consistent testing
  const ONE_HOUR = 3600;

  // Zero-pads a date component to two digits.
  const pad = (n: number) => n.toString().padStart(2, "0");

  // Formats a Date as the AWS SigV4 UTC timestamp (YYYYMMDDTHHMMSSZ),
  // the shape expected in the X-Amz-Date query parameter.
  const toAmzDate = (date: Date) =>
    `${date.getUTCFullYear()}${pad(date.getUTCMonth() + 1)}${pad(
      date.getUTCDate()
    )}T${pad(date.getUTCHours())}${pad(date.getUTCMinutes())}${pad(
      date.getUTCSeconds()
    )}Z`;

  // Runs fn with Date.now() pinned to NOW, restoring it afterwards.
  const withMockedTime = (fn: () => void) => {
    const originalNow = Date.now;
    Date.now = () => NOW;
    try {
      fn();
    } finally {
      Date.now = originalNow;
    }
  };

  it("should return false for non-S3 URLs", () => {
    const url = "https://example.com/image.png";
    expect(isUrlExpiringSoon(url)).toBe(false);
  });

  it("should return false for S3 URLs without expiration params", () => {
    const url = "https://s3.us-west-2.amazonaws.com/bucket/image.png";
    expect(isUrlExpiringSoon(url)).toBe(false);
  });

  describe("X-Amz-Expires + X-Amz-Date", () => {
    it("should return true if expiring soon", () => {
      withMockedTime(() => {
        // Signed 58 minutes ago with a 1-hour (3600s) lifetime, so only
        // ~2 minutes (120s) remain — below the default 5-minute (300s)
        // threshold, which must be reported as "expiring soon".
        const amzDate = toAmzDate(new Date(NOW - 58 * 60 * 1000));
        const url = `https://s3.amazonaws.com/bucket/obj?X-Amz-Date=${amzDate}&X-Amz-Expires=${ONE_HOUR}`;

        expect(isUrlExpiringSoon(url)).toBe(true);
      });
    });

    it("should return false if plenty of time left", () => {
      withMockedTime(() => {
        // Freshly signed with a 1-hour lifetime: 60 minutes remain,
        // comfortably above the 5-minute threshold.
        const amzDate = toAmzDate(new Date(NOW));
        const url = `https://s3.amazonaws.com/bucket/obj?X-Amz-Date=${amzDate}&X-Amz-Expires=${ONE_HOUR}`;

        expect(isUrlExpiringSoon(url)).toBe(false);
      });
    });
  });

  describe("Expires (Unix Timestamp)", () => {
    it("should return true if expiring soon", () => {
      withMockedTime(() => {
        // Expires in 2 mins
        const expires = Math.floor(NOW / 1000) + 120;
        // Use a URL structure that matches SECURE_NOTION_STATIC_S3_REGEX
        const url = `https://s3.us-west-2.amazonaws.com/secure.notion-static.com/obj?Expires=${expires}`;
        expect(isUrlExpiringSoon(url)).toBe(true);
      });
    });

    it("should return false if plenty of time left", () => {
      withMockedTime(() => {
        // Expires in 1 hour
        const expires = Math.floor(NOW / 1000) + 3600;
        const url = `https://s3.us-west-2.amazonaws.com/secure.notion-static.com/obj?Expires=${expires}`;
        expect(isUrlExpiringSoon(url)).toBe(false);
      });
    });
  });
});
144 changes: 140 additions & 4 deletions scripts/notion-fetch/cacheLoaders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import { buildCacheKey } from "./cacheStrategies";
import { fetchNotionBlocks } from "../fetchNotionData";
import { n2m } from "../notionClient";

import { isUrlExpiringSoon } from "./imageReplacer";

/**
* Logs progress for data fetching operations
* Throttles output to reduce noise: logs at index 0, last item, and every 10th item
Expand Down Expand Up @@ -54,16 +56,87 @@ export interface CacheLoaderConfig<T> {
fetchFn: (pageId: string) => Promise<T>;
/** Normalizes fetched data to expected type */
normalizeResult: (result: any) => T;
/** Optional validator for fetched data. Returns true if valid. */
validateResult?: (result: T) => boolean;
/** Prefix for progress log messages */
logPrefix: string;
}

/**
 * Recursively walks an arbitrary value looking for strings that are
 * S3 URLs close to expiring (per isUrlExpiringSoon).
 *
 * Traverses plain objects, arrays, Maps and Sets directly instead of
 * serializing, avoiding JSON.stringify overhead and regex DoS risks.
 * A WeakSet guards against infinite loops on circular structures.
 */
export function containsExpiringUrls(
  data: any,
  visited = new WeakSet()
): boolean {
  // Strings are the leaves we actually test.
  if (typeof data === "string") {
    return isUrlExpiringSoon(data);
  }

  // null, undefined, and other primitives cannot contain URLs.
  if (data === null || typeof data !== "object") {
    return false;
  }

  // Already seen this object: stop so circular references terminate.
  if (visited.has(data)) {
    return false;
  }
  visited.add(data);

  // Select the child values to recurse into for this container kind.
  let children: Iterable<unknown>;
  if (data instanceof Map || data instanceof Set) {
    children = data.values();
  } else if (Array.isArray(data)) {
    children = data;
  } else {
    children = Object.values(data);
  }

  for (const child of children) {
    if (containsExpiringUrls(child, visited)) {
      return true;
    }
  }

  return false;
}

/**
* Generic cache loader that handles:
* 1. Main map cache lookup
* 2. Prefetch cache lookup
* 3. In-flight request deduplication
* 4. Cache hit/miss tracking
* 5. Validation and retry for fresh content
*
* @returns Object with fetched/cached data and source indicator
*/
Expand Down Expand Up @@ -103,10 +176,71 @@ export async function loadWithCache<T>(
config.fetchCount.value += 1;
logProgress(pageIndex, totalCount, config.logPrefix, title);
inFlight = (async () => {
const result = await config.fetchFn(pageId);
const normalized = config.normalizeResult(result);
config.prefetchCache.set(cacheKey, normalized);
return normalized;
let attempts = 0;
const MAX_ATTEMPTS = 3;
let lastNormalized: T | null = null;

while (attempts < MAX_ATTEMPTS) {
attempts++;
try {
const result = await config.fetchFn(pageId);
const normalized = config.normalizeResult(result);
lastNormalized = normalized;

if (config.validateResult) {
const isValid = config.validateResult(normalized);
if (!isValid) {
if (attempts === MAX_ATTEMPTS) {
console.warn(
chalk.yellow(
` ⚠️ Content validation failed for "${title}" after ${MAX_ATTEMPTS} attempts; using latest result.`
)
);
config.prefetchCache.set(cacheKey, normalized);
return normalized;
}

const delay = attempts * 1000; // Linear backoff: 1s, 2s
console.warn(
chalk.yellow(
` ⚠️ Content validation failed for "${title}" (attempt ${attempts}/${MAX_ATTEMPTS}), retrying in ${delay}ms...`
)
);
await new Promise((resolve) => setTimeout(resolve, delay));
continue;
}
}

// Validation passed
config.prefetchCache.set(cacheKey, normalized);
return normalized;
} catch (error) {
// If it was our validation error, just propagate it if we are out of retries
// But if it was a fetch error, we might want to retry that too?
// The current logic places the retry loop AROUND the fetch+validate.
// So if fetch throws, we also want to separate network retry from validation retry?
// Standard fetchFn likely has its own retries (notionClient.ts usually does).
// Assuming fetchFn throws on permanent failure.
// We will re-throw fetch errors immediately unless we want to use this loop for fetch retries too.
// The previous code re-threw unexpected errors.
// However, we want to respect the 'continue' for validation failures.

// If strict validation error thrown above:
if (
error instanceof Error &&
error.message.includes("Content validation failed")
) {
throw error;
}

// If fetch error, we let it bubble up (assuming fetchFn manages its own resiliency usually,
// OR if we want to use this loop for generic retries, we could 'continue' here too.
// But instructions were specific about validation retry.
throw error;
}
}

throw new Error("Unexpected end of retry loop");
})()
.catch((error) => {
config.prefetchCache.delete(cacheKey);
Expand Down Expand Up @@ -146,6 +280,7 @@ export async function loadBlocksForPage(
fetchCount: blockFetchCount,
fetchFn: fetchNotionBlocks,
normalizeResult: (result) => (Array.isArray(result) ? result : []),
validateResult: (blocks) => !containsExpiringUrls(blocks),
logPrefix: "Fetching blocks",
});
}
Expand Down Expand Up @@ -173,6 +308,7 @@ export async function loadMarkdownForPage(
fetchFn: (pageId) => n2m.pageToMarkdown(pageId),
normalizeResult: (result) =>
Array.isArray(result) ? result : (result ?? []),
validateResult: (markdown) => !containsExpiringUrls(markdown),
logPrefix: "Converting markdown",
});
}
Loading