
Commit c62bf7a

committed
fix(translate): chunk large pages to avoid GPT-5.2 structured-output token limit
Pages like "Creating a New Observation" (~486K tokens) exceed OpenAI's 272K-token limit for json_schema strict mode, causing translation to fail for both pt-BR and es.

Fix: translateText now splits oversized markdown into chunks before calling the API, then reassembles the translated pieces transparently. Callers and function signatures are unchanged.

Key details:
- TRANSLATION_CHUNK_MAX_CHARS = 500_000 (~143K tokens, conservative buffer)
- Fence-aware section splitter: "#" inside code blocks is never a boundary
- 3-level fallback: headings -> paragraphs -> lines -> character slicing
- Leading oversized tokens are correctly split even when no content has yet been accumulated in the current chunk
- token_overflow error code (non-critical) enables targeted recovery
- Adaptive fallback halves any chunk that still overflows after splitting
- 11 tests, including lossless round-trip and leading-token edge cases
1 parent 2af1767 commit c62bf7a
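A minimal sketch of the chunking strategy the commit message describes, assuming a fence-aware heading splitter with a hard character-slicing fallback. The function names here (`splitAtHeadings`, `chunkMarkdown`) are illustrative, not the repository's actual API, and the real splitMarkdownIntoChunks additionally falls back through paragraphs and lines before slicing:

````typescript
// Split markdown at top-level headings, treating "#" inside ``` fences as
// plain text so fenced code blocks never become chunk boundaries.
function splitAtHeadings(markdown: string): string[] {
  const sections: string[] = [];
  let current = "";
  let inFence = false;
  // split with a lookbehind keeps each trailing "\n", so join("") is lossless
  for (const line of markdown.split(/(?<=\n)/)) {
    if (line.trimStart().startsWith("```")) inFence = !inFence;
    if (!inFence && line.startsWith("# ") && current !== "") {
      sections.push(current);
      current = "";
    }
    current += line;
  }
  if (current !== "") sections.push(current);
  return sections;
}

// Pack sections into chunks of at most maxChars. A section that alone exceeds
// the limit is hard-sliced, which also covers the oversized-leading-token case
// where nothing has been accumulated into the current chunk yet.
function chunkMarkdown(markdown: string, maxChars: number): string[] {
  const chunks: string[] = [];
  let current = "";
  for (const section of splitAtHeadings(markdown)) {
    if (current.length + section.length <= maxChars) {
      current += section;
      continue;
    }
    if (current !== "") {
      chunks.push(current);
      current = "";
    }
    // Last-resort fallback: character slicing for an oversized section
    let rest = section;
    while (rest.length > maxChars) {
      chunks.push(rest.slice(0, maxChars));
      rest = rest.slice(maxChars);
    }
    current = rest;
  }
  if (current !== "") chunks.push(current);
  return chunks;
}
````

Because every split keeps its newlines attached, `chunks.join("")` always reconstructs the input exactly, which is the lossless round-trip property the tests below verify.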

File tree: 3 files changed, +390 −14 lines


scripts/constants.ts

Lines changed: 4 additions & 0 deletions
@@ -129,6 +129,10 @@ export const ENGLISH_DIR_SAVE_ERROR =
 // Translation retry configuration
 export const TRANSLATION_MAX_RETRIES = 3;
 export const TRANSLATION_RETRY_BASE_DELAY_MS = 750;
+/** Max characters per translation chunk.
+ * Targets ~143K tokens (500K chars / 3.5 chars per token).
+ * Leaves generous buffer within OpenAI's 272K structured-output limit. */
+export const TRANSLATION_CHUNK_MAX_CHARS = 500_000;
 
 // URL handling
 export const INVALID_URL_PLACEHOLDER =
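A back-of-envelope check of the constant above, assuming the commit's ~3.5 chars-per-token heuristic (a rough average for markdown-heavy English text, not a real tokenizer):

```typescript
const TRANSLATION_CHUNK_MAX_CHARS = 500_000;
const CHARS_PER_TOKEN = 3.5; // heuristic, not an actual tokenizer count
const OPENAI_STRUCTURED_OUTPUT_TOKEN_LIMIT = 272_000;

// ~143K tokens per chunk under the heuristic
const estimatedTokens = Math.ceil(TRANSLATION_CHUNK_MAX_CHARS / CHARS_PER_TOKEN);

// Headroom left for the system prompt, schema, and tokenizer variance
const headroom = OPENAI_STRUCTURED_OUTPUT_TOKEN_LIMIT - estimatedTokens;
```

With these numbers a chunk is estimated at 142,858 tokens, leaving roughly 129K tokens of headroom before the 272K limit, so even a text that tokenizes much worse than 3.5 chars/token should still fit.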

scripts/notion-translate/translateFrontMatter.test.ts

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,115 @@ describe("notion-translate translateFrontMatter", () => {
       })
     );
   });
+
+  it("classifies token overflow errors as non-critical token_overflow code", async () => {
+    const { translateText } = await import("./translateFrontMatter");
+
+    mockOpenAIChatCompletionCreate.mockRejectedValueOnce({
+      status: 400,
+      message:
+        "Input tokens exceed the configured limit of 272000 tokens. Your messages resulted in 486881 tokens.",
+    });
+
+    await expect(translateText("# Body", "Title", "pt-BR")).rejects.toEqual(
+      expect.objectContaining({
+        code: "token_overflow",
+        isCritical: false,
+      })
+    );
+  });
+
+  it("takes the single-call fast path for small content", async () => {
+    const { translateText } = await import("./translateFrontMatter");
+
+    const result = await translateText(
+      "# Small page\n\nJust a paragraph.",
+      "Small",
+      "pt-BR"
+    );
+
+    expect(mockOpenAIChatCompletionCreate).toHaveBeenCalledTimes(1);
+    expect(result.title).toBe("Mock Title");
+    expect(result.markdown).toBe("# translated\n\nMock content");
+  });
+
+  it("chunks large content and calls the API once per chunk", async () => {
+    const { translateText, splitMarkdownIntoChunks } = await import(
+      "./translateFrontMatter"
+    );
+
+    // Build content that is larger than the chunk threshold
+    const bigSection1 = "# Section One\n\n" + "word ".repeat(100_000);
+    const bigSection2 = "\n# Section Two\n\n" + "word ".repeat(100_000);
+    const bigContent = bigSection1 + bigSection2;
+
+    // Sanity: verify it would be split
+    const chunks = splitMarkdownIntoChunks(bigContent, 500_000);
+    expect(chunks.length).toBeGreaterThan(1);
+
+    // translateText should call the API once per chunk
+    const result = await translateText(bigContent, "Big Page", "pt-BR");
+
+    expect(
+      mockOpenAIChatCompletionCreate.mock.calls.length
+    ).toBeGreaterThanOrEqual(2);
+    expect(result.title).toBe("Mock Title"); // taken from first chunk
+    expect(typeof result.markdown).toBe("string");
+    expect(result.markdown.length).toBeGreaterThan(0);
+  });
+
+  it("splitMarkdownIntoChunks does not split on headings inside fenced code blocks", async () => {
+    const { splitMarkdownIntoChunks } = await import("./translateFrontMatter");
+
+    const content =
+      "# Real Heading\n\n```\n# not a heading\n```\n\n# Another Heading\n\ntext\n";
+
+    // With a small limit, only the real headings should be split boundaries
+    const chunks = splitMarkdownIntoChunks(content, 40);
+
+    // The "# not a heading" line inside the fence should stay in one chunk
+    const joined = chunks.join("");
+    expect(joined).toBe(content); // round-trip must be lossless
+    const fenceChunk = chunks.find((c) => c.includes("```"));
+    expect(fenceChunk).toBeDefined();
+    expect(fenceChunk).toContain("# not a heading");
+  });
+
+  it("splitMarkdownIntoChunks reassembly is lossless", async () => {
+    const { splitMarkdownIntoChunks } = await import("./translateFrontMatter");
+
+    const original =
+      "# Heading 1\n\nParagraph one.\n\n# Heading 2\n\nParagraph two.\n";
+    const chunks = splitMarkdownIntoChunks(original, 30);
+    const reassembled = chunks.join("");
+    expect(reassembled).toBe(original);
+  });
+
+  it("splitMarkdownIntoChunks splits an oversized leading paragraph (no current accumulation bug)", async () => {
+    const { splitMarkdownIntoChunks } = await import("./translateFrontMatter");
+
+    // Leading paragraph exceeds the chunk limit with no preceding content
+    const bigParagraph = "a".repeat(200);
+    const chunks = splitMarkdownIntoChunks(bigParagraph, 50);
+
+    // Every chunk must respect the limit
+    for (const chunk of chunks) {
+      expect(chunk.length).toBeLessThanOrEqual(50);
+    }
+    // Round-trip must be lossless
+    expect(chunks.join("")).toBe(bigParagraph);
+  });
+
+  it("splitMarkdownIntoChunks splits an oversized leading line (splitByLines leading bug)", async () => {
+    const { splitMarkdownIntoChunks } = await import("./translateFrontMatter");
+
+    // A single very long line with no newlines (worst case for splitByLines)
+    const longLine = "x".repeat(300);
+    const chunks = splitMarkdownIntoChunks(longLine, 100);
+
+    for (const chunk of chunks) {
+      expect(chunk.length).toBeLessThanOrEqual(100);
+    }
+    expect(chunks.join("")).toBe(longLine);
+  });
 });
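The first test above pins down the error shape (`code: "token_overflow"`, `isCritical: false`). A minimal sketch of the classification it implies, assuming a status-400 message matching OpenAI's input-token-limit wording; the names here (TranslationError, classifyOpenAIError) are illustrative, not the module's actual exports:

```typescript
interface TranslationError {
  code: string;
  isCritical: boolean;
  message: string;
}

function classifyOpenAIError(err: {
  status?: number;
  message?: string;
}): TranslationError {
  const message = err.message ?? "unknown OpenAI error";
  // A 400 complaining about the input-token limit is recoverable: the caller
  // can re-split the offending chunk and retry, so it is marked non-critical.
  if (err.status === 400 && /exceed.*limit of \d+ tokens/i.test(message)) {
    return { code: "token_overflow", isCritical: false, message };
  }
  // Anything else is treated as a hard failure for this page.
  return { code: "translation_failed", isCritical: true, message };
}
```

Keeping the overflow path non-critical is what lets the adaptive fallback halve the offending chunk and retry instead of aborting the whole page.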
