Detecting bad JSON in run replication and ignoring it

matt-aitken · matt-aitken · commit 59a31e9b34df · 2025-06-30T09:55:06.000+01:00
diff --git a/apps/webapp/app/services/runsReplicationService.server.ts b/apps/webapp/app/services/runsReplicationService.server.ts
@@ -15,6 +15,8 @@ import { TaskRun } from "@trigger.dev/database";
 import { nanoid } from "nanoid";
 import EventEmitter from "node:events";
 import pLimit from "p-limit";
+import { logger } from "./logger.server";
+import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings";
 
 interface TransactionEvent<T = any> {
   tag: "insert" | "update" | "delete";
@@ -740,6 +742,14 @@ export class RunsReplicationService {
       return { data: undefined };
     }
 
+    if (detectBadJsonStrings(data)) {
+      this.logger.warn("Detected bad JSON strings", {
+        data,
+        dataType,
+      });
+      return { data: undefined };
+    }
+
     const packet = {
       data,
       dataType,
diff --git a/apps/webapp/app/utils/detectBadJsonStrings.ts b/apps/webapp/app/utils/detectBadJsonStrings.ts
@@ -0,0 +1,48 @@
+/**
+ * Reports whether serialized JSON text contains an unpaired UTF-16 surrogate
+ * escape, e.g. a `\ud835` that is not immediately followed by a low surrogate.
+ *
+ * Scanning rules:
+ *  - hex digits may be upper- or lower-case (both are valid JSON);
+ *  - high surrogates are D800–DBFF and low surrogates DC00–DFFF, so a pair such
+ *    as `\ud83d\ude00` is accepted while a lone `\ude00` is flagged;
+ *  - two-character escapes are stepped over, so an escaped backslash followed
+ *    by literal text (`\\ud835`) is not mistaken for a surrogate escape.
+ *
+ * Malformed `\u` escapes (not followed by four hex digits) are ignored.
+ *
+ * @param jsonString - Serialized JSON text to scan; it is not parsed.
+ * @returns `true` if at least one lone high or low surrogate escape is found.
+ */
+export function detectBadJsonStrings(jsonString: string): boolean {
+  const unicodeEscape = /^\\u[0-9a-fA-F]{4}$/;
+  // Code unit encoded by the `\uXXXX` escape starting at `at`, or -1 if the
+  // six characters there are not a well-formed unicode escape.
+  const codeUnitAt = (at: number): number => {
+    const escape = jsonString.slice(at, at + 6);
+    return unicodeEscape.test(escape) ? parseInt(escape.slice(2), 16) : -1;
+  };
+
+  // Backslashes only introduce escapes, so hop from one to the next; text
+  // without any backslash (the common case) costs a single indexOf call.
+  let pos = jsonString.indexOf("\\");
+  while (pos !== -1) {
+    const unit = codeUnitAt(pos);
+    let consumed = unit === -1 ? 2 : 6; // `\n`, `\\`, ... are two characters
+
+    if (unit >= 0xdc00 && unit <= 0xdfff) {
+      return true; // Low surrogate with no high surrogate right before it
+    }
+    if (unit >= 0xd800 && unit <= 0xdbff) {
+      const next = codeUnitAt(pos + 6);
+      if (next < 0xdc00 || next > 0xdfff) {
+        return true; // High surrogate not followed by a low surrogate
+      }
+      consumed = 12; // Skip the complete pair
+    }
+
+    pos = jsonString.indexOf("\\", pos + consumed);
+  }
+
+  return false;
+}
diff --git a/apps/webapp/test/detectbadJsonStrings.test.ts b/apps/webapp/test/detectbadJsonStrings.test.ts
@@ -0,0 +1,137 @@
+import { describe, expect, it } from "vitest";
+import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings";
+
+// Behavioural cases plus wall-clock micro-benchmarks for detectBadJsonStrings.
+describe("detectBadJsonStrings", () => {
+  it("should not detect valid JSON string", () => {
+    const goodJson = `{"title": "hello"}`;
+    const result = detectBadJsonStrings(goodJson);
+    expect(result).toBe(false);
+  });
+
+  it("should detect incomplete Unicode escape sequences", () => {
+    const badJson = `{"title": "hello\\ud835"}`;
+    const result = detectBadJsonStrings(badJson);
+    expect(result).toBe(true);
+  });
+
+  it("should not detect complete Unicode escape sequences", () => {
+    const goodJson = `{"title": "hello\\ud835\\udc00"}`;
+    const result = detectBadJsonStrings(goodJson);
+    expect(result).toBe(false);
+  });
+
+  it("should detect incomplete low surrogate", () => {
+    const badJson = `{"title": "hello\\udc00"}`;
+    const result = detectBadJsonStrings(badJson);
+    expect(result).toBe(true);
+  });
+
+  it("should handle multiple Unicode sequences correctly", () => {
+    const goodJson = `{"title": "hello\\ud835\\udc00\\ud835\\udc01"}`;
+    const result = detectBadJsonStrings(goodJson);
+    expect(result).toBe(false);
+  });
+
+  it("should detect mixed complete and incomplete sequences", () => {
+    const badJson = `{"title": "hello\\ud835\\udc00\\ud835"}`;
+    const result = detectBadJsonStrings(badJson);
+    expect(result).toBe(true);
+  });
+
+  it("should have acceptable performance overhead", () => {
+    const longText = `hello world `.repeat(1_000);
+    const goodJson = `{"title": "hello", "text": "${longText}"}`;
+    const badJson = `{"title": "hello\\ud835", "text": "${longText}"}`;
+
+    const iterations = 100_000;
+
+    // Warm up both inputs before any timing starts
+    for (let i = 0; i < 1000; i++) {
+      detectBadJsonStrings(goodJson);
+      detectBadJsonStrings(badJson);
+    }
+
+    // Measure good JSON (most common case)
+    const goodStart = performance.now();
+    for (let i = 0; i < iterations; i++) {
+      detectBadJsonStrings(goodJson);
+    }
+    const goodTime = performance.now() - goodStart;
+
+    // Measure bad JSON (edge case)
+    const badStart = performance.now();
+    for (let i = 0; i < iterations; i++) {
+      detectBadJsonStrings(badJson);
+    }
+    const badTime = performance.now() - badStart;
+
+    // Measure baseline: an empty loop, so this is loop overhead only
+    const baselineStart = performance.now();
+    for (let i = 0; i < iterations; i++) {
+      // Intentionally empty: nothing is called here (NOTE(review): may be optimised away)
+    }
+    const baselineTime = performance.now() - baselineStart;
+
+    const goodOverhead = goodTime - baselineTime;
+    const badOverhead = badTime - baselineTime;
+
+    console.log(`Baseline (${iterations} iterations): ${baselineTime.toFixed(2)}ms`);
+    console.log(
+      `Good JSON (${iterations} iterations): ${goodTime.toFixed(
+        2
+      )}ms (overhead: ${goodOverhead.toFixed(2)}ms)`
+    );
+    console.log(
+      `Bad JSON (${iterations} iterations): ${badTime.toFixed(
+        2
+      )}ms (overhead: ${badOverhead.toFixed(2)}ms)`
+    );
+    console.log(
+      `Average per call - Good: ${(goodOverhead / iterations).toFixed(4)}ms, Bad: ${(
+        badOverhead / iterations
+      ).toFixed(4)}ms`
+    );
+
+    // Wall-clock expectations (machine-dependent); good JSON is the most common case
+    expect(goodOverhead / iterations).toBeLessThan(0.01); // Less than 10 microseconds per call
+
+    // Bad JSON may be slower because surrogate escapes have to be examined, but still reasonable
+    expect(badOverhead / iterations).toBeLessThan(0.02); // Less than 20 microseconds per call
+
+    // Total overhead for 100k calls should be reasonable
+    expect(goodOverhead).toBeLessThan(1000); // Less than 1 second for 100k calls
+  });
+
+  it("should handle various JSON sizes efficiently", () => {
+    const sizes = [100, 1000, 10000, 100000];
+    const iterations = 10_000;
+
+    for (const size of sizes) {
+      const text = `hello world `.repeat(size / 11); // Approximate size: unit is 12 chars, count is truncated
+      const goodJson = `{"title": "hello", "text": "${text}"}`;
+
+      const start = performance.now();
+      for (let i = 0; i < iterations; i++) {
+        detectBadJsonStrings(goodJson);
+      }
+      const time = performance.now() - start;
+
+      console.log(
+        `Size ${size} chars (${iterations} iterations): ${time.toFixed(2)}ms (${(
+          time / iterations
+        ).toFixed(4)}ms per call)`
+      );
+
+      // Per-call budget in ms grows linearly with the nominal size (size / 1000)
+      expect(time / iterations).toBeLessThan(size / 1000); // Roughly linear scaling
+    }
+  });
+});
+
+function processPacket(data: string): { data?: string; dataType?: string } {
+  // Payloads flagged by detectBadJsonStrings are dropped (data left undefined).
+  return detectBadJsonStrings(data)
+    ? { data: undefined }
+    : { data, dataType: "application/json" };
+}
diff --git a/apps/webapp/test/runsReplicationService.part2.test.ts b/apps/webapp/test/runsReplicationService.part2.test.ts