Skip to content

Commit 59a31e9

Browse files
committed
Detecting bad JSON in run replication and ignoring it
1 parent f6522e6 commit 59a31e9

File tree

4 files changed

+374
-4
lines changed

4 files changed

+374
-4
lines changed

apps/webapp/app/services/runsReplicationService.server.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import { TaskRun } from "@trigger.dev/database";
1515
import { nanoid } from "nanoid";
1616
import EventEmitter from "node:events";
1717
import pLimit from "p-limit";
18+
import { logger } from "./logger.server";
19+
import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings";
1820

1921
interface TransactionEvent<T = any> {
2022
tag: "insert" | "update" | "delete";
@@ -740,6 +742,14 @@ export class RunsReplicationService {
740742
return { data: undefined };
741743
}
742744

745+
if (detectBadJsonStrings(data)) {
746+
this.logger.warn("Detected bad JSON strings", {
747+
data,
748+
dataType,
749+
});
750+
return { data: undefined };
751+
}
752+
743753
const packet = {
744754
data,
745755
dataType,
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/**
 * Reports whether a JSON text contains an unpaired UTF-16 surrogate escape.
 *
 * A high surrogate escape (`\uD800`–`\uDBFF`) is only valid when it is
 * immediately followed by a low surrogate escape (`\uDC00`–`\uDFFF`), and a
 * low surrogate escape is only valid directly after a high one. Anything else
 * is reported as "bad".
 *
 * Hex digits are matched case-insensitively. Escaped backslashes are honoured,
 * so the JSON text `"\\ud835"` (a literal backslash followed by the characters
 * `ud835`) is not mistaken for a surrogate escape.
 *
 * @param jsonString - Raw (still serialized) JSON text to inspect.
 * @returns `true` if at least one unpaired surrogate escape is present,
 *   otherwise `false`.
 */
export function detectBadJsonStrings(jsonString: string): boolean {
  // Fast path: without a `\u` sequence there cannot be any Unicode escape.
  if (!jsonString.includes("\\u")) {
    return false;
  }

  // Consume escape sequences strictly left to right. Every backslash starts an
  // escape, so `\\` is swallowed as a pair and can never lend its second
  // backslash to a following `uXXXX`.
  const escapePattern = /\\(?:u([0-9a-fA-F]{4})|[\s\S])/g;

  // Index at which a low surrogate escape must start in order to complete the
  // most recently seen high surrogate, or -1 when no high surrogate is pending.
  let expectedLowIndex = -1;

  let match: RegExpExecArray | null;
  while ((match = escapePattern.exec(jsonString)) !== null) {
    const hex = match[1];
    const codeUnit = hex === undefined ? -1 : parseInt(hex, 16);
    const isHigh = codeUnit >= 0xd800 && codeUnit <= 0xdbff;
    const isLow = codeUnit >= 0xdc00 && codeUnit <= 0xdfff;

    if (expectedLowIndex !== -1) {
      if (isLow && match.index === expectedLowIndex) {
        // High surrogate is properly completed.
        expectedLowIndex = -1;
        continue;
      }
      return true; // High surrogate not immediately followed by a low one
    }

    if (isLow) {
      return true; // Low surrogate without a preceding high surrogate
    }

    if (isHigh) {
      expectedLowIndex = match.index + match[0].length;
    }
  }

  // A high surrogate still pending at the end of the input is unpaired.
  return expectedLowIndex !== -1;
}
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
import { describe, expect, it } from "vitest";
2+
import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings";
3+
4+
describe("detectBadJsonStrings", () => {
  // Simple input/expectation pairs; each entry is registered as its own test.
  const cases: Array<{ name: string; input: string; expected: boolean }> = [
    {
      name: "should not detect valid JSON string",
      input: `{"title": "hello"}`,
      expected: false,
    },
    {
      name: "should detect incomplete Unicode escape sequences",
      input: `{"title": "hello\\ud835"}`,
      expected: true,
    },
    {
      name: "should not detect complete Unicode escape sequences",
      input: `{"title": "hello\\ud835\\udc00"}`,
      expected: false,
    },
    {
      name: "should detect incomplete low surrogate",
      input: `{"title": "hello\\udc00"}`,
      expected: true,
    },
    {
      name: "should handle multiple Unicode sequences correctly",
      input: `{"title": "hello\\ud835\\udc00\\ud835\\udc01"}`,
      expected: false,
    },
    {
      name: "should detect mixed complete and incomplete sequences",
      input: `{"title": "hello\\ud835\\udc00\\ud835"}`,
      expected: true,
    },
  ];

  for (const { name, input, expected } of cases) {
    it(name, () => {
      expect(detectBadJsonStrings(input)).toBe(expected);
    });
  }

  it("should have acceptable performance overhead", () => {
    const iterations = 100_000;
    const filler = `hello world `.repeat(1_000);
    const cleanPayload = `{"title": "hello", "text": "${filler}"}`;
    const brokenPayload = `{"title": "hello\\ud835", "text": "${filler}"}`;

    // Let the engine warm up on both inputs before timing anything.
    for (let round = 0; round < 1000; round += 1) {
      detectBadJsonStrings(cleanPayload);
      detectBadJsonStrings(brokenPayload);
    }

    // Clean payload: the common case.
    let startedAt = performance.now();
    for (let round = 0; round < iterations; round += 1) {
      detectBadJsonStrings(cleanPayload);
    }
    const goodTime = performance.now() - startedAt;

    // Broken payload: the edge case.
    startedAt = performance.now();
    for (let round = 0; round < iterations; round += 1) {
      detectBadJsonStrings(brokenPayload);
    }
    const badTime = performance.now() - startedAt;

    // Baseline: an empty loop, i.e. only the loop overhead itself.
    startedAt = performance.now();
    for (let round = 0; round < iterations; round += 1) {
      // intentionally empty
    }
    const baselineTime = performance.now() - startedAt;

    const goodOverhead = goodTime - baselineTime;
    const badOverhead = badTime - baselineTime;
    const goodPerCall = goodOverhead / iterations;
    const badPerCall = badOverhead / iterations;

    console.log(`Baseline (${iterations} iterations): ${baselineTime.toFixed(2)}ms`);
    console.log(
      `Good JSON (${iterations} iterations): ${goodTime.toFixed(2)}ms (overhead: ${goodOverhead.toFixed(2)}ms)`
    );
    console.log(
      `Bad JSON (${iterations} iterations): ${badTime.toFixed(2)}ms (overhead: ${badOverhead.toFixed(2)}ms)`
    );
    console.log(
      `Average per call - Good: ${goodPerCall.toFixed(4)}ms, Bad: ${badPerCall.toFixed(4)}ms`
    );

    // Clean payloads must stay under 10 microseconds per call.
    expect(goodPerCall).toBeLessThan(0.01);

    // Broken payloads are allowed up to 20 microseconds per call.
    expect(badPerCall).toBeLessThan(0.02);

    // All 100k clean calls together must take less than one second.
    expect(goodOverhead).toBeLessThan(1000);
  });

  it("should handle various JSON sizes efficiently", () => {
    const iterations = 10_000;

    for (const size of [100, 1000, 10000, 100000]) {
      // Only an approximation of the requested size.
      const text = `hello world `.repeat(size / 11);
      const goodJson = `{"title": "hello", "text": "${text}"}`;

      const startedAt = performance.now();
      for (let round = 0; round < iterations; round += 1) {
        detectBadJsonStrings(goodJson);
      }
      const time = performance.now() - startedAt;
      const perCall = time / iterations;

      console.log(
        `Size ${size} chars (${iterations} iterations): ${time.toFixed(2)}ms (${perCall.toFixed(4)}ms per call)`
      );

      // Per-call time must stay below size / 1000 milliseconds (roughly linear scaling).
      expect(perCall).toBeLessThan(size / 1000);
    }
  });
});
131+
132+
/**
 * Wraps a raw JSON string into a packet-like object.
 *
 * @param data - Serialized JSON text.
 * @returns `{ data: undefined }` when `detectBadJsonStrings` flags the input,
 *   otherwise the input tagged with the `application/json` data type.
 */
function processPacket(data: string): { data?: string; dataType?: string } {
  return detectBadJsonStrings(data)
    ? { data: undefined }
    : { data, dataType: "application/json" };
}

0 commit comments

Comments
 (0)