Skip to content

Commit 903600d

Browse files
committed
Optimize processCarriageReturns function for performance and multi-byte character handling
This commit enhances the `processCarriageReturns` function by using in-place string operations to improve performance, especially with large outputs. Key features include: - Line-by-line processing to maximize chunk handling. - Use of string indexes and substring operations instead of arrays. - Single-pass traversal of the input for efficiency. - Special handling for multi-byte characters to prevent corruption during overwrites. Additionally, the tests have been updated to validate the new functionality, ensuring correct behavior with various character sets, including emoji and non-ASCII text. A high-density CR case has also been added to the benchmark.
1 parent 6eb573a commit 903600d

File tree

3 files changed

+293
-76
lines changed

3 files changed

+293
-76
lines changed

src/integrations/misc/__tests__/extract-text.test.ts

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -384,18 +384,47 @@ describe("processCarriageReturns", () => {
384384
})
385385

386386
it("should handle carriage returns with special characters", () => {
387-
const input = "Line with 🚀 emoji\rUpdated with 🔥 emoji"
388-
const expected = "Updated with 🔥 emoji"
387+
// This test demonstrates our handling of multi-byte characters (like emoji) when they get partially overwritten.
388+
// When a carriage return causes partial overwrite of a multi-byte character (like an emoji),
389+
// we need to handle this special case to prevent display issues or corruption.
390+
//
391+
// In this example:
392+
// 1. "Line with 🚀 emoji" is printed (note that the emoji is a multi-byte character)
393+
// 2. CR moves cursor to start of line
394+
// 3. "Line with a" is printed, which partially overwrites the line
395+
// 4. The 'a' character ends at a position that would split the 🚀 emoji
396+
// 5. Instead of creating corrupted output, we insert a space to replace the partial emoji
397+
//
398+
// This behavior mimics terminals that can detect and properly handle these situations
399+
// by replacing partial characters with spaces to maintain text integrity.
400+
const input = "Line with 🚀 emoji\rLine with a"
401+
const expected = "Line with a emoji"
389402
expect(processCarriageReturns(input)).toBe(expected)
390403
})
391404

392405
it("should correctly handle multiple consecutive newlines with carriage returns", () => {
406+
// Another test case for multi-byte character handling during carriage return overwrites.
407+
// In this case, we're testing with a different emoji and pattern to ensure robustness.
408+
//
409+
// When a new line with an emoji partially overlaps with text from the previous line,
410+
// we need to properly detect surrogate pairs and other multi-byte sequences to avoid
411+
// creating invalid Unicode output.
412+
//
413+
// Note: The expected result might look strange but it's consistent with how real
414+
// terminals process such content - they only overwrite at character boundaries
415+
// and don't attempt to interpret or normalize the resulting text.
393416
const input = "Line with not a emoji\rLine with 🔥 emoji"
394417
const expected = "Line with 🔥 emojioji"
395418
expect(processCarriageReturns(input)).toBe(expected)
396419
})
397420

398421
it("should handle carriage returns in the middle of non-ASCII text", () => {
422+
// Tests handling of non-Latin text (like Chinese characters)
423+
// Non-ASCII text uses multi-byte encodings, so this test verifies our handling works
424+
// properly with such character sets.
425+
//
426+
// Our implementation ensures we preserve character boundaries and don't create
427+
// invalid sequences when carriage returns cause partial overwrites.
399428
const input = "你好世界啊\r你好地球"
400429
const expected = "你好地球啊"
401430
expect(processCarriageReturns(input)).toBe(expected)

src/integrations/misc/__tests__/performance/processCarriageReturns.benchmark.ts

Lines changed: 184 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@ import { processCarriageReturns, applyRunLengthEncoding, truncateOutput } from "
33
/**
44
* Enhanced Benchmark test for terminal output processing functions
55
*
6-
* This script tests three key functions:
7-
* 1. processCarriageReturns - Handles carriage returns like a real terminal
8-
* 2. applyRunLengthEncoding - Compresses repetitive output patterns
9-
* 3. truncateOutput - Limits output to a specified line count
6+
* This script tests terminal output processing with various data patterns:
7+
* 1. Regular output with carriage returns (various sizes)
8+
* 2. Extremely long single lines with carriage returns
9+
* 3. High-density carriage return patterns
1010
*
1111
* Tests with various data sizes and complexity levels for real-world performance metrics
1212
*/
@@ -94,30 +94,89 @@ function generateTestData(size: number, complexity: "simple" | "medium" | "compl
9494
return result
9595
}
9696

97+
// Generate a test with extremely long single lines
98+
function generateLongLineTestData(lineLengthKB: number, updateCount: number): string {
99+
// Create a base string that's lineLengthKB kilobytes
100+
const baseLength = lineLengthKB * 1024
101+
let baseString = ""
102+
103+
// Generate a long string with repeating characters
104+
for (let i = 0; i < baseLength; i++) {
105+
baseString += String.fromCharCode(32 + (i % 94)) // Printable ASCII chars
106+
}
107+
108+
let result = baseString
109+
110+
// Add carriage returns and modifications at various positions
111+
for (let i = 0; i < updateCount; i++) {
112+
// Calculate update position (divide the string into updateCount segments)
113+
const updateLength = Math.floor(baseLength / updateCount)
114+
const updatePosition = updateLength * i
115+
116+
// Create update string that's 10% of the update segment length
117+
const modificationLength = Math.floor(updateLength * 0.1)
118+
let modification = ""
119+
for (let j = 0; j < modificationLength; j++) {
120+
modification += String.fromCharCode(65 + (j % 26)) // A-Z
121+
}
122+
123+
// Add carriage return and modification
124+
result += `\r${modification}${baseString.substring(modification.length, updatePosition)}`
125+
}
126+
127+
return result
128+
}
129+
130+
// Generate high-density carriage return data
131+
function generateHighDensityCRData(size: number): string {
132+
let result = ""
133+
134+
// Create small text segments separated by carriage returns
135+
for (let i = 0; i < size; i++) {
136+
// Add a small text segment (3-10 chars)
137+
const segmentLength = 3 + Math.floor(random() * 8)
138+
let segment = ""
139+
for (let j = 0; j < segmentLength; j++) {
140+
segment += String.fromCharCode(97 + Math.floor(random() * 26)) // a-z
141+
}
142+
143+
result += segment
144+
145+
// 90% chance to add a carriage return
146+
if (random() < 0.9) {
147+
result += "\r"
148+
} else {
149+
result += "\n"
150+
}
151+
}
152+
153+
return result
154+
}
155+
97156
// Get appropriate iteration count for different sizes to ensure meaningful timing
98157
function getIterationCount(size: number): number {
99158
if (size <= 10000) return 100
100159
if (size <= 100000) return 20
101-
return 10
160+
if (size <= 500000) return 10
161+
return 5 // For very large tests
102162
}
103163

104164
// Calculate statistical measures
105165
function calculateStats(durations: number[]) {
106166
// Sort durations for percentile calculations
107167
const sorted = [...durations].sort((a, b) => a - b)
108168

169+
// Calculate mean once to avoid repeating this calculation
170+
const mean = durations.reduce((a, b) => a + b, 0) / durations.length
171+
109172
return {
110173
min: sorted[0],
111174
max: sorted[sorted.length - 1],
112175
median: sorted[Math.floor(sorted.length / 2)],
113176
p95: sorted[Math.floor(sorted.length * 0.95)],
114177
p99: sorted[Math.floor(sorted.length * 0.99)],
115-
mean: durations.reduce((a, b) => a + b, 0) / durations.length,
116-
stdDev: Math.sqrt(
117-
durations
118-
.map((x) => Math.pow(x - durations.reduce((a, b) => a + b, 0) / durations.length, 2))
119-
.reduce((a, b) => a + b, 0) / durations.length,
120-
),
178+
mean,
179+
stdDev: Math.sqrt(durations.map((x) => Math.pow(x - mean, 2)).reduce((a, b) => a + b, 0) / durations.length),
121180
}
122181
}
123182

@@ -168,6 +227,8 @@ function runPerformanceTest(
168227
const totalBenchmarkTime = durations.reduce((a, b) => a + b, 0) / 1000 // seconds
169228
const averageThroughput = (totalSizeProcessed / totalBenchmarkTime).toFixed(2) // MB/s
170229
const peakThroughput = (input.length / (1024 * 1024) / (stats.min / 1000)).toFixed(2) // MB/s
230+
// Add a more stable "reliable throughput" metric based on p95
231+
const reliableThroughput = (input.length / (1024 * 1024) / (stats.p95 / 1000)).toFixed(2) // MB/s
171232

172233
// Output metrics
173234
console.log(`- Time Statistics (in ms):`)
@@ -180,6 +241,7 @@ function runPerformanceTest(
180241
console.log(`- Throughput:`)
181242
console.log(` • Average: ${averageThroughput} MB/s`)
182243
console.log(` • Peak: ${peakThroughput} MB/s`)
244+
console.log(` • Reliable (P95): ${reliableThroughput} MB/s`)
183245
console.log(
184246
`- Output size: ${resultSize} MB (${reduction}% ${parseFloat(reduction) < 0 ? "increase" : "reduction"})`,
185247
)
@@ -190,17 +252,62 @@ function runPerformanceTest(
190252
reduction,
191253
averageThroughput,
192254
peakThroughput,
255+
reliableThroughput,
193256
}
194257
}
195258

196-
// Run benchmark with different data sizes and complexities
259+
// Run comparative test between identical runs to measure variance
260+
function runBaselineTest(input: string, iterations: number) {
261+
console.log("\n=== Baseline Performance Test ===")
262+
console.log(`Testing with ${(input.length / (1024 * 1024)).toFixed(2)} MB of data`)
263+
264+
const runs = 5 // Run 5 times for better variance analysis
265+
const results = []
266+
267+
for (let i = 0; i < runs; i++) {
268+
results.push(runPerformanceTest(`Run ${i + 1}`, processCarriageReturns, input, iterations))
269+
}
270+
271+
// Calculate average and variance metrics
272+
const meanTimes = results.map((r) => r.stats.mean)
273+
const avgMean = meanTimes.reduce((a, b) => a + b, 0) / runs
274+
const maxVariation = Math.max(...meanTimes.map((t) => Math.abs(((t - avgMean) / avgMean) * 100)))
275+
276+
const throughputs = results.map((r) => parseFloat(r.peakThroughput))
277+
const avgThroughput = throughputs.reduce((a, b) => a + b, 0) / runs
278+
const throughputVariation = Math.max(
279+
...throughputs.map((t) => Math.abs(((t - avgThroughput) / avgThroughput) * 100)),
280+
)
281+
282+
console.log("\n=== Performance Variation Analysis ===")
283+
console.log(`Mean execution time: ${avgMean.toFixed(3)} ms (±${maxVariation.toFixed(2)}%)`)
284+
console.log(`Peak throughput: ${avgThroughput.toFixed(2)} MB/s (±${throughputVariation.toFixed(2)}%)`)
285+
286+
return { results, avgMean, maxVariation, avgThroughput, throughputVariation }
287+
}
288+
289+
// Run benchmark with different data sizes and complexity levels
197290
function runBenchmark() {
198-
// Define test configurations: [size, complexity]
199-
const testConfigs: [number, "simple" | "medium" | "complex"][] = [
291+
// Define regular test configurations: [size, complexity]
292+
const standardTestConfigs: [number, "simple" | "medium" | "complex"][] = [
200293
[10000, "simple"],
201294
[10000, "complex"],
202295
[100000, "simple"],
203296
[100000, "complex"],
297+
[500000, "complex"], // Large data test
298+
]
299+
300+
// Define long line test configurations: [lineLengthKB, updateCount]
301+
const longLineTestConfigs: [number, number][] = [
302+
[100, 20], // 100KB line with 20 updates
303+
[1000, 50], // 1MB line with 50 updates
304+
[5000, 200], // 5MB line with 200 updates
305+
]
306+
307+
// Define high-density CR test configurations: [size]
308+
const highDensityCRConfigs: number[] = [
309+
10000, // 10K updates
310+
100000, // 100K updates
204311
]
205312

206313
console.log("=".repeat(80))
@@ -217,7 +324,12 @@ function runBenchmark() {
217324
}
218325
console.log("Warmup complete")
219326

220-
for (const [size, complexity] of testConfigs) {
327+
// Run standard tests
328+
console.log("\n" + "=".repeat(80))
329+
console.log("STANDARD TESTS")
330+
console.log("=".repeat(80))
331+
332+
for (const [size, complexity] of standardTestConfigs) {
221333
console.log(`\n${"-".repeat(80)}`)
222334
console.log(`Testing with ${size} lines, ${complexity} complexity...`)
223335

@@ -262,16 +374,70 @@ function runBenchmark() {
262374
lineLimit,
263375
])
264376

265-
// Test combined pipeline (real-world usage)
266-
console.log("\n--- Combined Pipeline (all 3 functions) ---")
377+
// Run baseline test to measure variance between identical runs
378+
runBaselineTest(testData, Math.max(5, Math.floor(iterations / 4)))
379+
380+
// Test combined pipeline
381+
console.log("\n--- Combined Pipeline ---")
267382
runPerformanceTest(
268383
"Full Pipeline",
269384
(input) => truncateOutput(applyRunLengthEncoding(processCarriageReturns(input)), lineLimit),
270385
testData,
271-
Math.max(5, Math.floor(iterations / 2)),
386+
Math.max(3, Math.floor(iterations / 5)),
272387
)
273388
}
274389

390+
// Run long line tests
391+
console.log("\n" + "=".repeat(80))
392+
console.log("EXTRA LONG LINE TESTS")
393+
console.log("=".repeat(80))
394+
395+
for (const [lineLength, updateCount] of longLineTestConfigs) {
396+
console.log(`\n${"-".repeat(80)}`)
397+
console.log(`Testing with ${lineLength}KB single line, ${updateCount} carriage return updates...`)
398+
399+
// Generate long line test data
400+
const startGenTime = performance.now()
401+
const testData = generateLongLineTestData(lineLength, updateCount)
402+
const genTime = performance.now() - startGenTime
403+
const dataSize = (testData.length / (1024 * 1024)).toFixed(2)
404+
405+
console.log(`Generated ${dataSize} MB of long line test data in ${genTime.toFixed(2)}ms`)
406+
console.log(`Test data contains ${updateCount} carriage returns`)
407+
408+
// Use fewer iterations for long line tests
409+
const iterations = Math.max(3, Math.min(10, getIterationCount(lineLength * 100)))
410+
console.log(`Running ${iterations} iterations...`)
411+
412+
console.log("\n--- Testing processCarriageReturns with long line ---")
413+
runPerformanceTest("processCarriageReturns (long line)", processCarriageReturns, testData, iterations)
414+
}
415+
416+
// Run high-density carriage return tests
417+
console.log("\n" + "=".repeat(80))
418+
console.log("HIGH-DENSITY CARRIAGE RETURN TESTS")
419+
console.log("=".repeat(80))
420+
421+
for (const size of highDensityCRConfigs) {
422+
console.log(`\n${"-".repeat(80)}`)
423+
console.log(`Testing with ${size} high-density CR updates...`)
424+
425+
// Generate high-density CR test data
426+
const startGenTime = performance.now()
427+
const testData = generateHighDensityCRData(size)
428+
const genTime = performance.now() - startGenTime
429+
const dataSize = (testData.length / (1024 * 1024)).toFixed(2)
430+
431+
console.log(`Generated ${dataSize} MB of high-density CR test data in ${genTime.toFixed(2)}ms`)
432+
433+
// Use fewer iterations for these intensive tests
434+
const iterations = Math.max(5, Math.floor(getIterationCount(size) / 2))
435+
console.log(`Running ${iterations} iterations...`)
436+
437+
console.log("\n--- Testing processCarriageReturns with high-density CRs ---")
438+
runPerformanceTest("processCarriageReturns (high-density CR)", processCarriageReturns, testData, iterations)
439+
}
440+
275441
console.log("\n" + "=".repeat(80))
276442
console.log("Benchmark complete")
277443
console.log("=".repeat(80))

0 commit comments

Comments
 (0)