Optimize processCarriageReturns function for performance and multi-byte character handling

Yikai-Liao · Yikai-Liao · commit 903600dc6f78 · 2025-04-16T20:43:49.000+08:00
This commit enhances the `processCarriageReturns` function by implementing in-place string operations to improve performance, especially with large outputs. Key features include:
- Line-by-line processing to maximize chunk handling.
- Use of string indexes and substring operations instead of arrays.
- Single-pass traversal of input for efficiency.
- Special handling for multi-byte characters to prevent corruption during overwrites.

Additionally, tests have been updated to validate the new functionality, ensuring correct behavior with various character sets, including emojis and non-ASCII text.

A high-density carriage-return (CR) case is also added to the benchmark.
diff --git a/src/integrations/misc/__tests__/extract-text.test.ts b/src/integrations/misc/__tests__/extract-text.test.ts
@@ -384,18 +384,47 @@ describe("processCarriageReturns", () => {
 	})
 
 	it("should handle carriage returns with special characters", () => {
-		const input = "Line with 🚀 emoji\rUpdated with 🔥 emoji"
-		const expected = "Updated with 🔥 emoji"
+		// This test demonstrates our handling of multi-byte characters (like emoji) when they get partially overwritten.
+		// When a carriage return causes partial overwrite of a multi-byte character (like an emoji),
+		// we need to handle this special case to prevent display issues or corruption.
+		//
+		// In this example:
+		// 1. "Line with 🚀 emoji" is printed (note that the emoji is a multi-byte character)
+		// 2. CR moves cursor to start of line
+		// 3. "Line with a" is printed, which partially overwrites the line
+		// 4. The 'a' character ends at a position that would split the 🚀 emoji
+		// 5. Instead of creating corrupted output, we insert a space to replace the partial emoji
+		//
+		// This behavior mimics terminals that can detect and properly handle these situations
+		// by replacing partial characters with spaces to maintain text integrity.
+		const input = "Line with 🚀 emoji\rLine with a"
+		const expected = "Line with a  emoji"
 		expect(processCarriageReturns(input)).toBe(expected)
 	})
 
 	it("should correctly handle multiple consecutive newlines with carriage returns", () => {
+		// Another test case for multi-byte character handling during carriage return overwrites.
+		// In this case, we're testing with a different emoji and pattern to ensure robustness.
+		//
+		// When a new line with an emoji partially overlaps with text from the previous line,
+		// we need to properly detect surrogate pairs and other multi-byte sequences to avoid
+		// creating invalid Unicode output.
+		//
+		// Note: The expected result might look strange but it's consistent with how real
+		// terminals process such content - they only overwrite at character boundaries
+		// and don't attempt to interpret or normalize the resulting text.
 		const input = "Line with not a emoji\rLine with 🔥 emoji"
 		const expected = "Line with 🔥 emojioji"
 		expect(processCarriageReturns(input)).toBe(expected)
 	})
 
 	it("should handle carriage returns in the middle of non-ASCII text", () => {
+		// Tests handling of non-Latin text (like Chinese characters).
+		// Each of these characters is a single UTF-16 code unit (no surrogate pairs), so this
+		// verifies plain per-character overwrite behavior with non-ASCII text.
+		//
+		// Our implementation ensures we preserve character boundaries and don't create
+		// invalid sequences when carriage returns cause partial overwrites.
 		const input = "你好世界啊\r你好地球"
 		const expected = "你好地球啊"
 		expect(processCarriageReturns(input)).toBe(expected)
diff --git a/src/integrations/misc/__tests__/performance/processCarriageReturns.benchmark.ts b/src/integrations/misc/__tests__/performance/processCarriageReturns.benchmark.ts
@@ -3,10 +3,10 @@ import { processCarriageReturns, applyRunLengthEncoding, truncateOutput } from "
 /**
  * Enhanced Benchmark test for terminal output processing functions
  *
- * This script tests three key functions:
- * 1. processCarriageReturns - Handles carriage returns like a real terminal
- * 2. applyRunLengthEncoding - Compresses repetitive output patterns
- * 3. truncateOutput - Limits output to a specified line count
+ * This script tests terminal output processing with various data patterns:
+ * 1. Regular output with carriage returns (various sizes)
+ * 2. Extremely long single lines with carriage returns
+ * 3. High-density carriage return patterns
  *
  * Tests with various data sizes and complexity levels for real-world performance metrics
  */
@@ -94,30 +94,89 @@ function generateTestData(size: number, complexity: "simple" | "medium" | "compl
 	return result
 }
 
+// Build a single-line benchmark input: a long base string followed by `updateCount` CR-prefixed rewrites
+function generateLongLineTestData(lineLengthKB: number, updateCount: number): string {
+	// Base length in characters: lineLengthKB * 1024
+	const baseLength = lineLengthKB * 1024
+	let baseString = ""
+
+	// Fill the base string by cycling through 94 consecutive character codes starting at 32
+	for (let i = 0; i < baseLength; i++) {
+		baseString += String.fromCharCode(32 + (i % 94)) // Codes 32-125 (printable ASCII)
+	}
+
+	let result = baseString
+
+	// Append one "\r" + rewrite per update; no "\n" is ever added, so the output stays a single line
+	for (let i = 0; i < updateCount; i++) {
+		// Segment size is baseLength / updateCount (floored); update i ends at the start of segment i
+		const updateLength = Math.floor(baseLength / updateCount)
+		const updatePosition = updateLength * i
+
+		// The replacement prefix is 10% of one segment (floored)
+		const modificationLength = Math.floor(updateLength * 0.1)
+		let modification = ""
+		for (let j = 0; j < modificationLength; j++) {
+			modification += String.fromCharCode(65 + (j % 26)) // A-Z
+		}
+
+		// Emit CR, the prefix, then base text up to updatePosition (substring swaps its args when start > end, e.g. i = 0)
+		result += `\r${modification}${baseString.substring(modification.length, updatePosition)}`
+	}
+
+	return result
+}
+
+// Build benchmark input made of `size` short lowercase segments, each terminated by "\r" or "\n"
+function generateHighDensityCRData(size: number): string {
+	let result = ""
+
+	// One iteration per segment: `size` counts segments, not characters
+	for (let i = 0; i < size; i++) {
+		// Segment length is 3 + floor(random() * 8): 3-10 chars, assuming random() returns [0, 1)
+		const segmentLength = 3 + Math.floor(random() * 8)
+		let segment = ""
+		for (let j = 0; j < segmentLength; j++) {
+			segment += String.fromCharCode(97 + Math.floor(random() * 26)) // a-z (same assumption)
+		}
+
+		result += segment
+
+		// Terminator: "\r" when random() < 0.9, otherwise "\n"
+		if (random() < 0.9) {
+			result += "\r"
+		} else {
+			result += "\n"
+		}
+	}
+
+	return result
+}
+
 // Get appropriate iteration count for different sizes to ensure meaningful timing
 function getIterationCount(size: number): number {
 	if (size <= 10000) return 100
 	if (size <= 100000) return 20
-	return 10
+	if (size <= 500000) return 10
+	return 5 // Sizes above 500000 get the fewest iterations
 }
 
 // Calculate statistical measures
 function calculateStats(durations: number[]) {
 	// Sort durations for percentile calculations
 	const sorted = [...durations].sort((a, b) => a - b)
 
+	// Compute the mean once; stdDev below reuses it instead of re-reducing the array for every element
+	const mean = durations.reduce((a, b) => a + b, 0) / durations.length
+
 	return {
 		min: sorted[0],
 		max: sorted[sorted.length - 1],
 		median: sorted[Math.floor(sorted.length / 2)],
 		p95: sorted[Math.floor(sorted.length * 0.95)],
 		p99: sorted[Math.floor(sorted.length * 0.99)],
-		mean: durations.reduce((a, b) => a + b, 0) / durations.length,
-		stdDev: Math.sqrt(
-			durations
-				.map((x) => Math.pow(x - durations.reduce((a, b) => a + b, 0) / durations.length, 2))
-				.reduce((a, b) => a + b, 0) / durations.length,
-		),
+		mean,
+		stdDev: Math.sqrt(durations.map((x) => Math.pow(x - mean, 2)).reduce((a, b) => a + b, 0) / durations.length),
 	}
 }
 
@@ -168,6 +227,8 @@ function runPerformanceTest(
 	const totalBenchmarkTime = durations.reduce((a, b) => a + b, 0) / 1000 // seconds
 	const averageThroughput = (totalSizeProcessed / totalBenchmarkTime).toFixed(2) // MB/s
 	const peakThroughput = (input.length / (1024 * 1024) / (stats.min / 1000)).toFixed(2) // MB/s
+	// Add a more stable "reliable throughput" metric based on p95
+	const reliableThroughput = (input.length / (1024 * 1024) / (stats.p95 / 1000)).toFixed(2) // MB/s
 
 	// Output metrics
 	console.log(`- Time Statistics (in ms):`)
@@ -180,6 +241,7 @@ function runPerformanceTest(
 	console.log(`- Throughput:`)
 	console.log(`  • Average: ${averageThroughput} MB/s`)
 	console.log(`  • Peak: ${peakThroughput} MB/s`)
+	console.log(`  • Reliable (P95): ${reliableThroughput} MB/s`)
 	console.log(
 		`- Output size: ${resultSize} MB (${reduction}% ${parseFloat(reduction) < 0 ? "increase" : "reduction"})`,
 	)
@@ -190,17 +252,62 @@ function runPerformanceTest(
 		reduction,
 		averageThroughput,
 		peakThroughput,
+		reliableThroughput,
 	}
 }
 
-// Run benchmark with different data sizes and complexities
+// Benchmark processCarriageReturns repeatedly on one input and report how much the runs differ
+function runBaselineTest(input: string, iterations: number) {
+	console.log("\n=== Baseline Performance Test ===")
+	console.log(`Testing with ${(input.length / (1024 * 1024)).toFixed(2)} MB of data`)
+
+	const runs = 5 // Number of identical repetitions compared against each other
+	const results = []
+
+	for (let i = 0; i < runs; i++) {
+		results.push(runPerformanceTest(`Run ${i + 1}`, processCarriageReturns, input, iterations))
+	}
+
+	// Spread = largest absolute % deviation of any run from the across-run average (not a statistical variance)
+	const meanTimes = results.map((r) => r.stats.mean)
+	const avgMean = meanTimes.reduce((a, b) => a + b, 0) / runs
+	const maxVariation = Math.max(...meanTimes.map((t) => Math.abs(((t - avgMean) / avgMean) * 100)))
+
+	const throughputs = results.map((r) => parseFloat(r.peakThroughput))
+	const avgThroughput = throughputs.reduce((a, b) => a + b, 0) / runs
+	const throughputVariation = Math.max(
+		...throughputs.map((t) => Math.abs(((t - avgThroughput) / avgThroughput) * 100)),
+	)
+
+	console.log("\n=== Performance Variation Analysis ===")
+	console.log(`Mean execution time: ${avgMean.toFixed(3)} ms (±${maxVariation.toFixed(2)}%)`)
+	console.log(`Peak throughput: ${avgThroughput.toFixed(2)} MB/s (±${throughputVariation.toFixed(2)}%)`)
+
+	return { results, avgMean, maxVariation, avgThroughput, throughputVariation }
+}
+
+// Run benchmark with different data sizes and complexity levels
 function runBenchmark() {
-	// Define test configurations: [size, complexity]
-	const testConfigs: [number, "simple" | "medium" | "complex"][] = [
+	// Define regular test configurations: [size, complexity]
+	const standardTestConfigs: [number, "simple" | "medium" | "complex"][] = [
 		[10000, "simple"],
 		[10000, "complex"],
 		[100000, "simple"],
 		[100000, "complex"],
+		[500000, "complex"], // Large data test
+	]
+
+	// Define long line test configurations: [lineLengthKB, updateCount]
+	const longLineTestConfigs: [number, number][] = [
+		[100, 20], // 100KB line with 20 updates
+		[1000, 50], // 1MB line with 50 updates
+		[5000, 200], // 5MB line with 200 updates
+	]
+
+	// Define high-density CR test configurations: [size]
+	const highDensityCRConfigs: number[] = [
+		10000, // 10K updates
+		100000, // 100K updates
 	]
 
 	console.log("=".repeat(80))
@@ -217,7 +324,12 @@ function runBenchmark() {
 	}
 	console.log("Warmup complete")
 
-	for (const [size, complexity] of testConfigs) {
+	// Run standard tests
+	console.log("\n" + "=".repeat(80))
+	console.log("STANDARD TESTS")
+	console.log("=".repeat(80))
+
+	for (const [size, complexity] of standardTestConfigs) {
 		console.log(`\n${"-".repeat(80)}`)
 		console.log(`Testing with ${size} lines, ${complexity} complexity...`)
 
@@ -262,16 +374,70 @@ function runBenchmark() {
 			lineLimit,
 		])
 
-		// Test combined pipeline (real-world usage)
-		console.log("\n--- Combined Pipeline (all 3 functions) ---")
+		// Run baseline test to measure variance between identical runs
+		runBaselineTest(testData, Math.max(5, Math.floor(iterations / 4)))
+
+		// Test combined pipeline
+		console.log("\n--- Combined Pipeline ---")
 		runPerformanceTest(
 			"Full Pipeline",
 			(input) => truncateOutput(applyRunLengthEncoding(processCarriageReturns(input)), lineLimit),
 			testData,
-			Math.max(5, Math.floor(iterations / 2)),
+			Math.max(3, Math.floor(iterations / 5)),
 		)
 	}
 
+	// Run long line tests
+	console.log("\n" + "=".repeat(80))
+	console.log("EXTRA LONG LINE TESTS")
+	console.log("=".repeat(80))
+
+	for (const [lineLength, updateCount] of longLineTestConfigs) {
+		console.log(`\n${"-".repeat(80)}`)
+		console.log(`Testing with ${lineLength}KB single line, ${updateCount} carriage return updates...`)
+
+		// Generate long line test data
+		const startGenTime = performance.now()
+		const testData = generateLongLineTestData(lineLength, updateCount)
+		const genTime = performance.now() - startGenTime
+		const dataSize = (testData.length / (1024 * 1024)).toFixed(2)
+
+		console.log(`Generated ${dataSize} MB of long line test data in ${genTime.toFixed(2)}ms`)
+		console.log(`Test data contains ${updateCount} carriage returns`)
+
+		// Use fewer iterations for long line tests
+		const iterations = Math.max(3, Math.min(10, getIterationCount(lineLength * 100)))
+		console.log(`Running ${iterations} iterations...`)
+
+		console.log("\n--- Testing processCarriageReturns with long line ---")
+		runPerformanceTest("processCarriageReturns (long line)", processCarriageReturns, testData, iterations)
+	}
+
+	// Run high-density carriage return tests
+	console.log("\n" + "=".repeat(80))
+	console.log("HIGH-DENSITY CARRIAGE RETURN TESTS")
+	console.log("=".repeat(80))
+
+	for (const size of highDensityCRConfigs) {
+		console.log(`\n${"-".repeat(80)}`)
+		console.log(`Testing with ${size} high-density CR updates...`)
+
+		// Generate high-density CR test data
+		const startGenTime = performance.now()
+		const testData = generateHighDensityCRData(size)
+		const genTime = performance.now() - startGenTime
+		const dataSize = (testData.length / (1024 * 1024)).toFixed(2)
+
+		console.log(`Generated ${dataSize} MB of high-density CR test data in ${genTime.toFixed(2)}ms`)
+
+		// Use fewer iterations for these intensive tests
+		const iterations = Math.max(5, Math.floor(getIterationCount(size) / 2))
+		console.log(`Running ${iterations} iterations...`)
+
+		console.log("\n--- Testing processCarriageReturns with high-density CRs ---")
+		runPerformanceTest("processCarriageReturns (high-density CR)", processCarriageReturns, testData, iterations)
+	}
+
 	console.log("\n" + "=".repeat(80))
 	console.log("Benchmark complete")
 	console.log("=".repeat(80))
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts