Fall back on aggressive line number stripping in diffs (#2453)

mrubens · web-flow · commit c18e25f4bc2c · 2025-04-10T00:29:50.000-04:00
* Add option for aggressive line number stripping

* Fall back on aggressive line number stripping in diffs
diff --git a/src/core/diff/strategies/__tests__/multi-search-replace.test.ts b/src/core/diff/strategies/__tests__/multi-search-replace.test.ts
@@ -815,23 +815,6 @@ function five() {
 					}
 				})
 
-				it("should not strip when not all lines have numbers in either section", async () => {
-					const originalContent = "function test() {\n    return true;\n}\n"
-					const diffContent = `test.ts
-<<<<<<< SEARCH
-1 | function test() {
-2 |     return true;
-3 | }
-=======
-1 | function test() {
-    return false;
-3 | }
->>>>>>> REPLACE`
-
-					const result = await strategy.applyDiff(originalContent, diffContent)
-					expect(result.success).toBe(false)
-				})
-
 				it("should preserve content that naturally starts with pipe", async () => {
 					const originalContent = "|header|another|\n|---|---|\n|data|more|\n"
 					const diffContent = `test.ts
@@ -852,6 +835,59 @@ function five() {
 					}
 				})
 
+				describe("aggressive line number stripping fallback", () => {
+					// Tests for aggressive line number stripping fallback
+					it("should use aggressive line number stripping when line numbers are inconsistent", async () => {
+						const originalContent = "function test() {\n    return true;\n}\n"
+
+						const diffContent = [
+							"<<<<<<< SEARCH",
+							":start_line:1",
+							":end_line:3",
+							"-------",
+							"1 | function test() {",
+							"    return true;", // missing line number
+							"3 | }",
+							"=======",
+							"function test() {",
+							"    return fallback;",
+							"}",
+							">>>>>>> REPLACE",
+						].join("\n")
+
+						const result = await strategy.applyDiff(originalContent, diffContent)
+						expect(result.success).toBe(true)
+						if (result.success) {
+							expect(result.content).toBe("function test() {\n    return fallback;\n}\n")
+						}
+					})
+
+					it("should handle pipe characters without numbers using aggressive fallback", async () => {
+						const originalContent = "function test() {\n    return true;\n}\n"
+
+						const diffContent = [
+							"<<<<<<< SEARCH",
+							":start_line:1",
+							":end_line:3",
+							"-------",
+							"| function test() {",
+							"|     return true;",
+							"| }",
+							"=======",
+							"function test() {",
+							"    return piped;",
+							"}",
+							">>>>>>> REPLACE",
+						].join("\n")
+
+						const result = await strategy.applyDiff(originalContent, diffContent)
+						expect(result.success).toBe(true)
+						if (result.success) {
+							expect(result.content).toBe("function test() {\n    return piped;\n}\n")
+						}
+					})
+				})
+
 				it("should preserve indentation when stripping line numbers", async () => {
 					const originalContent = "    function test() {\n        return true;\n    }\n"
 					const diffContent = `test.ts
diff --git a/src/core/diff/strategies/multi-search-replace.ts b/src/core/diff/strategies/multi-search-replace.ts
@@ -29,6 +29,48 @@ function getSimilarity(original: string, search: string): number {
 	return 1 - dist / maxLength
 }
 
+/**
+ * Performs a "middle-out" search of `lines` (between [startIndex, endIndex]) to find
+ * the slice that is most similar to `searchChunk`. Returns the best score, index, and matched text.
+ */
+function fuzzySearch(lines: string[], searchChunk: string, startIndex: number, endIndex: number) {
+	let bestScore = 0
+	let bestMatchIndex = -1
+	let bestMatchContent = ""
+	const searchLen = searchChunk.split(/\r?\n/).length
+
+	// Middle-out from the midpoint
+	const midPoint = Math.floor((startIndex + endIndex) / 2)
+	let leftIndex = midPoint
+	let rightIndex = midPoint + 1
+
+	while (leftIndex >= startIndex || rightIndex <= endIndex - searchLen) {
+		if (leftIndex >= startIndex) {
+			const originalChunk = lines.slice(leftIndex, leftIndex + searchLen).join("\n")
+			const similarity = getSimilarity(originalChunk, searchChunk)
+			if (similarity > bestScore) {
+				bestScore = similarity
+				bestMatchIndex = leftIndex
+				bestMatchContent = originalChunk
+			}
+			leftIndex--
+		}
+
+		if (rightIndex <= endIndex - searchLen) {
+			const originalChunk = lines.slice(rightIndex, rightIndex + searchLen).join("\n")
+			const similarity = getSimilarity(originalChunk, searchChunk)
+			if (similarity > bestScore) {
+				bestScore = similarity
+				bestMatchIndex = rightIndex
+				bestMatchContent = originalChunk
+			}
+			rightIndex++
+		}
+	}
+
+	return { bestScore, bestMatchIndex, bestMatchContent }
+}
+
 export class MultiSearchReplaceDiffStrategy implements DiffStrategy {
 	private fuzzyThreshold: number
 	private bufferLines: number
@@ -253,7 +295,9 @@ Only use a single line of '=======' between search and replacement content, beca
 			? { success: true }
 			: {
 					success: false,
-					error: `ERROR: Unexpected end of sequence: Expected '${state.current === State.AFTER_SEARCH ? SEP : REPLACE}' was not found.`,
+					error: `ERROR: Unexpected end of sequence: Expected '${
+						state.current === State.AFTER_SEARCH ? "=======" : ">>>>>>> REPLACE"
+					}' was not found.`,
 				}
 	}
 
@@ -329,19 +373,21 @@ Only use a single line of '=======' between search and replacement content, beca
 			}))
 			.sort((a, b) => a.startLine - b.startLine)
 
-		for (let { searchContent, replaceContent, startLine, endLine } of replacements) {
-			startLine += startLine === 0 ? 0 : delta
-			endLine += delta
+		for (const replacement of replacements) {
+			let { searchContent, replaceContent } = replacement
+			let startLine = replacement.startLine + (replacement.startLine === 0 ? 0 : delta)
+			let endLine = replacement.endLine + delta
 
 			// First unescape any escaped markers in the content
 			searchContent = this.unescapeMarkers(searchContent)
 			replaceContent = this.unescapeMarkers(replaceContent)
 
 			// Strip line numbers from search and replace content if every line starts with a line number
-			if (
+			const hasAllLineNumbers =
 				(everyLineHasLineNumbers(searchContent) && everyLineHasLineNumbers(replaceContent)) ||
 				(everyLineHasLineNumbers(searchContent) && replaceContent.trim() === "")
-			) {
+
+			if (hasAllLineNumbers) {
 				searchContent = stripLineNumbers(searchContent)
 				replaceContent = stripLineNumbers(replaceContent)
 			}
@@ -360,8 +406,8 @@ Only use a single line of '=======' between search and replacement content, beca
 			}
 
 			// Split content into lines, handling both \n and \r\n
-			const searchLines = searchContent === "" ? [] : searchContent.split(/\r?\n/)
-			const replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
+			let searchLines = searchContent === "" ? [] : searchContent.split(/\r?\n/)
+			let replaceLines = replaceContent === "" ? [] : replaceContent.split(/\r?\n/)
 
 			// Validate that empty search requires start line
 			if (searchLines.length === 0 && !startLine) {
@@ -385,7 +431,7 @@ Only use a single line of '=======' between search and replacement content, beca
 			let matchIndex = -1
 			let bestMatchScore = 0
 			let bestMatchContent = ""
-			const searchChunk = searchLines.join("\n")
+			let searchChunk = searchLines.join("\n")
 
 			// Determine search bounds
 			let searchStartIndex = 0
@@ -421,68 +467,70 @@ Only use a single line of '=======' between search and replacement content, beca
 
 			// If no match found yet, try middle-out search within bounds
 			if (matchIndex === -1) {
-				const midPoint = Math.floor((searchStartIndex + searchEndIndex) / 2)
-				let leftIndex = midPoint
-				let rightIndex = midPoint + 1
-
-				// Search outward from the middle within bounds
-				while (leftIndex >= searchStartIndex || rightIndex <= searchEndIndex - searchLines.length) {
-					// Check left side if still in range
-					if (leftIndex >= searchStartIndex) {
-						const originalChunk = resultLines.slice(leftIndex, leftIndex + searchLines.length).join("\n")
-						const similarity = getSimilarity(originalChunk, searchChunk)
-						if (similarity > bestMatchScore) {
-							bestMatchScore = similarity
-							matchIndex = leftIndex
-							bestMatchContent = originalChunk
-						}
-						leftIndex--
-					}
-
-					// Check right side if still in range
-					if (rightIndex <= searchEndIndex - searchLines.length) {
-						const originalChunk = resultLines.slice(rightIndex, rightIndex + searchLines.length).join("\n")
-						const similarity = getSimilarity(originalChunk, searchChunk)
-						if (similarity > bestMatchScore) {
-							bestMatchScore = similarity
-							matchIndex = rightIndex
-							bestMatchContent = originalChunk
-						}
-						rightIndex++
-					}
-				}
+				const {
+					bestScore,
+					bestMatchIndex,
+					bestMatchContent: midContent,
+				} = fuzzySearch(resultLines, searchChunk, searchStartIndex, searchEndIndex)
+				matchIndex = bestMatchIndex
+				bestMatchScore = bestScore
+				bestMatchContent = midContent
 			}
 
-			// Require similarity to meet threshold
+			// Try aggressive line number stripping as a fallback if regular matching fails
 			if (matchIndex === -1 || bestMatchScore < this.fuzzyThreshold) {
-				const searchChunk = searchLines.join("\n")
-				const originalContentSection =
-					startLine !== undefined && endLine !== undefined
-						? `\n\nOriginal Content:\n${addLineNumbers(
-								resultLines
-									.slice(
-										Math.max(0, startLine - 1 - this.bufferLines),
-										Math.min(resultLines.length, endLine + this.bufferLines),
-									)
-									.join("\n"),
-								Math.max(1, startLine - this.bufferLines),
-							)}`
-						: `\n\nOriginal Content:\n${addLineNumbers(resultLines.join("\n"))}`
-
-				const bestMatchSection = bestMatchContent
-					? `\n\nBest Match Found:\n${addLineNumbers(bestMatchContent, matchIndex + 1)}`
-					: `\n\nBest Match Found:\n(no match)`
-
-				const lineRange =
-					startLine || endLine
-						? ` at ${startLine ? `start: ${startLine}` : "start"} to ${endLine ? `end: ${endLine}` : "end"}`
-						: ""
+				// Strip both search and replace content once (simultaneously)
+				const aggressiveSearchContent = stripLineNumbers(searchContent, true)
+				const aggressiveReplaceContent = stripLineNumbers(replaceContent, true)
+
+				const aggressiveSearchLines = aggressiveSearchContent ? aggressiveSearchContent.split(/\r?\n/) : []
+				const aggressiveSearchChunk = aggressiveSearchLines.join("\n")
+
+				// Try middle-out search again with aggressive stripped content (respecting the same search bounds)
+				const {
+					bestScore,
+					bestMatchIndex,
+					bestMatchContent: aggContent,
+				} = fuzzySearch(resultLines, aggressiveSearchChunk, searchStartIndex, searchEndIndex)
+				if (bestMatchIndex !== -1 && bestScore >= this.fuzzyThreshold) {
+					matchIndex = bestMatchIndex
+					bestMatchScore = bestScore
+					bestMatchContent = aggContent
+					// Replace the original search/replace with their stripped versions
+					searchContent = aggressiveSearchContent
+					replaceContent = aggressiveReplaceContent
+					searchLines = aggressiveSearchLines
+					replaceLines = replaceContent ? replaceContent.split(/\r?\n/) : []
+				} else {
+					// No match found with either method
+					const originalContentSection =
+						startLine !== undefined && endLine !== undefined
+							? `\n\nOriginal Content:\n${addLineNumbers(
+									resultLines
+										.slice(
+											Math.max(0, startLine - 1 - this.bufferLines),
+											Math.min(resultLines.length, endLine + this.bufferLines),
+										)
+										.join("\n"),
+									Math.max(1, startLine - this.bufferLines),
+								)}`
+							: `\n\nOriginal Content:\n${addLineNumbers(resultLines.join("\n"))}`
+
+					const bestMatchSection = bestMatchContent
+						? `\n\nBest Match Found:\n${addLineNumbers(bestMatchContent, matchIndex + 1)}`
+						: `\n\nBest Match Found:\n(no match)`
+
+					const lineRange =
+						startLine || endLine
+							? ` at ${startLine ? `start: ${startLine}` : "start"} to ${endLine ? `end: ${endLine}` : "end"}`
+							: ""
 
-				diffResults.push({
-					success: false,
-					error: `No sufficiently similar match found${lineRange} (${Math.floor(bestMatchScore * 100)}% similar, needs ${Math.floor(this.fuzzyThreshold * 100)}%)\n\nDebug Info:\n- Similarity Score: ${Math.floor(bestMatchScore * 100)}%\n- Required Threshold: ${Math.floor(this.fuzzyThreshold * 100)}%\n- Search Range: ${startLine && endLine ? `lines ${startLine}-${endLine}` : "start to end"}\n- Tip: Use the read_file tool to get the latest content of the file before attempting to use the apply_diff tool again, as the file content may have changed\n\nSearch Content:\n${searchChunk}${bestMatchSection}${originalContentSection}`,
-				})
-				continue
+					diffResults.push({
+						success: false,
+						error: `No sufficiently similar match found${lineRange} (${Math.floor(bestMatchScore * 100)}% similar, needs ${Math.floor(this.fuzzyThreshold * 100)}%)\n\nDebug Info:\n- Similarity Score: ${Math.floor(bestMatchScore * 100)}%\n- Required Threshold: ${Math.floor(this.fuzzyThreshold * 100)}%\n- Search Range: ${startLine && endLine ? `lines ${startLine}-${endLine}` : "start to end"}\n- Tried both standard and aggressive line number stripping\n- Tip: Use the read_file tool to get the latest content of the file before attempting to use the apply_diff tool again, as the file content may have changed\n\nSearch Content:\n${searchChunk}${bestMatchSection}${originalContentSection}`,
+					})
+					continue
+				}
 			}
 
 			// Get the matched lines from the original content
diff --git a/src/integrations/misc/__tests__/extract-text.test.ts b/src/integrations/misc/__tests__/extract-text.test.ts
@@ -137,6 +137,46 @@ describe("stripLineNumbers", () => {
 		const expected = "line one\nline two\nline three"
 		expect(stripLineNumbers(input)).toBe(expected)
 	})
+
+	describe("aggressive mode", () => {
+		it("should strip content with just a pipe character", () => {
+			const input = "| line one\n| line two\n| line three"
+			const expected = "line one\nline two\nline three"
+			expect(stripLineNumbers(input, true)).toBe(expected)
+		})
+
+		it("should strip content with mixed formats in aggressive mode", () => {
+			const input = "1 | line one\n| line two\n123 | line three"
+			const expected = "line one\nline two\nline three"
+			expect(stripLineNumbers(input, true)).toBe(expected)
+		})
+
+		it("should not strip content with pipe characters not at start in aggressive mode", () => {
+			const input = "text | more text\nx | y"
+			expect(stripLineNumbers(input, true)).toBe(input)
+		})
+
+		it("should handle empty content in aggressive mode", () => {
+			expect(stripLineNumbers("", true)).toBe("")
+		})
+
+		it("should preserve padding after pipe in aggressive mode", () => {
+			const input = "|  line with extra spaces\n1 |  indented content"
+			const expected = " line with extra spaces\n indented content"
+			expect(stripLineNumbers(input, true)).toBe(expected)
+		})
+
+		it("should preserve windows-style line endings in aggressive mode", () => {
+			const input = "| line one\r\n| line two\r\n| line three"
+			const expected = "line one\r\nline two\r\nline three"
+			expect(stripLineNumbers(input, true)).toBe(expected)
+		})
+
+		it("should not affect regular content when using aggressive mode", () => {
+			const input = "regular line\nanother line\nno pipes here"
+			expect(stripLineNumbers(input, true)).toBe(input)
+		})
+	})
 })
 
 describe("truncateOutput", () => {
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
@@ -86,17 +86,23 @@ export function everyLineHasLineNumbers(content: string): boolean {
 	return lines.length > 0 && lines.every((line) => /^\s*\d+\s+\|(?!\|)/.test(line))
 }
 
-// Strips line numbers from content while preserving the actual content
-// Handles formats like "1 | content", " 12 | content", "123 | content"
-// Preserves content that naturally starts with pipe characters
-export function stripLineNumbers(content: string): string {
+/**
+ * Strips line numbers from content while preserving the actual content.
+ *
+ * @param content The content to process
+ * @param aggressive When false (default): Only strips lines with clear number patterns like "123 | content"
+ *                   When true: Uses a more lenient pattern that also matches lines with just a pipe character,
+ *                   which can be useful when LLMs don't perfectly format the line numbers in diffs
+ * @returns The content with line numbers removed
+ */
+export function stripLineNumbers(content: string, aggressive: boolean = false): string {
 	// Split into lines to handle each line individually
 	const lines = content.split(/\r?\n/)
 
 	// Process each line
 	const processedLines = lines.map((line) => {
 		// Match line number pattern and capture everything after the pipe
-		const match = line.match(/^\s*\d+\s+\|(?!\|)\s?(.*)$/)
+		const match = aggressive ? line.match(/^\s*(?:\d+\s)?\|\s(.*)$/) : line.match(/^\s*\d+\s+\|(?!\|)\s?(.*)$/)
 		return match ? match[1] : line
 	})