Fixes #4851: Prevent apply_diff tool hangs when modifying XML files

roomote · roomote · commit d7f9a5e22a4a · 2025-06-18T19:20:33.000Z
- Added timeout protection to XML parsing in multiApplyDiffTool to prevent hangs on complex XML content
- Implemented safer HTML entity unescaping for XML files to avoid corruption
- Added timeout mechanisms to regex operations in diff strategies to prevent infinite loops
- Enhanced error messages with specific guidance for XML file modifications
- Improved detection of problematic XML content patterns

The fix addresses the root causes:
1. XML parsing timeouts that could hang indefinitely
2. Double-escaping/unescaping issues with XML entities
3. Complex regex patterns hanging on large XML content
4. Lack of proper error handling for XML-specific edge cases
diff --git a/src/core/diff/strategies/multi-file-search-replace.ts b/src/core/diff/strategies/multi-file-search-replace.ts
@@ -463,11 +463,39 @@ Each file requires its own path, start_line, and diff elements.
 		8. ([\s\S]*?)(?:\n)?   Non‐greedy match for the "replace content" (group 7).
 		9. (?:(?<=\n)(?<!\\)>>>>>>> REPLACE)(?=\n|$)   Matches the final ">>>>>>> REPLACE" marker on its own line (and requires a following newline or the end of file).
 		*/
-		let matches = [
-			...diffContent.matchAll(
-				/(?:^|\n)(?<!\\)<<<<<<< SEARCH\s*\n((?:\:start_line:\s*(\d+)\s*\n))?((?:\:end_line:\s*(\d+)\s*\n))?((?<!\\)-------\s*\n)?([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)=======\s*\n)([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)>>>>>>> REPLACE)(?=\n|$)/g,
-			),
-		]
+		let matches: RegExpMatchArray[] = []
+
+		try {
+			// Use a timeout to prevent regex hangs on complex XML content
+			const timeoutMs = 10000 // 10 seconds timeout
+			const startTime = Date.now()
+
+			const regex =
+				/(?:^|\n)(?<!\\)<<<<<<< SEARCH\s*\n((?:\:start_line:\s*(\d+)\s*\n))?((?:\:end_line:\s*(\d+)\s*\n))?((?<!\\)-------\s*\n)?([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)=======\s*\n)([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)>>>>>>> REPLACE)(?=\n|$)/g
+
+			let match: RegExpMatchArray | null
+			while ((match = regex.exec(diffContent)) !== null) {
+				// Check for timeout to prevent infinite loops on complex content
+				if (Date.now() - startTime > timeoutMs) {
+					throw new Error(
+						`Regex matching timed out after ${timeoutMs}ms - this may indicate overly complex diff content`,
+					)
+				}
+
+				matches.push(match)
+
+				// Prevent infinite loops on zero-length matches
+				if (match.index === regex.lastIndex) {
+					regex.lastIndex++
+				}
+			}
+		} catch (error) {
+			const errorMessage = error instanceof Error ? error.message : String(error)
+			return {
+				success: false,
+				error: `Failed to parse diff content: ${errorMessage}\n\nThis often happens with complex XML content. Try:\n1. Breaking down large changes into smaller diffs\n2. Simplifying the diff content\n3. Using write_to_file for extensive changes`,
+			}
+		}
 
 		if (matches.length === 0) {
 			return {
diff --git a/src/core/diff/strategies/multi-search-replace.ts b/src/core/diff/strategies/multi-search-replace.ts
@@ -376,11 +376,39 @@ Only use a single line of '=======' between search and replacement content, beca
 			  Matches the final “>>>>>>> REPLACE” marker on its own line (and requires a following newline or the end of file).
 		*/
 
-		let matches = [
-			...diffContent.matchAll(
-				/(?:^|\n)(?<!\\)<<<<<<< SEARCH\s*\n((?:\:start_line:\s*(\d+)\s*\n))?((?:\:end_line:\s*(\d+)\s*\n))?((?<!\\)-------\s*\n)?([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)=======\s*\n)([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)>>>>>>> REPLACE)(?=\n|$)/g,
-			),
-		]
+		let matches: RegExpMatchArray[] = []
+
+		try {
+			// Use a timeout to prevent regex hangs on complex XML content
+			const timeoutMs = 10000 // 10 seconds timeout
+			const startTime = Date.now()
+
+			const regex =
+				/(?:^|\n)(?<!\\)<<<<<<< SEARCH\s*\n((?:\:start_line:\s*(\d+)\s*\n))?((?:\:end_line:\s*(\d+)\s*\n))?((?<!\\)-------\s*\n)?([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)=======\s*\n)([\s\S]*?)(?:\n)?(?:(?<=\n)(?<!\\)>>>>>>> REPLACE)(?=\n|$)/g
+
+			let match: RegExpMatchArray | null
+			while ((match = regex.exec(diffContent)) !== null) {
+				// Check for timeout to prevent infinite loops on complex content
+				if (Date.now() - startTime > timeoutMs) {
+					throw new Error(
+						`Regex matching timed out after ${timeoutMs}ms - this may indicate overly complex diff content`,
+					)
+				}
+
+				matches.push(match)
+
+				// Prevent infinite loops on zero-length matches
+				if (match.index === regex.lastIndex) {
+					regex.lastIndex++
+				}
+			}
+		} catch (error) {
+			const errorMessage = error instanceof Error ? error.message : String(error)
+			return {
+				success: false,
+				error: `Failed to parse diff content: ${errorMessage}\n\nThis often happens with complex XML content. Try:\n1. Breaking down large changes into smaller diffs\n2. Simplifying the diff content\n3. Using write_to_file for extensive changes`,
+			}
+		}
 
 		if (matches.length === 0) {
 			return {
diff --git a/src/core/tools/applyDiffTool.ts b/src/core/tools/applyDiffTool.ts
@@ -24,7 +24,22 @@ export async function applyDiffToolLegacy(
 	let diffContent: string | undefined = block.params.diff
 
 	if (diffContent && !cline.api.getModel().id.includes("claude")) {
-		diffContent = unescapeHtmlEntities(diffContent)
+		// Be more careful with XML content - only unescape for non-Claude models
+		// and add additional safety checks
+		const hasXmlStructure = diffContent.includes("<") && diffContent.includes(">")
+		const hasXmlEntities = /&[a-zA-Z0-9#]+;/.test(diffContent)
+
+		// If it looks like XML content, be more conservative about unescaping
+		if (hasXmlStructure && hasXmlEntities) {
+			// Only unescape basic HTML entities, not XML-specific ones
+			diffContent = diffContent
+				.replace(/&lt;/g, "<")
+				.replace(/&gt;/g, ">")
+				.replace(/&quot;/g, '"')
+				.replace(/&amp;/g, "&") // Do this last to avoid double-unescaping
+		} else {
+			diffContent = unescapeHtmlEntities(diffContent)
+		}
 	}
 
 	const sharedMessageProps: ClineSayTool = {
diff --git a/src/core/tools/multiApplyDiffTool.ts b/src/core/tools/multiApplyDiffTool.ts
@@ -49,6 +49,54 @@ interface ParsedXmlResult {
 	file: ParsedFile | ParsedFile[]
 }
 
+/**
+ * Safely parse XML with timeout and error handling to prevent hangs
+ */
+function safeParseXml(xmlString: string, stopNodes?: string[], timeoutMs: number = 5000): Promise<unknown> {
+	return new Promise((resolve, reject) => {
+		const timeoutId = setTimeout(() => {
+			reject(
+				new Error(
+					`XML parsing timed out after ${timeoutMs}ms - this may indicate malformed or overly complex XML content`,
+				),
+			)
+		}, timeoutMs)
+
+		try {
+			const result = parseXml(xmlString, stopNodes)
+			clearTimeout(timeoutId)
+			resolve(result)
+		} catch (error) {
+			clearTimeout(timeoutId)
+			reject(error)
+		}
+	})
+}
+
+/**
+ * Check if content appears to be XML-like and might cause parsing issues
+ */
+function isProblematicXmlContent(content: string): boolean {
+	// Check for XML-like patterns that might cause issues
+	const xmlPatterns = [
+		/<[^>]+>/g, // XML tags
+		/&[a-zA-Z0-9#]+;/g, // HTML/XML entities
+		/<!DOCTYPE/i, // DOCTYPE declarations
+		/<\?xml/i, // XML declarations
+	]
+
+	let xmlElementCount = 0
+	for (const pattern of xmlPatterns) {
+		const matches = content.match(pattern)
+		if (matches) {
+			xmlElementCount += matches.length
+		}
+	}
+
+	// If we have many XML-like elements, it might be problematic
+	return xmlElementCount > 10
+}
+
 export async function applyDiffTool(
 	cline: Task,
 	block: ToolUse,
@@ -105,9 +153,15 @@ export async function applyDiffTool(
 	}
 
 	if (argsXmlTag) {
-		// Parse file entries from XML (new way)
+		// Parse file entries from XML (new way) with safety checks
 		try {
-			const parsed = parseXml(argsXmlTag, ["file.diff.content"]) as ParsedXmlResult
+			// Check if the XML content might be problematic before parsing
+			if (isProblematicXmlContent(argsXmlTag)) {
+				console.warn("Detected potentially problematic XML content, using safer parsing approach")
+			}
+
+			// Use safe XML parsing with timeout to prevent hangs
+			const parsed = (await safeParseXml(argsXmlTag, ["file.diff.content"])) as ParsedXmlResult
 			const files = Array.isArray(parsed.file) ? parsed.file : [parsed.file].filter(Boolean)
 
 			for (const file of files) {
@@ -142,10 +196,30 @@ export async function applyDiffTool(
 			}
 		} catch (error) {
 			const errorMessage = error instanceof Error ? error.message : String(error)
-			const detailedError = `Failed to parse apply_diff XML. This usually means:
-1. The XML structure is malformed or incomplete
+
+			// Check if this is a timeout error specifically
+			const isTimeoutError = errorMessage.includes("timed out")
+
+			const detailedError = `Failed to parse apply_diff XML. ${
+				isTimeoutError
+					? "The operation timed out, which often happens when processing XML files with complex content."
+					: "This usually means:"
+			}
+
+${
+	isTimeoutError
+		? `
+Possible solutions:
+1. Break down large XML changes into smaller, more targeted diffs
+2. Use the legacy apply_diff format (path/diff parameters) instead of XML args
+3. Simplify the diff content to avoid complex nested XML structures
+4. Consider using write_to_file for complete file rewrites if the changes are extensive
+
+`
+		: `1. The XML structure is malformed or incomplete
 2. Missing required <file>, <path>, or <diff> tags
 3. Invalid characters or encoding in the XML
+4. Complex XML content within diff content causing parsing conflicts
 
 Expected structure:
 <args>
@@ -158,7 +232,8 @@ Expected structure:
   </file>
 </args>
 
-Original error: ${errorMessage}`
+`
+}Original error: ${errorMessage}`
 			throw new Error(detailedError)
 		}
 	} else if (legacyPath && typeof legacyDiffContent === "string") {
@@ -406,11 +481,29 @@ Original error: ${errorMessage}`
 				let formattedError = ""
 
 				// Pre-process all diff items for HTML entity unescaping if needed
+				// Be more careful with XML content - only unescape for non-Claude models
+				// and add additional safety checks
 				const processedDiffItems = !cline.api.getModel().id.includes("claude")
-					? diffItems.map((item) => ({
-							...item,
-							content: item.content ? unescapeHtmlEntities(item.content) : item.content,
-						}))
+					? diffItems.map((item) => {
+							if (!item.content) return item
+
+							// Check if content looks like it might be XML that shouldn't be unescaped
+							const hasXmlStructure = item.content.includes("<") && item.content.includes(">")
+							const hasXmlEntities = /&[a-zA-Z0-9#]+;/.test(item.content)
+
+							// If it looks like XML content, be more conservative about unescaping
+							if (hasXmlStructure && hasXmlEntities) {
+								// Only unescape basic HTML entities, not XML-specific ones
+								const safeUnescaped = item.content
+									.replace(/&lt;/g, "<")
+									.replace(/&gt;/g, ">")
+									.replace(/&quot;/g, '"')
+									.replace(/&amp;/g, "&") // Do this last to avoid double-unescaping
+								return { ...item, content: safeUnescaped }
+							} else {
+								return { ...item, content: unescapeHtmlEntities(item.content) }
+							}
+						})
 					: diffItems
 
 				// Apply all diffs at once with the array-based method
@@ -450,6 +543,7 @@ Suggested fixes:
 3. Use <read_file> to see the current file content
 4. Consider breaking complex changes into smaller diffs
 5. Ensure start_line parameter matches the actual content location
+6. For XML files, be careful with special characters and entities
 ${errorDetails ? `\nDetailed error information:\n${errorDetails}\n` : ""}
 </error_details>\n\n`
 						}
@@ -465,6 +559,7 @@ Recovery suggestions:
 3. Check that the search content exactly matches what's in the file
 4. Consider using line numbers with start_line parameter
 5. Break large changes into smaller, more specific diffs
+6. For XML files, ensure proper escaping of special characters
 ${errorDetails ? `\nTechnical details:\n${errorDetails}\n` : ""}
 </error_details>\n\n`
 					}