
Commit ca17a06

Merge pull request RooCodeInc#1964 from cline/ocasta181/ENG-209
[Fix] Cline crashes when reading files with large outputs
2 parents 6b9c2a1 + 48ea04f commit ca17a06

File tree: 9 files changed (+391 -38 lines changed)

.changeset/seven-flowers-lay.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"claude-dev": patch
+---
+
+Fix a bug where cline crashes when reading large data from files

src/core/Cline.ts

Lines changed: 12 additions & 24 deletions
@@ -62,6 +62,7 @@ import { ClineHandler } from "../api/providers/cline"
 import { ClineProvider, GlobalFileNames } from "./webview/ClineProvider"
 import { DEFAULT_LANGUAGE_SETTINGS, getLanguageKey, LanguageDisplay, LanguageKey } from "../shared/Languages"
 import { telemetryService } from "../services/telemetry/TelemetryService"
+import { getMaxAllowedSize } from "../utils/content-size"

 const cwd = vscode.workspace.workspaceFolders?.map((folder) => folder.uri.fsPath).at(0) ?? path.join(os.homedir(), "Desktop") // may or may not exist but fs checking existence would immediately ask for permission which would be bad UX, need to come up with a better solution

@@ -1353,25 +1354,8 @@ export class Cline {
 if (previousRequest && previousRequest.text) {
 const { tokensIn, tokensOut, cacheWrites, cacheReads }: ClineApiReqInfo = JSON.parse(previousRequest.text)
 const totalTokens = (tokensIn || 0) + (tokensOut || 0) + (cacheWrites || 0) + (cacheReads || 0)
-let contextWindow = this.api.getModel().info.contextWindow || 128_000
-// FIXME: hack to get anyone using openai compatible with deepseek to have the proper context window instead of the default 128k. We need a way for the user to specify the context window for models they input through openai compatible
-if (this.api instanceof OpenAiHandler && this.api.getModel().id.toLowerCase().includes("deepseek")) {
-contextWindow = 64_000
-}
-let maxAllowedSize: number
-switch (contextWindow) {
-case 64_000: // deepseek models
-maxAllowedSize = contextWindow - 27_000
-break
-case 128_000: // most models
-maxAllowedSize = contextWindow - 30_000
-break
-case 200_000: // claude models
-maxAllowedSize = contextWindow - 40_000
-break
-default:
-maxAllowedSize = Math.max(contextWindow - 40_000, contextWindow * 0.8) // for deepseek, 80% of 64k meant only ~10k buffer which was too small and resulted in users getting context window errors.
-}
+let contextWindow = this.api.getModel().info.contextWindow || 64_000 // minimum context (Deepseek)
+const maxAllowedSize = getMaxAllowedSize(contextWindow)

 // This is the most reliable way to know when we're close to hitting the context window.
 if (totalTokens >= maxAllowedSize) {
@@ -2015,15 +1999,17 @@ export class Cline {
 }
 telemetryService.captureToolUsage(this.taskId, block.name, false, true)
 }
-// now execute the tool like normal
-const content = await extractTextFromFile(absolutePath)
+// Get context window and used context from API model
+const contextWindow = this.api.getModel().info.contextWindow
+
+// Pass the raw context window size - extractTextFromFile will calculate the appropriate limit
+const content = await extractTextFromFile(absolutePath, contextWindow)
 pushToolResult(content)

 break
 }
 } catch (error) {
 await handleError("reading file", error)
-
 break
 }
 }
@@ -3390,9 +3376,10 @@ export class Cline {
 block.text.includes("<task>") ||
 block.text.includes("<user_message>")
 ) {
+let contextWindow = this.api.getModel().info.contextWindow
 return {
 ...block,
-text: await parseMentions(block.text, cwd, this.urlContentFetcher),
+text: await parseMentions(block.text, cwd, this.urlContentFetcher, contextWindow),
 }
 }
 }
@@ -3497,11 +3484,12 @@ export class Cline {
 }
 }
 }
+
 // only show inactive terminals if there's output to show
 if (inactiveTerminals.length > 0) {
 const inactiveTerminalOutputs = new Map<number, string>()
 for (const inactiveTerminal of inactiveTerminals) {
-const newOutput = this.terminalManager.getUnretrievedOutput(inactiveTerminal.id)
+const newOutput = await this.terminalManager.getUnretrievedOutput(inactiveTerminal.id)
 if (newOutput) {
 inactiveTerminalOutputs.set(inactiveTerminal.id, newOutput)
 }

src/core/mentions/index.ts

Lines changed: 10 additions & 5 deletions
@@ -38,7 +38,12 @@ export function openMention(mention?: string): void {
 }
 }

-export async function parseMentions(text: string, cwd: string, urlContentFetcher: UrlContentFetcher): Promise<string> {
+export async function parseMentions(
+text: string,
+cwd: string,
+urlContentFetcher: UrlContentFetcher,
+contextWindow?: number,
+): Promise<string> {
 const mentions: Set<string> = new Set()
 let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 mentions.add(mention)
@@ -90,7 +95,7 @@ export async function parseMentions(text: string, cwd: string, urlContentFetcher
 } else if (mention.startsWith("/")) {
 const mentionPath = mention.slice(1)
 try {
-const content = await getFileOrFolderContent(mentionPath, cwd)
+const content = await getFileOrFolderContent(mentionPath, cwd, contextWindow)
 if (mention.endsWith("/")) {
 parsedText += `\n\n<folder_content path="${mentionPath}">\n${content}\n</folder_content>`
 } else {
@@ -145,7 +150,7 @@ export async function parseMentions(text: string, cwd: string, urlContentFetcher
 return parsedText
 }

-async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise<string> {
+async function getFileOrFolderContent(mentionPath: string, cwd: string, contextWindow?: number): Promise<string> {
 const absPath = path.resolve(cwd, mentionPath)

 try {
@@ -156,7 +161,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 if (isBinary) {
 return "(Binary file, unable to display content)"
 }
-const content = await extractTextFromFile(absPath)
+const content = await extractTextFromFile(absPath, contextWindow)
 return content
 } else if (stats.isDirectory()) {
 const entries = await fs.readdir(absPath, { withFileTypes: true })
@@ -177,7 +182,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 if (isBinary) {
 return undefined
 }
-const content = await extractTextFromFile(absoluteFilePath)
+const content = await extractTextFromFile(absoluteFilePath, contextWindow)
 return `<file_content path="${filePath.toPosix()}">\n${content}\n</file_content>`
 } catch (error) {
 return undefined
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+import { expect } from "chai"
+import { extractTextFromFile } from "./extract-text"
+import fs from "fs/promises"
+import path from "path"
+import os from "os"
+import { ContentTooLargeError } from "../../shared/errors"
+
+const CONTEXT_LIMIT = 1000 // Context limit of 1000 tokens means max allowed size is 500 tokens
+
+describe("extract-text", () => {
+	let tempFilePath: string
+
+	beforeEach(async () => {
+		tempFilePath = path.join(os.tmpdir(), "test-file.txt")
+	})
+
+	afterEach(async () => {
+		await fs.unlink(tempFilePath).catch(() => {})
+	})
+
+	it("throws error for non-existent file", async () => {
+		const nonExistentPath = path.join(os.tmpdir(), "non-existent.txt")
+		try {
+			await extractTextFromFile(nonExistentPath, CONTEXT_LIMIT)
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error.message).to.include("File not found")
+		}
+	})
+
+	it("throws ContentTooLargeError when file would exceed max allowed size", async () => {
+		// Create content that would exceed max allowed size (37k tokens)
+		const largeContent = "x".repeat(148000) // 37k tokens
+		await fs.writeFile(tempFilePath, largeContent)
+
+		try {
+			await extractTextFromFile(tempFilePath, 37_000) // Pass pre-processed maxAllowedSize
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error).to.be.instanceOf(ContentTooLargeError)
+			expect(error.details.type).to.equal("file")
+			expect(error.details.path).to.equal(tempFilePath)
+			expect(error.details.size.wouldExceedLimit).to.equal(true)
+		}
+	})
+
+	it("reads text file content when within size limit", async () => {
+		const content = "Hello world"
+		await fs.writeFile(tempFilePath, content)
+
+		const result = await extractTextFromFile(tempFilePath, CONTEXT_LIMIT)
+		expect(result).to.equal(content)
+	})
+
+	it("throws error for binary files", async () => {
+		// Create a simple binary file
+		const buffer = new Uint8Array([0x89, 0x50, 0x4e, 0x47]) // PNG file header
+		await fs.writeFile(tempFilePath, buffer, { encoding: "binary" })
+
+		try {
+			await extractTextFromFile(tempFilePath, CONTEXT_LIMIT)
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error.message).to.include("Cannot read text for file type")
+		}
+	})
+})

src/integrations/misc/extract-text.ts

Lines changed: 78 additions & 5 deletions
@@ -4,29 +4,102 @@ import pdf from "pdf-parse/lib/pdf-parse"
 import mammoth from "mammoth"
 import fs from "fs/promises"
 import { isBinaryFile } from "isbinaryfile"
+import { estimateContentSize, estimateFileSize, wouldExceedSizeLimit, getMaxAllowedSize } from "../../utils/content-size"
+import { ContentTooLargeError } from "../../shared/errors"

-export async function extractTextFromFile(filePath: string): Promise<string> {
+/**
+ * Checks if terminal output would exceed size limits and returns the content if safe
+ * @param content The terminal output content to check
+ * @param contextWindow Context window limit in tokens
+ * @param command The command that generated this output (for error reporting)
+ * @returns The validated content
+ * @throws ContentTooLargeError if content exceeds size limit
+ */
+export async function extractTextFromTerminal(content: string | Buffer, contextWindow: number, command: string): Promise<string> {
+console.debug(`[TERMINAL_SIZE_CHECK] Checking size for command output: ${command}`)
+
+// Convert to string but don't trim yet
+const rawContent = content.toString()
+console.debug(`[TERMINAL_SIZE_CHECK] Raw content length: ${rawContent.length}`)
+
+// Check size before trimming
+const sizeEstimate = estimateContentSize(rawContent, contextWindow)
+console.debug(`[TERMINAL_SIZE_CHECK] Content size: ${sizeEstimate.bytes} bytes`)
+console.debug(`[TERMINAL_SIZE_CHECK] Estimated tokens: ${sizeEstimate.estimatedTokens}`)
+console.debug(`[TERMINAL_SIZE_CHECK] Context window: ${contextWindow}`)
+
+if (sizeEstimate.wouldExceedLimit) {
+console.debug(`[TERMINAL_SIZE_CHECK] Output exceeds size limit`)
+throw new ContentTooLargeError({
+type: "terminal",
+command,
+size: sizeEstimate,
+})
+}
+
+// Only trim after size check passes
+const cleanContent = rawContent.trim()
+console.debug(`[TERMINAL_SIZE_CHECK] Clean content length: ${cleanContent.length}`)
+console.debug(`[TERMINAL_SIZE_CHECK] Size check passed`)
+return cleanContent
+}
+
+export async function extractTextFromFile(
+filePath: string,
+contextWindow: number = 64_000 /* minimum context (Deepseek) */,
+): Promise<string> {
 try {
 await fs.access(filePath)
 } catch (error) {
 throw new Error(`File not found: ${filePath}`)
 }
+
+console.debug(`[FILE_READ_CHECK] Checking size for file: ${filePath}`)
+
+// Get file stats to check size
+const stats = await fs.stat(filePath)
+console.debug(`[FILE_SIZE_CHECK] File size: ${stats.size} bytes`)
+
+// Calculate max allowed size from context window
+const maxAllowedSize = getMaxAllowedSize(contextWindow)
+console.debug(`[FILE_SIZE_CHECK] Max allowed size: ${maxAllowedSize} tokens`)
+
+// Check if file size would exceed limit before attempting to read
+// This is more efficient than creating a full SizeEstimate object when we just need a boolean check
+if (wouldExceedSizeLimit(stats.size, contextWindow)) {
+console.debug(`[FILE_SIZE_CHECK] File exceeds size limit`)
+// Only create the full size estimate when we need it for the error
+const sizeEstimate = await estimateFileSize(filePath, maxAllowedSize)
+throw new ContentTooLargeError({
+type: "file",
+path: filePath,
+size: sizeEstimate,
+})
+}
+console.debug(`[FILE_SIZE_CHECK] File size check passed`)
 const fileExtension = path.extname(filePath).toLowerCase()
+console.debug(`[FILE_READ] Reading file: ${filePath}`)
+let content: string
 switch (fileExtension) {
 case ".pdf":
-return extractTextFromPDF(filePath)
+content = await extractTextFromPDF(filePath)
+break
 case ".docx":
-return extractTextFromDOCX(filePath)
+content = await extractTextFromDOCX(filePath)
+break
 case ".ipynb":
-return extractTextFromIPYNB(filePath)
+content = await extractTextFromIPYNB(filePath)
+break
 default:
 const isBinary = await isBinaryFile(filePath).catch(() => false)
 if (!isBinary) {
-return await fs.readFile(filePath, "utf8")
+content = await fs.readFile(filePath, "utf8")
 } else {
 throw new Error(`Cannot read text for file type: ${fileExtension}`)
 }
 }
+console.debug(`[FILE_READ_COMPLETE] File read complete. Content length: ${content.length} chars`)
+return content
 }

 async function extractTextFromPDF(filePath: string): Promise<string> {
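
The size-estimation helpers imported at the top of this file (estimateContentSize, estimateFileSize, wouldExceedSizeLimit, and the SizeEstimate type) also come from src/utils/content-size.ts, which is not shown in this excerpt. Below is a minimal sketch inferred from the call sites above and from the test's assumption that 148,000 characters is roughly 37k tokens (about 4 characters per token); the committed implementation may differ:

// src/utils/content-size.ts -- illustrative sketch, continuing the getMaxAllowedSize sketch above
import fs from "fs/promises"

export interface SizeEstimate {
	bytes: number
	estimatedTokens: number
	wouldExceedLimit: boolean
}

// Rough heuristic: ~4 characters (or bytes of UTF-8 text) per token
const CHARS_PER_TOKEN = 4

export function estimateContentSize(content: string, contextWindow: number): SizeEstimate {
	const bytes = Buffer.byteLength(content, "utf8")
	const estimatedTokens = Math.ceil(content.length / CHARS_PER_TOKEN)
	return { bytes, estimatedTokens, wouldExceedLimit: estimatedTokens > getMaxAllowedSize(contextWindow) }
}

export function wouldExceedSizeLimit(sizeInBytes: number, contextWindow: number): boolean {
	// Cheap boolean check used before reading the file at all
	return Math.ceil(sizeInBytes / CHARS_PER_TOKEN) > getMaxAllowedSize(contextWindow)
}

export async function estimateFileSize(filePath: string, maxAllowedSize: number): Promise<SizeEstimate> {
	const { size } = await fs.stat(filePath)
	const estimatedTokens = Math.ceil(size / CHARS_PER_TOKEN)
	return { bytes: size, estimatedTokens, wouldExceedLimit: estimatedTokens > maxAllowedSize }
}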

src/shared/errors.ts

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import { SizeEstimate } from "../utils/content-size"
+
+/**
+ * Error thrown when content would exceed the model's context window limit
+ */
+export class ContentTooLargeError extends Error {
+	constructor(
+		public details: {
+			type: "file" | "terminal"
+			path?: string
+			command?: string
+			size: SizeEstimate
+		},
+	) {
+		super("Content too large for context window")
+		this.name = "ContentTooLargeError"
+	}
+}
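
For context, a caller such as the read_file tool handler could surface ContentTooLargeError to the model instead of crashing the extension. A hypothetical sketch follows; the import paths and wrapper name are illustrative and not part of this commit:

import { extractTextFromFile } from "../integrations/misc/extract-text"
import { ContentTooLargeError } from "../shared/errors"

// Hypothetical wrapper: report an oversized file back to the model as a tool result
async function readFileForTool(absolutePath: string, contextWindow: number): Promise<string> {
	try {
		return await extractTextFromFile(absolutePath, contextWindow)
	} catch (error) {
		if (error instanceof ContentTooLargeError) {
			return `File is too large to read (~${error.details.size.estimatedTokens} estimated tokens).`
		}
		throw error
	}
}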
