
Commit ca17a06

Merge pull request RooCodeInc#1964 from cline/ocasta181/ENG-209
[Fix] Cline crashes when reading files with large outputs
2 parents 6b9c2a1 + 48ea04f commit ca17a06

File tree: 9 files changed (+391 -38 lines changed)

.changeset/seven-flowers-lay.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"claude-dev": patch
+---
+
+Fix a bug where cline crashes when reading large data from files

src/core/Cline.ts

Lines changed: 12 additions & 24 deletions
@@ -62,6 +62,7 @@ import { ClineHandler } from "../api/providers/cline"
 import { ClineProvider, GlobalFileNames } from "./webview/ClineProvider"
 import { DEFAULT_LANGUAGE_SETTINGS, getLanguageKey, LanguageDisplay, LanguageKey } from "../shared/Languages"
 import { telemetryService } from "../services/telemetry/TelemetryService"
+import { getMaxAllowedSize } from "../utils/content-size"

 const cwd = vscode.workspace.workspaceFolders?.map((folder) => folder.uri.fsPath).at(0) ?? path.join(os.homedir(), "Desktop") // may or may not exist but fs checking existence would immediately ask for permission which would be bad UX, need to come up with a better solution

@@ -1353,25 +1354,8 @@ export class Cline {
 if (previousRequest && previousRequest.text) {
 const { tokensIn, tokensOut, cacheWrites, cacheReads }: ClineApiReqInfo = JSON.parse(previousRequest.text)
 const totalTokens = (tokensIn || 0) + (tokensOut || 0) + (cacheWrites || 0) + (cacheReads || 0)
-let contextWindow = this.api.getModel().info.contextWindow || 128_000
-// FIXME: hack to get anyone using openai compatible with deepseek to have the proper context window instead of the default 128k. We need a way for the user to specify the context window for models they input through openai compatible
-if (this.api instanceof OpenAiHandler && this.api.getModel().id.toLowerCase().includes("deepseek")) {
-contextWindow = 64_000
-}
-let maxAllowedSize: number
-switch (contextWindow) {
-case 64_000: // deepseek models
-maxAllowedSize = contextWindow - 27_000
-break
-case 128_000: // most models
-maxAllowedSize = contextWindow - 30_000
-break
-case 200_000: // claude models
-maxAllowedSize = contextWindow - 40_000
-break
-default:
-maxAllowedSize = Math.max(contextWindow - 40_000, contextWindow * 0.8) // for deepseek, 80% of 64k meant only ~10k buffer which was too small and resulted in users getting context window errors.
-}
+let contextWindow = this.api.getModel().info.contextWindow || 64_000 // minimum context (Deepseek)
+const maxAllowedSize = getMaxAllowedSize(contextWindow)

 // This is the most reliable way to know when we're close to hitting the context window.
 if (totalTokens >= maxAllowedSize) {
@@ -2015,15 +1999,17 @@ export class Cline {
 }
 telemetryService.captureToolUsage(this.taskId, block.name, false, true)
 }
-// now execute the tool like normal
-const content = await extractTextFromFile(absolutePath)
+// Get context window and used context from API model
+const contextWindow = this.api.getModel().info.contextWindow
+
+// Pass the raw context window size - extractTextFromFile will calculate the appropriate limit
+const content = await extractTextFromFile(absolutePath, contextWindow)
 pushToolResult(content)

 break
 }
 } catch (error) {
 await handleError("reading file", error)
-
 break
 }
 }
@@ -3390,9 +3376,10 @@ export class Cline {
 block.text.includes("<task>") ||
 block.text.includes("<user_message>")
 ) {
+let contextWindow = this.api.getModel().info.contextWindow
 return {
 ...block,
-text: await parseMentions(block.text, cwd, this.urlContentFetcher),
+text: await parseMentions(block.text, cwd, this.urlContentFetcher, contextWindow),
 }
 }
 }
@@ -3497,11 +3484,12 @@ export class Cline {
 }
 }
 }
+
 // only show inactive terminals if there's output to show
 if (inactiveTerminals.length > 0) {
 const inactiveTerminalOutputs = new Map<number, string>()
 for (const inactiveTerminal of inactiveTerminals) {
-const newOutput = this.terminalManager.getUnretrievedOutput(inactiveTerminal.id)
+const newOutput = await this.terminalManager.getUnretrievedOutput(inactiveTerminal.id)
 if (newOutput) {
 inactiveTerminalOutputs.set(inactiveTerminal.id, newOutput)
 }

src/core/mentions/index.ts

Lines changed: 10 additions & 5 deletions
@@ -38,7 +38,12 @@ export function openMention(mention?: string): void {
 }
 }

-export async function parseMentions(text: string, cwd: string, urlContentFetcher: UrlContentFetcher): Promise<string> {
+export async function parseMentions(
+text: string,
+cwd: string,
+urlContentFetcher: UrlContentFetcher,
+contextWindow?: number,
+): Promise<string> {
 const mentions: Set<string> = new Set()
 let parsedText = text.replace(mentionRegexGlobal, (match, mention) => {
 mentions.add(mention)
@@ -90,7 +95,7 @@ export async function parseMentions(text: string, cwd: string, urlContentFetcher
 } else if (mention.startsWith("/")) {
 const mentionPath = mention.slice(1)
 try {
-const content = await getFileOrFolderContent(mentionPath, cwd)
+const content = await getFileOrFolderContent(mentionPath, cwd, contextWindow)
 if (mention.endsWith("/")) {
 parsedText += `\n\n<folder_content path="${mentionPath}">\n${content}\n</folder_content>`
 } else {
@@ -145,7 +150,7 @@ export async function parseMentions(text: string, cwd: string, urlContentFetcher
 return parsedText
 }

-async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise<string> {
+async function getFileOrFolderContent(mentionPath: string, cwd: string, contextWindow?: number): Promise<string> {
 const absPath = path.resolve(cwd, mentionPath)

 try {
@@ -156,7 +161,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 if (isBinary) {
 return "(Binary file, unable to display content)"
 }
-const content = await extractTextFromFile(absPath)
+const content = await extractTextFromFile(absPath, contextWindow)
 return content
 } else if (stats.isDirectory()) {
 const entries = await fs.readdir(absPath, { withFileTypes: true })
@@ -177,7 +182,7 @@ async function getFileOrFolderContent(mentionPath: string, cwd: string): Promise
 if (isBinary) {
 return undefined
 }
-const content = await extractTextFromFile(absoluteFilePath)
+const content = await extractTextFromFile(absoluteFilePath, contextWindow)
 return `<file_content path="${filePath.toPosix()}">\n${content}\n</file_content>`
 } catch (error) {
 return undefined
Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@
+import { expect } from "chai"
+import { extractTextFromFile } from "./extract-text"
+import fs from "fs/promises"
+import path from "path"
+import os from "os"
+import { ContentTooLargeError } from "../../shared/errors"
+
+const CONTEXT_LIMIT = 1000 // Context limit of 1000 tokens means max allowed size is 500 tokens
+
+describe("extract-text", () => {
+	let tempFilePath: string
+
+	beforeEach(async () => {
+		tempFilePath = path.join(os.tmpdir(), "test-file.txt")
+	})
+
+	afterEach(async () => {
+		await fs.unlink(tempFilePath).catch(() => {})
+	})
+
+	it("throws error for non-existent file", async () => {
+		const nonExistentPath = path.join(os.tmpdir(), "non-existent.txt")
+		try {
+			await extractTextFromFile(nonExistentPath, CONTEXT_LIMIT)
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error.message).to.include("File not found")
+		}
+	})
+
+	it("throws ContentTooLargeError when file would exceed max allowed size", async () => {
+		// Create content that would exceed max allowed size (37k tokens)
+		const largeContent = "x".repeat(148000) // 37k tokens
+		await fs.writeFile(tempFilePath, largeContent)
+
+		try {
+			await extractTextFromFile(tempFilePath, 37_000) // Pass pre-processed maxAllowedSize
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error).to.be.instanceOf(ContentTooLargeError)
+			expect(error.details.type).to.equal("file")
+			expect(error.details.path).to.equal(tempFilePath)
+			expect(error.details.size.wouldExceedLimit).to.equal(true)
+		}
+	})
+
+	it("reads text file content when within size limit", async () => {
+		const content = "Hello world"
+		await fs.writeFile(tempFilePath, content)
+
+		const result = await extractTextFromFile(tempFilePath, CONTEXT_LIMIT)
+		expect(result).to.equal(content)
+	})
+
+	it("throws error for binary files", async () => {
+		// Create a simple binary file
+		const buffer = new Uint8Array([0x89, 0x50, 0x4e, 0x47]) // PNG file header
+		await fs.writeFile(tempFilePath, buffer, { encoding: "binary" })
+
+		try {
+			await extractTextFromFile(tempFilePath, CONTEXT_LIMIT)
+			throw new Error("Should have thrown error")
+		} catch (error) {
+			expect(error.message).to.include("Cannot read text for file type")
+		}
+	})
+})

src/integrations/misc/extract-text.ts

Lines changed: 78 additions & 5 deletions
@@ -4,29 +4,102 @@ import pdf from "pdf-parse/lib/pdf-parse"
 import mammoth from "mammoth"
 import fs from "fs/promises"
 import { isBinaryFile } from "isbinaryfile"
+import { estimateContentSize, estimateFileSize, wouldExceedSizeLimit, getMaxAllowedSize } from "../../utils/content-size"
+import { ContentTooLargeError } from "../../shared/errors"

-export async function extractTextFromFile(filePath: string): Promise<string> {
+/**
+ * Checks if terminal output would exceed size limits and returns the content if safe
+ * @param content The terminal output content to check
+ * @param contextWindow Context window limit in tokens
+ * @param command The command that generated this output (for error reporting)
+ * @returns The validated content
+ * @throws ContentTooLargeError if content exceeds size limit
+ */
+export async function extractTextFromTerminal(content: string | Buffer, contextWindow: number, command: string): Promise<string> {
+console.debug(`[TERMINAL_SIZE_CHECK] Checking size for command output: ${command}`)
+
+// Convert to string but don't trim yet
+const rawContent = content.toString()
+console.debug(`[TERMINAL_SIZE_CHECK] Raw content length: ${rawContent.length}`)
+
+// Check size before trimming
+const sizeEstimate = estimateContentSize(rawContent, contextWindow)
+console.debug(`[TERMINAL_SIZE_CHECK] Content size: ${sizeEstimate.bytes} bytes`)
+console.debug(`[TERMINAL_SIZE_CHECK] Estimated tokens: ${sizeEstimate.estimatedTokens}`)
+console.debug(`[TERMINAL_SIZE_CHECK] Context window: ${contextWindow}`)
+
+if (sizeEstimate.wouldExceedLimit) {
+console.debug(`[TERMINAL_SIZE_CHECK] Output exceeds size limit`)
+throw new ContentTooLargeError({
+type: "terminal",
+command,
+size: sizeEstimate,
+})
+}
+
+// Only trim after size check passes
+const cleanContent = rawContent.trim()
+console.debug(`[TERMINAL_SIZE_CHECK] Clean content length: ${cleanContent.length}`)
+console.debug(`[TERMINAL_SIZE_CHECK] Size check passed`)
+return cleanContent
+}
+
+export async function extractTextFromFile(
+filePath: string,
+contextWindow: number = 64_000 /* minimum context (Deepseek) */,
+): Promise<string> {
 try {
 await fs.access(filePath)
 } catch (error) {
 throw new Error(`File not found: ${filePath}`)
 }
+
+console.debug(`[FILE_READ_CHECK] Checking size for file: ${filePath}`)
+
+// Get file stats to check size
+const stats = await fs.stat(filePath)
+console.debug(`[FILE_SIZE_CHECK] File size: ${stats.size} bytes`)
+
+// Calculate max allowed size from context window
+const maxAllowedSize = getMaxAllowedSize(contextWindow)
+console.debug(`[FILE_SIZE_CHECK] Max allowed size: ${maxAllowedSize} tokens`)
+
+// Check if file size would exceed limit before attempting to read
+// This is more efficient than creating a full SizeEstimate object when we just need a boolean check
+if (wouldExceedSizeLimit(stats.size, contextWindow)) {
+console.debug(`[FILE_SIZE_CHECK] File exceeds size limit`)
+// Only create the full size estimate when we need it for the error
+const sizeEstimate = await estimateFileSize(filePath, maxAllowedSize)
+throw new ContentTooLargeError({
+type: "file",
+path: filePath,
+size: sizeEstimate,
+})
+}
+console.debug(`[FILE_SIZE_CHECK] File size check passed`)
 const fileExtension = path.extname(filePath).toLowerCase()
+console.debug(`[FILE_READ] Reading file: ${filePath}`)
+let content: string
 switch (fileExtension) {
 case ".pdf":
-return extractTextFromPDF(filePath)
+content = await extractTextFromPDF(filePath)
+break
 case ".docx":
-return extractTextFromDOCX(filePath)
+content = await extractTextFromDOCX(filePath)
+break
 case ".ipynb":
-return extractTextFromIPYNB(filePath)
+content = await extractTextFromIPYNB(filePath)
+break
 default:
 const isBinary = await isBinaryFile(filePath).catch(() => false)
 if (!isBinary) {
-return await fs.readFile(filePath, "utf8")
+content = await fs.readFile(filePath, "utf8")
 } else {
 throw new Error(`Cannot read text for file type: ${fileExtension}`)
 }
 }
+console.debug(`[FILE_READ_COMPLETE] File read complete. Content length: ${content.length} chars`)
+return content
 }

 async function extractTextFromPDF(filePath: string): Promise<string> {
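
The size-estimation helpers imported at the top of this file (estimateContentSize, estimateFileSize, wouldExceedSizeLimit, and the SizeEstimate type) also come from src/utils/content-size.ts, which is not shown in this excerpt. Below is a minimal sketch inferred from the call sites above and from the test's assumption that 148,000 characters is roughly 37k tokens (about 4 characters per token); the committed implementation may differ:

// src/utils/content-size.ts -- illustrative sketch, continuing the getMaxAllowedSize sketch above
import fs from "fs/promises"

export interface SizeEstimate {
	bytes: number
	estimatedTokens: number
	wouldExceedLimit: boolean
}

// Rough heuristic: ~4 characters (or bytes of UTF-8 text) per token
const CHARS_PER_TOKEN = 4

export function estimateContentSize(content: string, contextWindow: number): SizeEstimate {
	const bytes = Buffer.byteLength(content, "utf8")
	const estimatedTokens = Math.ceil(content.length / CHARS_PER_TOKEN)
	return { bytes, estimatedTokens, wouldExceedLimit: estimatedTokens > getMaxAllowedSize(contextWindow) }
}

export function wouldExceedSizeLimit(sizeInBytes: number, contextWindow: number): boolean {
	// Cheap boolean check used before reading the file at all
	return Math.ceil(sizeInBytes / CHARS_PER_TOKEN) > getMaxAllowedSize(contextWindow)
}

export async function estimateFileSize(filePath: string, maxAllowedSize: number): Promise<SizeEstimate> {
	const { size } = await fs.stat(filePath)
	const estimatedTokens = Math.ceil(size / CHARS_PER_TOKEN)
	return { bytes: size, estimatedTokens, wouldExceedLimit: estimatedTokens > maxAllowedSize }
}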

src/shared/errors.ts

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+import { SizeEstimate } from "../utils/content-size"
+
+/**
+ * Error thrown when content would exceed the model's context window limit
+ */
+export class ContentTooLargeError extends Error {
+	constructor(
+		public details: {
+			type: "file" | "terminal"
+			path?: string
+			command?: string
+			size: SizeEstimate
+		},
+	) {
+		super("Content too large for context window")
+		this.name = "ContentTooLargeError"
+	}
+}
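
For context, a caller such as the read_file tool handler could surface ContentTooLargeError to the model instead of crashing the extension. A hypothetical sketch follows; the import paths and wrapper name are illustrative and not part of this commit:

import { extractTextFromFile } from "../integrations/misc/extract-text"
import { ContentTooLargeError } from "../shared/errors"

// Hypothetical wrapper: report an oversized file back to the model as a tool result
async function readFileForTool(absolutePath: string, contextWindow: number): Promise<string> {
	try {
		return await extractTextFromFile(absolutePath, contextWindow)
	} catch (error) {
		if (error instanceof ContentTooLargeError) {
			return `File is too large to read (~${error.details.size.estimatedTokens} estimated tokens).`
		}
		throw error
	}
}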
