Skip to content

Commit 40ad094

Browse files
committed
feat: implement token-based file reading to prevent context exhaustion
- Add maxReadFileTokens setting to global settings schema (default: 10000)
- Update extractTextFromFile to use token counting with tiktoken
- Modify UI components to show token-based settings instead of line-based
- Update all callers of extractTextFromFile to pass the new parameter
- Add comprehensive tests for token-based truncation
- Token-based truncation takes precedence over line-based when both are set

This addresses issue #6274 by preventing context window exhaustion when reading files with very long lines that contain many tokens.
1 parent 7a6e852 commit 40ad094

File tree

13 files changed

+330
-21
lines changed

13 files changed

+330
-21
lines changed

packages/types/src/global-settings.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ export const globalSettingsSchema = z.object({
101101
maxWorkspaceFiles: z.number().optional(),
102102
showRooIgnoredFiles: z.boolean().optional(),
103103
maxReadFileLine: z.number().optional(),
104+
maxReadFileTokens: z.number().optional(),
104105

105106
terminalOutputLineLimit: z.number().optional(),
106107
terminalOutputCharacterLimit: z.number().optional(),
@@ -273,6 +274,7 @@ export const EVALS_SETTINGS: RooCodeSettings = {
273274
maxWorkspaceFiles: 200,
274275
showRooIgnoredFiles: true,
275276
maxReadFileLine: -1, // -1 to enable full file reading.
277+
maxReadFileTokens: -1, // -1 to enable full file reading.
276278

277279
includeDiagnosticMessages: true,
278280
maxDiagnosticMessages: 50,

src/core/mentions/index.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ export async function parseMentions(
8484
includeDiagnosticMessages: boolean = true,
8585
maxDiagnosticMessages: number = 50,
8686
maxReadFileLine?: number,
87+
maxReadFileTokens?: number,
8788
): Promise<string> {
8889
const mentions: Set<string> = new Set()
8990
const commandMentions: Set<string> = new Set()
@@ -166,6 +167,7 @@ export async function parseMentions(
166167
rooIgnoreController,
167168
showRooIgnoredFiles,
168169
maxReadFileLine,
170+
maxReadFileTokens,
169171
)
170172
if (mention.endsWith("/")) {
171173
parsedText += `\n\n<folder_content path="${mentionPath}">\n${content}\n</folder_content>`
@@ -244,6 +246,7 @@ async function getFileOrFolderContent(
244246
rooIgnoreController?: any,
245247
showRooIgnoredFiles: boolean = true,
246248
maxReadFileLine?: number,
249+
maxReadFileTokens?: number,
247250
): Promise<string> {
248251
const unescapedPath = unescapeSpaces(mentionPath)
249252
const absPath = path.resolve(cwd, unescapedPath)
@@ -256,7 +259,7 @@ async function getFileOrFolderContent(
256259
return `(File ${mentionPath} is ignored by .rooignore)`
257260
}
258261
try {
259-
const content = await extractTextFromFile(absPath, maxReadFileLine)
262+
const content = await extractTextFromFile(absPath, maxReadFileLine, maxReadFileTokens)
260263
return content
261264
} catch (error) {
262265
return `(Failed to read contents of ${mentionPath}): ${error.message}`
@@ -296,7 +299,11 @@ async function getFileOrFolderContent(
296299
if (isBinary) {
297300
return undefined
298301
}
299-
const content = await extractTextFromFile(absoluteFilePath, maxReadFileLine)
302+
const content = await extractTextFromFile(
303+
absoluteFilePath,
304+
maxReadFileLine,
305+
maxReadFileTokens,
306+
)
300307
return `<file_content path="${filePath.toPosix()}">\n${content}\n</file_content>`
301308
} catch (error) {
302309
return undefined

src/core/tools/readFileTool.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,8 @@ export async function readFileTool(
253253

254254
// Handle batch approval if there are multiple files to approve
255255
if (filesToApprove.length > 1) {
256-
const { maxReadFileLine = -1 } = (await cline.providerRef.deref()?.getState()) ?? {}
256+
const { maxReadFileLine = -1, maxReadFileTokens = 10000 } =
257+
(await cline.providerRef.deref()?.getState()) ?? {}
257258

258259
// Prepare batch file data
259260
const batchFiles = filesToApprove.map((fileResult) => {
@@ -368,7 +369,8 @@ export async function readFileTool(
368369
const relPath = fileResult.path
369370
const fullPath = path.resolve(cline.cwd, relPath)
370371
const isOutsideWorkspace = isPathOutsideWorkspace(fullPath)
371-
const { maxReadFileLine = -1 } = (await cline.providerRef.deref()?.getState()) ?? {}
372+
const { maxReadFileLine = -1, maxReadFileTokens = 10000 } =
373+
(await cline.providerRef.deref()?.getState()) ?? {}
372374

373375
// Create line snippet for approval message
374376
let lineSnippet = ""
@@ -429,7 +431,8 @@ export async function readFileTool(
429431

430432
const relPath = fileResult.path
431433
const fullPath = path.resolve(cline.cwd, relPath)
432-
const { maxReadFileLine = -1 } = (await cline.providerRef.deref()?.getState()) ?? {}
434+
const { maxReadFileLine = -1, maxReadFileTokens = 10000 } =
435+
(await cline.providerRef.deref()?.getState()) ?? {}
433436

434437
// Process approved files
435438
try {
@@ -517,7 +520,7 @@ export async function readFileTool(
517520
}
518521

519522
// Handle normal file read
520-
const content = await extractTextFromFile(fullPath)
523+
const content = await extractTextFromFile(fullPath, maxReadFileTokens)
521524
const lineRangeAttr = ` lines="1-${totalLines}"`
522525
let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
523526

src/core/webview/ClineProvider.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,6 +1425,7 @@ export class ClineProvider
14251425
showRooIgnoredFiles,
14261426
language,
14271427
maxReadFileLine,
1428+
maxReadFileTokens,
14281429
terminalCompressProgressBar,
14291430
historyPreviewCollapsed,
14301431
cloudUserInfo,
@@ -1532,6 +1533,7 @@ export class ClineProvider
15321533
language: language ?? formatLanguage(vscode.env.language),
15331534
renderContext: this.renderContext,
15341535
maxReadFileLine: maxReadFileLine ?? -1,
1536+
maxReadFileTokens: maxReadFileTokens ?? 10000,
15351537
maxConcurrentFileReads: maxConcurrentFileReads ?? 5,
15361538
settingsImportedAt: this.settingsImportedAt,
15371539
terminalCompressProgressBar: terminalCompressProgressBar ?? true,
@@ -1702,6 +1704,7 @@ export class ClineProvider
17021704
telemetrySetting: stateValues.telemetrySetting || "unset",
17031705
showRooIgnoredFiles: stateValues.showRooIgnoredFiles ?? true,
17041706
maxReadFileLine: stateValues.maxReadFileLine ?? -1,
1707+
maxReadFileTokens: stateValues.maxReadFileTokens ?? 10000,
17051708
maxConcurrentFileReads: stateValues.maxConcurrentFileReads ?? 5,
17061709
historyPreviewCollapsed: stateValues.historyPreviewCollapsed ?? false,
17071710
cloudUserInfo,

src/core/webview/webviewMessageHandler.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,10 @@ export const webviewMessageHandler = async (
12651265
await updateGlobalState("maxReadFileLine", message.value)
12661266
await provider.postStateToWebview()
12671267
break
1268+
case "maxReadFileTokens":
1269+
await updateGlobalState("maxReadFileTokens", message.value)
1270+
await provider.postStateToWebview()
1271+
break
12681272
case "maxConcurrentFileReads":
12691273
const valueToSave = message.value // Capture the value intended for saving
12701274
await updateGlobalState("maxConcurrentFileReads", valueToSave)
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// npx vitest run integrations/misc/__tests__/extract-text-token-based.spec.ts
2+
3+
import { describe, it, expect, vi, beforeEach, Mock } from "vitest"
4+
import * as fs from "fs/promises"
5+
import { Anthropic } from "@anthropic-ai/sdk"
6+
import { extractTextFromFile } from "../extract-text"
7+
import { countFileLines } from "../line-counter"
8+
import { readLines } from "../read-lines"
9+
import { isBinaryFile } from "isbinaryfile"
10+
import { countTokens } from "../../../utils/countTokens"
11+
12+
// Mock all dependencies
13+
vi.mock("fs/promises")
14+
vi.mock("../line-counter")
15+
vi.mock("../read-lines")
16+
vi.mock("isbinaryfile")
17+
vi.mock("../../../utils/countTokens")
18+
19+
describe("extractTextFromFile - Token-based Truncation", () => {
20+
// Type the mocks
21+
const mockedFs = vi.mocked(fs)
22+
const mockedCountFileLines = vi.mocked(countFileLines)
23+
const mockedReadLines = vi.mocked(readLines)
24+
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
25+
const mockedCountTokens = vi.mocked(countTokens)
26+
27+
beforeEach(() => {
28+
vi.clearAllMocks()
29+
// Set default mock behavior
30+
mockedFs.access.mockResolvedValue(undefined)
31+
mockedIsBinaryFile.mockResolvedValue(false)
32+
33+
// Mock countTokens to return a predictable token count
34+
mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
35+
// Simulate token counting based on text content
36+
const text = content
37+
.filter((block) => block.type === "text")
38+
.map((block) => (block as Anthropic.Messages.TextBlockParam).text)
39+
.join("")
40+
const words = text.split(/\s+/).length
41+
return Math.floor(words * 1.5)
42+
})
43+
})
44+
45+
it("should truncate files based on token count when maxReadFileTokens is provided", async () => {
46+
const fileContent = Array(100)
47+
.fill(null)
48+
.map((_, i) => `Line ${i + 1}: This is a test line with some content that has multiple words`)
49+
.join("\n")
50+
51+
mockedFs.readFile.mockResolvedValue(fileContent as any)
52+
53+
// Mock token counting to exceed limit after 50 lines
54+
let tokenCount = 0
55+
mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
56+
const text = content
57+
.filter((block) => block.type === "text")
58+
.map((block) => (block as Anthropic.Messages.TextBlockParam).text)
59+
.join("")
60+
const lines = text.split("\n").length
61+
// Each line has ~15 tokens, so 50 lines = 750 tokens
62+
tokenCount = lines * 15
63+
return tokenCount
64+
})
65+
66+
const result = await extractTextFromFile("/test/large-file.ts", -1, 750)
67+
68+
// Should truncate based on tokens, not lines
69+
expect(result).toContain("1 | Line 1:")
70+
expect(result).toContain("[File truncated")
71+
expect(result).toMatch(/\d+ of ~?\d+ tokens/)
72+
})
73+
74+
it("should not truncate when token count is within limit", async () => {
75+
const fileContent = Array(10)
76+
.fill(null)
77+
.map((_, i) => `Line ${i + 1}: Short content`)
78+
.join("\n")
79+
80+
mockedFs.readFile.mockResolvedValue(fileContent as any)
81+
82+
// Mock token counting to stay under limit
83+
mockedCountTokens.mockResolvedValue(100) // Well under 10000 default
84+
85+
const result = await extractTextFromFile("/test/small-file.ts", -1, 10000)
86+
87+
// Should include all content
88+
expect(result).toContain(" 1 | Line 1: Short content")
89+
expect(result).toContain("10 | Line 10: Short content")
90+
expect(result).not.toContain("[File truncated")
91+
})
92+
93+
it("should prioritize token-based truncation over line-based when both limits are set", async () => {
94+
const fileContent = Array(200)
95+
.fill(null)
96+
.map((_, i) => `Line ${i + 1}: This line has many words to increase token count significantly`)
97+
.join("\n")
98+
99+
mockedCountFileLines.mockResolvedValue(200)
100+
mockedFs.readFile.mockResolvedValue(fileContent as any)
101+
102+
// Mock to exceed token limit before line limit
103+
let callCount = 0
104+
mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
105+
callCount++
106+
const text = content
107+
.filter((block) => block.type === "text")
108+
.map((block) => (block as Anthropic.Messages.TextBlockParam).text)
109+
.join("")
110+
const lines = text.split("\n").length
111+
// Make it exceed token limit at ~30 lines (30 * 20 = 600 tokens)
112+
return lines * 20
113+
})
114+
115+
// maxReadFileLine=100, maxReadFileTokens=500
116+
const result = await extractTextFromFile("/test/file.ts", 100, 500)
117+
118+
// Should truncate based on tokens (500), not lines (100)
119+
expect(result).toContain("[File truncated")
120+
expect(result).toMatch(/\d+ of ~?\d+ tokens/)
121+
122+
// Should have stopped before reaching line limit
123+
const resultLines = result.split("\n").filter((line) => line.match(/^\s*\d+\s*\|/))
124+
expect(resultLines.length).toBeLessThan(100)
125+
})
126+
127+
it("should handle maxReadFileTokens of 0 by throwing an error", async () => {
128+
await expect(extractTextFromFile("/test/file.ts", -1, 0)).rejects.toThrow(
129+
"Invalid maxReadFileTokens: 0. Must be a positive integer or -1 for unlimited.",
130+
)
131+
})
132+
133+
it("should handle negative maxReadFileTokens by throwing an error", async () => {
134+
await expect(extractTextFromFile("/test/file.ts", -1, -100)).rejects.toThrow(
135+
"Invalid maxReadFileTokens: -100. Must be a positive integer or -1 for unlimited.",
136+
)
137+
})
138+
139+
it("should work with both line and token limits disabled", async () => {
140+
const fileContent = "Line 1\nLine 2\nLine 3"
141+
mockedFs.readFile.mockResolvedValue(fileContent as any)
142+
143+
const result = await extractTextFromFile("/test/file.ts", -1, undefined)
144+
145+
// Should include all content
146+
expect(result).toContain("1 | Line 1")
147+
expect(result).toContain("2 | Line 2")
148+
expect(result).toContain("3 | Line 3")
149+
expect(result).not.toContain("[File truncated")
150+
})
151+
152+
it("should handle empty files with token-based truncation", async () => {
153+
mockedFs.readFile.mockResolvedValue("" as any)
154+
mockedCountTokens.mockResolvedValue(0)
155+
156+
const result = await extractTextFromFile("/test/empty.ts", -1, 1000)
157+
158+
expect(result).toBe("")
159+
})
160+
161+
it("should efficiently handle very large token counts", async () => {
162+
// Simulate a file that would have millions of tokens
163+
const hugeContent = Array(10000)
164+
.fill(null)
165+
.map((_, i) => `Line ${i + 1}: ${Array(100).fill("word").join(" ")}`)
166+
.join("\n")
167+
168+
mockedFs.readFile.mockResolvedValue(hugeContent as any)
169+
170+
// Mock progressive token counting
171+
mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
172+
const text = content
173+
.filter((block) => block.type === "text")
174+
.map((block) => (block as Anthropic.Messages.TextBlockParam).text)
175+
.join("")
176+
const lines = text.split("\n").length
177+
return lines * 150 // Each line has ~150 tokens
178+
})
179+
180+
const result = await extractTextFromFile("/test/huge.ts", -1, 5000)
181+
182+
// Should truncate early based on tokens
183+
expect(result).toContain("[File truncated")
184+
expect(result).toMatch(/\d+ of ~?\d+ tokens/)
185+
186+
// Should have stopped processing early
187+
const resultLines = result.split("\n").filter((line) => line.match(/^\s*\d+\s*\|/))
188+
expect(resultLines.length).toBeLessThan(50) // Should stop around 33 lines (5000/150)
189+
})
190+
})

0 commit comments

Comments
 (0)