Skip to content

Commit 93c13e2

Browse files
feat: add token-budget based file reading with intelligent preview (#8789)
Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
1 parent b9110dc commit 93c13e2

File tree

6 files changed

+827
-132
lines changed

6 files changed

+827
-132
lines changed

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,13 @@ function createMockCline(): any {
201201
recordToolUsage: vi.fn().mockReturnValue(undefined),
202202
recordToolError: vi.fn().mockReturnValue(undefined),
203203
didRejectTool: false,
204+
getTokenUsage: vi.fn().mockReturnValue({
205+
contextTokens: 10000,
206+
}),
204207
// CRITICAL: Always ensure image support is enabled
205208
api: {
206209
getModel: vi.fn().mockReturnValue({
207-
info: { supportsImages: true },
210+
info: { supportsImages: true, contextWindow: 200000 },
208211
}),
209212
},
210213
}
Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"
2+
import {
3+
validateFileTokenBudget,
4+
truncateFileContent,
5+
FILE_SIZE_THRESHOLD,
6+
MAX_FILE_SIZE_FOR_TOKENIZATION,
7+
PREVIEW_SIZE_FOR_LARGE_FILES,
8+
} from "../fileTokenBudget"
9+
10+
// Mock dependencies
11+
vi.mock("fs/promises", () => ({
12+
stat: vi.fn(),
13+
readFile: vi.fn(),
14+
open: vi.fn(),
15+
}))
16+
17+
vi.mock("../../../../utils/countTokens", () => ({
18+
countTokens: vi.fn(),
19+
}))
20+
21+
// Import after mocking
22+
const fs = await import("fs/promises")
23+
const { countTokens } = await import("../../../../utils/countTokens")
24+
25+
const mockStat = vi.mocked(fs.stat)
26+
const mockReadFile = vi.mocked(fs.readFile)
27+
const mockOpen = vi.mocked(fs.open)
28+
const mockCountTokens = vi.mocked(countTokens)
29+
30+
describe("fileTokenBudget", () => {
31+
beforeEach(() => {
32+
vi.clearAllMocks()
33+
mockOpen.mockReset()
34+
})
35+
36+
afterEach(() => {
37+
vi.restoreAllMocks()
38+
})
39+
40+
describe("validateFileTokenBudget", () => {
41+
it("should not truncate files smaller than FILE_SIZE_THRESHOLD", async () => {
42+
const filePath = "/test/small-file.txt"
43+
const contextWindow = 200000
44+
const currentTokens = 10000
45+
46+
// Mock file stats - small file (50KB)
47+
mockStat.mockResolvedValue({
48+
size: 50000,
49+
} as any)
50+
51+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
52+
53+
expect(result.shouldTruncate).toBe(false)
54+
expect(mockReadFile).not.toHaveBeenCalled()
55+
expect(mockCountTokens).not.toHaveBeenCalled()
56+
})
57+
58+
it("should validate and not truncate large files that fit within budget", async () => {
59+
const filePath = "/test/large-file.txt"
60+
const contextWindow = 200000
61+
const currentTokens = 10000
62+
const fileContent = "x".repeat(150000) // 150KB file
63+
64+
// Mock file stats - large file (150KB)
65+
mockStat.mockResolvedValue({
66+
size: 150000,
67+
} as any)
68+
69+
// Mock file read
70+
mockReadFile.mockResolvedValue(fileContent)
71+
72+
// Mock token counting - file uses 30k tokens (within 60% of 190k remaining = 114k budget)
73+
mockCountTokens.mockResolvedValue(30000)
74+
75+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
76+
77+
expect(result.shouldTruncate).toBe(false)
78+
expect(mockReadFile).toHaveBeenCalledWith(filePath, "utf-8")
79+
expect(mockCountTokens).toHaveBeenCalled()
80+
})
81+
82+
it("should truncate large files that exceed token budget", async () => {
83+
const filePath = "/test/huge-file.txt"
84+
const contextWindow = 200000
85+
const currentTokens = 10000
86+
const fileContent = "x".repeat(500000) // 500KB file
87+
88+
// Mock file stats - huge file (500KB)
89+
mockStat.mockResolvedValue({
90+
size: 500000,
91+
} as any)
92+
93+
// Mock file read
94+
mockReadFile.mockResolvedValue(fileContent)
95+
96+
// Mock token counting - file uses 150k tokens (exceeds 60% of 190k remaining = 114k budget)
97+
mockCountTokens.mockResolvedValue(150000)
98+
99+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
100+
101+
expect(result.shouldTruncate).toBe(true)
102+
expect(result.maxChars).toBeDefined()
103+
expect(result.maxChars).toBeGreaterThan(0)
104+
expect(result.reason).toContain("150000 tokens")
105+
expect(result.reason).toContain("114000 tokens available")
106+
})
107+
108+
it("should handle case where no budget is available", async () => {
109+
const filePath = "/test/file.txt"
110+
const contextWindow = 200000
111+
const currentTokens = 200000 // Context is full
112+
113+
// Mock file stats - large file
114+
mockStat.mockResolvedValue({
115+
size: 150000,
116+
} as any)
117+
118+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
119+
120+
expect(result.shouldTruncate).toBe(true)
121+
expect(result.maxChars).toBe(0)
122+
expect(result.reason).toContain("No available context budget")
123+
})
124+
125+
it("should handle errors gracefully and not truncate", async () => {
126+
const filePath = "/test/error-file.txt"
127+
const contextWindow = 200000
128+
const currentTokens = 10000
129+
130+
// Mock file stats to throw an error
131+
mockStat.mockRejectedValue(new Error("File not found"))
132+
133+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
134+
135+
expect(result.shouldTruncate).toBe(false)
136+
})
137+
138+
it("should calculate correct token budget with 60/40 split", async () => {
139+
const filePath = "/test/file.txt"
140+
const contextWindow = 100000
141+
const currentTokens = 20000 // 80k remaining
142+
const fileContent = "test content"
143+
144+
mockStat.mockResolvedValue({ size: 150000 } as any)
145+
mockReadFile.mockResolvedValue(fileContent)
146+
147+
// Available budget should be: (100000 - 20000) * 0.6 = 48000
148+
// File uses 50k tokens, should be truncated
149+
mockCountTokens.mockResolvedValue(50000)
150+
151+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
152+
153+
expect(result.shouldTruncate).toBe(true)
154+
// maxChars should be exactly 48000 * 3 = 144000
155+
expect(result.maxChars).toBe(144000)
156+
})
157+
158+
it("should validate files at the FILE_SIZE_THRESHOLD boundary", async () => {
159+
const filePath = "/test/boundary-file.txt"
160+
const contextWindow = 200000
161+
const currentTokens = 10000
162+
const fileContent = "x".repeat(1000)
163+
164+
// Mock file stats - exactly at threshold (should trigger validation)
165+
mockStat.mockResolvedValue({
166+
size: FILE_SIZE_THRESHOLD,
167+
} as any)
168+
169+
mockReadFile.mockResolvedValue(fileContent)
170+
mockCountTokens.mockResolvedValue(30000) // Within budget
171+
172+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
173+
174+
// At exactly the threshold, it should validate
175+
expect(mockReadFile).toHaveBeenCalled()
176+
expect(mockCountTokens).toHaveBeenCalled()
177+
expect(result.shouldTruncate).toBe(false)
178+
})
179+
180+
it("should provide preview for files exceeding MAX_FILE_SIZE_FOR_TOKENIZATION", async () => {
181+
const filePath = "/test/huge-file.txt"
182+
const contextWindow = 200000
183+
const currentTokens = 10000
184+
const previewContent = "x".repeat(PREVIEW_SIZE_FOR_LARGE_FILES)
185+
186+
// Mock file stats - file is 1MB larger than MAX_FILE_SIZE_FOR_TOKENIZATION, so it exceeds the tokenization limit
187+
mockStat.mockResolvedValue({
188+
size: MAX_FILE_SIZE_FOR_TOKENIZATION + 1000000, // 1MB over the limit
189+
} as any)
190+
191+
// Mock fs.open and the returned file handle's read/close for the preview
192+
const mockRead = vi.fn().mockResolvedValue({
193+
bytesRead: PREVIEW_SIZE_FOR_LARGE_FILES,
194+
})
195+
const mockClose = vi.fn().mockResolvedValue(undefined)
196+
mockOpen.mockResolvedValue({
197+
read: mockRead,
198+
close: mockClose,
199+
} as any)
200+
201+
// Mock token counting for the preview
202+
mockCountTokens.mockResolvedValue(30000) // Preview fits within budget
203+
204+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
205+
206+
expect(result.shouldTruncate).toBe(true)
207+
expect(result.isPreview).toBe(true)
208+
expect(result.reason).toContain("too large")
209+
expect(result.reason).toContain("preview")
210+
// Should read preview and count tokens
211+
expect(mockOpen).toHaveBeenCalled()
212+
expect(mockCountTokens).toHaveBeenCalled()
213+
})
214+
215+
it("should handle files exactly at MAX_FILE_SIZE_FOR_TOKENIZATION boundary", async () => {
216+
const filePath = "/test/boundary-file.txt"
217+
const contextWindow = 200000
218+
const currentTokens = 10000
219+
const fileContent = "x".repeat(1000)
220+
221+
// Mock file stats - exactly at max size
222+
mockStat.mockResolvedValue({
223+
size: MAX_FILE_SIZE_FOR_TOKENIZATION,
224+
} as any)
225+
226+
mockReadFile.mockResolvedValue(fileContent)
227+
mockCountTokens.mockResolvedValue(30000) // Within budget
228+
229+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
230+
231+
// At exactly the limit, should still attempt to tokenize
232+
expect(mockReadFile).toHaveBeenCalled()
233+
expect(mockCountTokens).toHaveBeenCalled()
234+
})
235+
236+
it("should handle tokenizer unreachable errors gracefully", async () => {
237+
const filePath = "/test/problematic-file.txt"
238+
const contextWindow = 200000
239+
const currentTokens = 10000
240+
const fileContent = "x".repeat(200000) // Content that might cause issues
241+
242+
// Mock file stats - within size limits but content causes tokenizer crash
243+
mockStat.mockResolvedValue({
244+
size: 200000,
245+
} as any)
246+
247+
mockReadFile.mockResolvedValue(fileContent)
248+
// Simulate tokenizer "unreachable" error
249+
mockCountTokens.mockRejectedValue(new Error("unreachable"))
250+
251+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
252+
253+
// Should fall back to a conservative estimation
254+
const remainingTokens = contextWindow - currentTokens
255+
const safeReadBudget = Math.floor(remainingTokens * 0.6) // 114000
256+
257+
expect(result.shouldTruncate).toBe(true)
258+
expect(result.isPreview).toBe(true)
259+
expect(result.reason).toContain("tokenizer error")
260+
261+
// The actual maxChars depends on conservative estimation
262+
// content.length (200000) is used as estimate since tokenizer failed
263+
expect(result.maxChars).toBeDefined()
264+
expect(typeof result.maxChars).toBe("number")
265+
})
266+
267+
it("should handle other tokenizer errors conservatively", async () => {
268+
const filePath = "/test/error-file.txt"
269+
const contextWindow = 200000
270+
const currentTokens = 10000
271+
const fileContent = "test content"
272+
273+
mockStat.mockResolvedValue({ size: 150000 } as any)
274+
mockReadFile.mockResolvedValue(fileContent)
275+
// Simulate a different error
276+
mockCountTokens.mockRejectedValue(new Error("Network error"))
277+
278+
const result = await validateFileTokenBudget(filePath, contextWindow, currentTokens)
279+
280+
// Should return safe fallback (don't truncate, let normal error handling take over)
281+
expect(result.shouldTruncate).toBe(false)
282+
})
283+
})
284+
285+
describe("truncateFileContent", () => {
286+
it("should truncate content to specified character limit", () => {
287+
const content = "a".repeat(1000)
288+
const maxChars = 500
289+
const totalChars = 1000
290+
291+
const result = truncateFileContent(content, maxChars, totalChars, false)
292+
293+
expect(result.content).toHaveLength(500)
294+
expect(result.content).toBe("a".repeat(500))
295+
expect(result.notice).toContain("500 of 1000 characters")
296+
expect(result.notice).toContain("context limitations")
297+
})
298+
299+
it("should show preview message for large files", () => {
300+
const content = "x".repeat(10000000) // ~10MB (9.54MB in binary)
301+
const maxChars = 100000 // 100KB preview
302+
const totalChars = 10000000
303+
304+
const result = truncateFileContent(content, maxChars, totalChars, true)
305+
306+
expect(result.content).toHaveLength(maxChars)
307+
expect(result.notice).toContain("Preview")
308+
expect(result.notice).toContain("0.1MB") // 100KB = 0.1MB
309+
expect(result.notice).toContain("9.54MB") // Binary MB calculation
310+
expect(result.notice).toContain("line_range")
311+
})
312+
313+
it("should include helpful notice about using line_range", () => {
314+
const content = "test content that is very long"
315+
const maxChars = 10
316+
const totalChars = 31
317+
318+
const result = truncateFileContent(content, maxChars, totalChars)
319+
320+
expect(result.notice).toContain("line_range")
321+
expect(result.notice).toContain("specific sections")
322+
})
323+
324+
it("should handle empty content", () => {
325+
const content = ""
326+
const maxChars = 100
327+
const totalChars = 0
328+
329+
const result = truncateFileContent(content, maxChars, totalChars)
330+
331+
expect(result.content).toBe("")
332+
expect(result.notice).toContain("0 of 0 characters")
333+
})
334+
335+
it("should truncate multi-line content correctly", () => {
336+
const content = "line1\nline2\nline3\nline4\nline5"
337+
const maxChars = 15
338+
const totalChars = content.length
339+
340+
const result = truncateFileContent(content, maxChars, totalChars)
341+
342+
expect(result.content).toBe("line1\nline2\nlin")
343+
expect(result.content).toHaveLength(15)
344+
})
345+
346+
it("should work with unicode characters", () => {
347+
const content = "Hello 😀 World 🌍 Test 🎉"
348+
const maxChars = 10
349+
const totalChars = content.length
350+
351+
const result = truncateFileContent(content, maxChars, totalChars)
352+
353+
expect(result.content).toHaveLength(10)
354+
expect(result.notice).toBeDefined()
355+
})
356+
})
357+
})

0 commit comments

Comments
 (0)