Skip to content

Commit f52f374

Browse files
committed
feat: improve readFileTool safeguard with character-based limiting and i18n
- Replace line-based limiting with character-based limiting to handle files with very long lines - Move threshold constants to packages/types/src/file-limits.ts for better organization - Add readLinesWithCharLimit function that truncates at complete line boundaries - Optimize file reading to avoid double reads when checking token count - Add i18n support for safeguard notice messages in all 18 supported languages - Update tests to match new character-based implementation - Safeguard now limits by character count (200KB default) instead of line count - Ensures files are never truncated in the middle of a line
1 parent 89790bc commit f52f374

File tree

24 files changed

+501
-50
lines changed

24 files changed

+501
-50
lines changed

packages/types/src/file-limits.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/**
2+
* File size and limit constants used across the application
3+
*/
4+
5+
/**
6+
* Files larger than this threshold will be checked for token count
7+
* to prevent consuming too much of the context window
8+
*/
9+
export const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB
10+
11+
/**
12+
* Files larger than this size will have the safeguard applied automatically
13+
* without token counting
14+
*/
15+
export const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB
16+
17+
/**
18+
* Default number of lines to read when applying the large file safeguard
19+
*/
20+
export const FALLBACK_MAX_LINES = 2000
21+
22+
/**
23+
* Maximum character count for file reading when safeguard is applied.
24+
* Based on typical token-to-character ratio (1 token ≈ 4 characters),
25+
* this ensures we don't consume too much of the context window.
26+
* For a 100k token context window at 50%, this would be ~200k characters.
27+
*/
28+
export const MAX_CHAR_LIMIT = 200_000 // 200k characters
29+
30+
/**
31+
* Percentage of the context window to use as the maximum token threshold
32+
* for file reading operations
33+
*/
34+
export const CONTEXT_WINDOW_PERCENTAGE = 0.5 // 50%
35+
36+
/**
37+
* Average characters per token ratio used for estimation
38+
*/
39+
export const CHARS_PER_TOKEN_RATIO = 4

packages/types/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ export * from "./api.js"
44
export * from "./codebase-index.js"
55
export * from "./cloud.js"
66
export * from "./experiment.js"
7+
export * from "./file-limits.js"
78
export * from "./followup.js"
89
export * from "./global-settings.js"
910
export * from "./history.js"

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import { stat } from "fs/promises"
55

66
import { countFileLines } from "../../../integrations/misc/line-counter"
77
import { readLines } from "../../../integrations/misc/read-lines"
8+
import { readLinesWithCharLimit } from "../../../integrations/misc/read-lines-char-limit"
89
import { extractTextFromFile } from "../../../integrations/misc/extract-text"
910
import { parseSourceCodeDefinitionsForFile } from "../../../services/tree-sitter"
1011
import { isBinaryFile } from "isbinaryfile"
@@ -33,6 +34,7 @@ vi.mock("isbinaryfile")
3334

3435
vi.mock("../../../integrations/misc/line-counter")
3536
vi.mock("../../../integrations/misc/read-lines")
37+
vi.mock("../../../integrations/misc/read-lines-char-limit")
3638

3739
// Mock input content for tests
3840
let mockInputContent = ""
@@ -655,7 +657,15 @@ describe("read_file tool with large file safeguard", () => {
655657
const partialContent = Array(2000).fill("This is a line of text").join("\n")
656658

657659
mockedExtractTextFromFile.mockResolvedValue(largeFileContent)
658-
mockedReadLines.mockResolvedValue(partialContent)
660+
661+
// Mock readLinesWithCharLimit
662+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
663+
mockedReadLinesWithCharLimit.mockResolvedValue({
664+
content: partialContent,
665+
linesRead: 2000,
666+
charactersRead: partialContent.length,
667+
wasTruncated: true,
668+
})
659669

660670
// Setup addLineNumbers mock for this test
661671
addLineNumbersMock.mockImplementation((text: string) => {
@@ -685,11 +695,10 @@ describe("read_file tool with large file safeguard", () => {
685695

686696
// Verify safeguard was applied
687697
expect(mockedTiktoken).toHaveBeenCalled()
688-
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
698+
expect(mockedReadLinesWithCharLimit).toHaveBeenCalled()
689699

690700
// Verify the result contains the safeguard notice
691-
expect(result).toContain("<notice>This file is 200KB and contains approximately 60,000 tokens")
692-
expect(result).toContain("Showing only the first 2000 lines to preserve context space")
701+
expect(result).toContain("<notice>readFile.safeguardNotice</notice>")
693702
expect(result).toContain(`<content lines="1-2000">`)
694703
})
695704

@@ -725,7 +734,8 @@ describe("read_file tool with large file safeguard", () => {
725734

726735
// Verify safeguard was NOT applied
727736
expect(mockedTiktoken).toHaveBeenCalled()
728-
expect(mockedReadLines).not.toHaveBeenCalled()
737+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
738+
expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled()
729739
expect(mockedExtractTextFromFile).toHaveBeenCalled()
730740

731741
// Verify no safeguard notice
@@ -765,7 +775,8 @@ describe("read_file tool with large file safeguard", () => {
765775

766776
// Verify tiktoken was NOT called (optimization)
767777
expect(mockedTiktoken).not.toHaveBeenCalled()
768-
expect(mockedReadLines).not.toHaveBeenCalled()
778+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
779+
expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled()
769780
expect(mockedExtractTextFromFile).toHaveBeenCalled()
770781

771782
// Verify no safeguard notice
@@ -778,7 +789,15 @@ describe("read_file tool with large file safeguard", () => {
778789
const partialContent = Array(2000).fill("This is a line of text").join("\n")
779790

780791
mockedExtractTextFromFile.mockResolvedValue("Large content")
781-
mockedReadLines.mockResolvedValue(partialContent)
792+
793+
// Mock readLinesWithCharLimit
794+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
795+
mockedReadLinesWithCharLimit.mockResolvedValue({
796+
content: partialContent,
797+
linesRead: 2000,
798+
charactersRead: partialContent.length,
799+
wasTruncated: true,
800+
})
782801

783802
// Setup addLineNumbers mock for partial content
784803
addLineNumbersMock.mockImplementation((text: string) => {
@@ -825,11 +844,10 @@ describe("read_file tool with large file safeguard", () => {
825844

826845
// Verify safeguard was applied despite token counting failure
827846
expect(mockedTiktoken).toHaveBeenCalled()
828-
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
847+
expect(mockedReadLinesWithCharLimit).toHaveBeenCalled()
829848

830-
// Verify the result contains the safeguard notice (without token count)
831-
expect(toolResult).toContain("<notice>This file is 2048KB")
832-
expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space")
849+
// Verify the result contains the safeguard notice
850+
expect(toolResult).toContain("<notice>readFile.safeguardNotice</notice>")
833851
expect(toolResult).toContain(`<content lines="1-2000">`)
834852
})
835853

@@ -861,8 +879,10 @@ describe("read_file tool with large file safeguard", () => {
861879
// Verify tiktoken was NOT called
862880
expect(mockedTiktoken).not.toHaveBeenCalled()
863881

864-
// The normal maxReadFileLine logic should apply
882+
// The normal maxReadFileLine logic should apply (using readLines, not readLinesWithCharLimit)
865883
expect(mockedReadLines).toHaveBeenCalled()
884+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
885+
expect(mockedReadLinesWithCharLimit).not.toHaveBeenCalled()
866886
})
867887

868888
it("should handle line ranges correctly with safeguard", async () => {
@@ -955,10 +975,16 @@ describe("read_file tool with large file safeguard", () => {
955975
}),
956976
}
957977
mockedExtractTextFromFile.mockResolvedValue("content")
958-
mockedReadLines.mockResolvedValue("partial content")
978+
const mockedReadLinesWithCharLimit = vi.mocked(readLinesWithCharLimit)
979+
mockedReadLinesWithCharLimit.mockResolvedValue({
980+
content: "partial content",
981+
linesRead: 2000,
982+
charactersRead: 50000,
983+
wasTruncated: true,
984+
})
959985
await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 50001 })
960-
expect(mockedReadLines).toHaveBeenCalled()
961-
expect(toolResult).toContain("preserve context space")
986+
expect(mockedReadLinesWithCharLimit).toHaveBeenCalled()
987+
expect(toolResult).toContain("<notice>readFile.safeguardNotice</notice>")
962988
})
963989
})
964990
})

src/core/tools/readFileTool.ts

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,15 @@ import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "
1616
import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter"
1717
import { parseXml } from "../../utils/xml"
1818
import { tiktoken } from "../../utils/tiktoken"
19+
import {
20+
LARGE_FILE_SIZE_THRESHOLD,
21+
VERY_LARGE_FILE_SIZE,
22+
FALLBACK_MAX_LINES,
23+
CONTEXT_WINDOW_PERCENTAGE,
24+
MAX_CHAR_LIMIT,
25+
CHARS_PER_TOKEN_RATIO,
26+
} from "@roo-code/types"
27+
import { readLinesWithCharLimit } from "../../integrations/misc/read-lines-char-limit"
1928

2029
export function getReadFileToolDescription(blockName: string, blockParams: any): string {
2130
// Handle both single path and multiple files via args
@@ -519,19 +528,16 @@ export async function readFileTool(
519528
}
520529

521530
// Handle normal file read with safeguard for large files
522-
// Define thresholds for the safeguard
523-
const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB - files larger than this will be checked for token count
524-
const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB - apply safeguard automatically
525-
const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard
526-
527531
// Get the actual context window size from the model
528532
const contextWindow = cline.api.getModel().info.contextWindow || 100000 // Default to 100k if not available
529-
const MAX_TOKEN_THRESHOLD = Math.floor(contextWindow * 0.5) // Use 50% of the actual context window
533+
const MAX_TOKEN_THRESHOLD = Math.floor(contextWindow * CONTEXT_WINDOW_PERCENTAGE)
534+
const MAX_CHAR_THRESHOLD = MAX_TOKEN_THRESHOLD * CHARS_PER_TOKEN_RATIO
530535

531536
// Check if we should apply the safeguard
532537
let shouldApplySafeguard = false
533538
let safeguardNotice = ""
534-
let linesToRead = totalLines
539+
let fullContent: string | null = null
540+
let actualLinesRead = totalLines
535541

536542
if (maxReadFileLine === -1) {
537543
// Get file size
@@ -541,22 +547,22 @@ export async function readFileTool(
541547
if (fileStats.size > LARGE_FILE_SIZE_THRESHOLD) {
542548
// File is large enough to warrant token count check
543549
try {
544-
const fullContent = await extractTextFromFile(fullPath)
550+
// Read the full content once
551+
fullContent = await extractTextFromFile(fullPath)
545552
const tokenCount = await tiktoken([{ type: "text", text: fullContent }])
546553

547554
if (tokenCount > MAX_TOKEN_THRESHOLD) {
548555
shouldApplySafeguard = true
549-
linesToRead = FALLBACK_MAX_LINES
550-
safeguardNotice = `<notice>This file is ${fileSizeKB}KB and contains approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
556+
// Clear fullContent to avoid using it when we need partial content
557+
fullContent = null
551558
}
559+
// If tokenCount <= MAX_TOKEN_THRESHOLD, we keep fullContent to reuse it
552560
} catch (error) {
553561
// If token counting fails, apply safeguard based on file size alone
554562
console.warn(`Failed to count tokens for large file ${relPath}:`, error)
555563
if (fileStats.size > VERY_LARGE_FILE_SIZE) {
556564
// For very large files (>1MB), apply safeguard anyway
557565
shouldApplySafeguard = true
558-
linesToRead = FALLBACK_MAX_LINES
559-
safeguardNotice = `<notice>This file is ${fileSizeKB}KB, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
560566
}
561567
}
562568
}
@@ -566,12 +572,32 @@ export async function readFileTool(
566572
let lineRangeAttr: string
567573

568574
if (shouldApplySafeguard) {
569-
// Read partial file with safeguard
570-
content = addLineNumbers(await readLines(fullPath, linesToRead - 1, 0))
571-
lineRangeAttr = ` lines="1-${linesToRead}"`
575+
// Read partial file with character-based safeguard
576+
// Use the smaller of MAX_CHAR_LIMIT or the calculated character threshold
577+
const charLimit = Math.min(MAX_CHAR_LIMIT, MAX_CHAR_THRESHOLD)
578+
const result = await readLinesWithCharLimit(fullPath, charLimit)
579+
580+
content = addLineNumbers(result.content, 1)
581+
actualLinesRead = result.linesRead
582+
lineRangeAttr = ` lines="1-${actualLinesRead}"`
583+
584+
const fileStats = await stat(fullPath)
585+
const fileSizeKB = Math.round(fileStats.size / 1024)
586+
587+
if (result.wasTruncated) {
588+
safeguardNotice = `<notice>${t("tools:readFile.safeguardNotice", {
589+
fileSizeKB,
590+
actualLinesRead,
591+
charactersRead: result.charactersRead.toLocaleString(),
592+
})}</notice>\n`
593+
}
572594
} else {
573-
// Read full file as normal
574-
content = await extractTextFromFile(fullPath)
595+
// Read full file - reuse fullContent if we already have it
596+
if (fullContent !== null) {
597+
content = fullContent
598+
} else {
599+
content = await extractTextFromFile(fullPath)
600+
}
575601
lineRangeAttr = ` lines="1-${totalLines}"`
576602
}
577603

src/i18n/locales/ca/tools.json

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/i18n/locales/de/tools.json

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/i18n/locales/en/tools.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"readFile": {
33
"linesRange": " (lines {{start}}-{{end}})",
44
"definitionsOnly": " (definitions only)",
5-
"maxLines": " (max {{max}} lines)"
5+
"maxLines": " (max {{max}} lines)",
6+
"safeguardNotice": "This file is {{fileSizeKB}}KB and would consume a significant portion of the context window. Showing only the first {{actualLinesRead}} complete lines ({{charactersRead}} characters) to preserve context space. Use line_range if you need to read specific sections."
67
},
78
"toolRepetitionLimitReached": "Roo appears to be stuck in a loop, attempting the same action ({{toolName}}) repeatedly. This might indicate a problem with its current strategy. Consider rephrasing the task, providing more specific instructions, or guiding it towards a different approach.",
89
"codebaseSearch": {

src/i18n/locales/es/tools.json

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/i18n/locales/fr/tools.json

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/i18n/locales/hi/tools.json

Lines changed: 2 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)