Skip to content

Commit 595afb7

Browse files
committed
feat: add safeguard for large files in readFileTool when maxReadFileLine is -1
- Add token counting check using tiktoken for files over 1000 lines - Automatically switch to partial read (first 2000 lines) when token count exceeds 50k - Add fallback safeguard for very large files (>5000 lines) when token counting fails - Include informative notice explaining why partial read is being used - Add comprehensive test coverage for all safeguard scenarios This prevents consuming the entire context window when reading very large files.
1 parent a824557 commit 595afb7

File tree

2 files changed

+380
-8
lines changed

2 files changed

+380
-8
lines changed

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 329 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import { isBinaryFile } from "isbinaryfile"
1010
import { ReadFileToolUse, ToolParamName, ToolResponse } from "../../../shared/tools"
1111
import { readFileTool } from "../readFileTool"
1212
import { formatResponse } from "../../prompts/responses"
13+
import { tiktoken } from "../../../utils/tiktoken"
1314

1415
vi.mock("path", async () => {
1516
const originalPath = await vi.importActual("path")
@@ -35,18 +36,27 @@ vi.mock("../../../integrations/misc/read-lines")
3536
let mockInputContent = ""
3637

3738
// First create all the mocks
38-
vi.mock("../../../integrations/misc/extract-text")
39+
vi.mock("../../../integrations/misc/extract-text", () => ({
40+
extractTextFromFile: vi.fn(),
41+
addLineNumbers: vi.fn(),
42+
getSupportedBinaryFormats: vi.fn(() => [".pdf", ".docx", ".ipynb"]),
43+
}))
3944
vi.mock("../../../services/tree-sitter")
45+
vi.mock("../../../utils/tiktoken")
46+
47+
// Import the mocked functions
48+
import { addLineNumbers, getSupportedBinaryFormats } from "../../../integrations/misc/extract-text"
4049

4150
// Then create the mock functions
42-
const addLineNumbersMock = vi.fn().mockImplementation((text, startLine = 1) => {
51+
const addLineNumbersMock = vi.mocked(addLineNumbers)
52+
addLineNumbersMock.mockImplementation((text: string, startLine = 1) => {
4353
if (!text) return ""
4454
const lines = typeof text === "string" ? text.split("\n") : [text]
45-
return lines.map((line, i) => `${startLine + i} | ${line}`).join("\n")
55+
return lines.map((line: string, i: number) => `${startLine + i} | ${line}`).join("\n")
4656
})
4757

48-
const extractTextFromFileMock = vi.fn()
49-
const getSupportedBinaryFormatsMock = vi.fn(() => [".pdf", ".docx", ".ipynb"])
58+
const extractTextFromFileMock = vi.mocked(extractTextFromFile)
59+
const getSupportedBinaryFormatsMock = vi.mocked(getSupportedBinaryFormats)
5060

5161
vi.mock("../../ignore/RooIgnoreController", () => ({
5262
RooIgnoreController: class {
@@ -520,3 +530,317 @@ describe("read_file tool XML output structure", () => {
520530
})
521531
})
522532
})
533+
534+
describe("read_file tool with large file safeguard", () => {
535+
// Test data
536+
const testFilePath = "test/largefile.txt"
537+
const absoluteFilePath = "/test/largefile.txt"
538+
539+
// Mocked functions
540+
const mockedCountFileLines = vi.mocked(countFileLines)
541+
const mockedReadLines = vi.mocked(readLines)
542+
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
543+
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
544+
const mockedPathResolve = vi.mocked(path.resolve)
545+
const mockedTiktoken = vi.mocked(tiktoken)
546+
547+
const mockCline: any = {}
548+
let mockProvider: any
549+
let toolResult: ToolResponse | undefined
550+
551+
beforeEach(() => {
552+
vi.clearAllMocks()
553+
554+
mockedPathResolve.mockReturnValue(absoluteFilePath)
555+
mockedIsBinaryFile.mockResolvedValue(false)
556+
557+
mockProvider = {
558+
getState: vi.fn(),
559+
deref: vi.fn().mockReturnThis(),
560+
}
561+
562+
mockCline.cwd = "/"
563+
mockCline.task = "Test"
564+
mockCline.providerRef = mockProvider
565+
mockCline.rooIgnoreController = {
566+
validateAccess: vi.fn().mockReturnValue(true),
567+
}
568+
mockCline.say = vi.fn().mockResolvedValue(undefined)
569+
mockCline.ask = vi.fn().mockResolvedValue({ response: "yesButtonClicked" })
570+
mockCline.fileContextTracker = {
571+
trackFileContext: vi.fn().mockResolvedValue(undefined),
572+
}
573+
mockCline.recordToolUsage = vi.fn().mockReturnValue(undefined)
574+
mockCline.recordToolError = vi.fn().mockReturnValue(undefined)
575+
576+
toolResult = undefined
577+
})
578+
579+
async function executeReadFileTool(
580+
params: Partial<ReadFileToolUse["params"]> = {},
581+
options: {
582+
maxReadFileLine?: number
583+
totalLines?: number
584+
tokenCount?: number
585+
} = {},
586+
): Promise<ToolResponse | undefined> {
587+
const maxReadFileLine = options.maxReadFileLine ?? -1
588+
const totalLines = options.totalLines ?? 5
589+
const tokenCount = options.tokenCount ?? 100
590+
591+
mockProvider.getState.mockResolvedValue({ maxReadFileLine })
592+
mockedCountFileLines.mockResolvedValue(totalLines)
593+
mockedTiktoken.mockResolvedValue(tokenCount)
594+
595+
const argsContent = `<file><path>${testFilePath}</path></file>`
596+
597+
const toolUse: ReadFileToolUse = {
598+
type: "tool_use",
599+
name: "read_file",
600+
params: { args: argsContent, ...params },
601+
partial: false,
602+
}
603+
604+
await readFileTool(
605+
mockCline,
606+
toolUse,
607+
mockCline.ask,
608+
vi.fn(),
609+
(result: ToolResponse) => {
610+
toolResult = result
611+
},
612+
(_: ToolParamName, content?: string) => content ?? "",
613+
)
614+
615+
return toolResult
616+
}
617+
618+
describe("when file has many lines and high token count", () => {
619+
it("should apply safeguard and read only first 2000 lines", async () => {
620+
// Setup - large file with high token count
621+
const largeFileContent = Array(1500).fill("This is a line of text").join("\n")
622+
const partialContent = Array(2000).fill("This is a line of text").join("\n")
623+
624+
mockedExtractTextFromFile.mockResolvedValue(largeFileContent)
625+
mockedReadLines.mockResolvedValue(partialContent)
626+
627+
// Setup addLineNumbers mock for this test
628+
addLineNumbersMock.mockImplementation((text: string) => {
629+
const lines = text.split("\n")
630+
return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n")
631+
})
632+
633+
// Execute with high line count and token count
634+
const result = await executeReadFileTool(
635+
{},
636+
{
637+
maxReadFileLine: -1,
638+
totalLines: 1500,
639+
tokenCount: 60000, // Above threshold
640+
},
641+
)
642+
643+
// Verify safeguard was applied
644+
expect(mockedTiktoken).toHaveBeenCalled()
645+
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
646+
647+
// Verify the result contains the safeguard notice
648+
expect(result).toContain("<notice>This file contains 1500 lines and approximately 60,000 tokens")
649+
expect(result).toContain("Showing only the first 2000 lines to preserve context space")
650+
expect(result).toContain(`<content lines="1-2000">`)
651+
})
652+
653+
it("should not apply safeguard when token count is below threshold", async () => {
654+
// Setup - large file but with low token count
655+
const fileContent = Array(1500).fill("Short").join("\n")
656+
const numberedContent = fileContent
657+
.split("\n")
658+
.map((line, i) => `${i + 1} | ${line}`)
659+
.join("\n")
660+
661+
mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent))
662+
663+
// Execute with high line count but low token count
664+
const result = await executeReadFileTool(
665+
{},
666+
{
667+
maxReadFileLine: -1,
668+
totalLines: 1500,
669+
tokenCount: 30000, // Below threshold
670+
},
671+
)
672+
673+
// Verify safeguard was NOT applied
674+
expect(mockedTiktoken).toHaveBeenCalled()
675+
expect(mockedReadLines).not.toHaveBeenCalled()
676+
expect(mockedExtractTextFromFile).toHaveBeenCalled()
677+
678+
// Verify no safeguard notice
679+
expect(result).not.toContain("preserve context space")
680+
expect(result).toContain(`<content lines="1-1500">`)
681+
})
682+
683+
it("should not apply safeguard for files under 1000 lines", async () => {
684+
// Setup - file with less than 1000 lines
685+
const fileContent = Array(999).fill("This is a line of text").join("\n")
686+
const numberedContent = fileContent
687+
.split("\n")
688+
.map((line, i) => `${i + 1} | ${line}`)
689+
.join("\n")
690+
691+
mockedExtractTextFromFile.mockImplementation(() => Promise.resolve(numberedContent))
692+
693+
// Execute
694+
const result = await executeReadFileTool(
695+
{},
696+
{
697+
maxReadFileLine: -1,
698+
totalLines: 999,
699+
tokenCount: 100000, // Even with high token count
700+
},
701+
)
702+
703+
// Verify tiktoken was NOT called (optimization)
704+
expect(mockedTiktoken).not.toHaveBeenCalled()
705+
expect(mockedReadLines).not.toHaveBeenCalled()
706+
expect(mockedExtractTextFromFile).toHaveBeenCalled()
707+
708+
// Verify no safeguard notice
709+
expect(result).not.toContain("preserve context space")
710+
expect(result).toContain(`<content lines="1-999">`)
711+
})
712+
713+
it("should apply safeguard for very large files even if token counting fails", async () => {
714+
// Setup - very large file and token counting fails
715+
const partialContent = Array(2000).fill("This is a line of text").join("\n")
716+
717+
mockedExtractTextFromFile.mockResolvedValue("Large content")
718+
mockedReadLines.mockResolvedValue(partialContent)
719+
720+
// Setup addLineNumbers mock for partial content
721+
addLineNumbersMock.mockImplementation((text: string) => {
722+
const lines = text.split("\n")
723+
return lines.map((line: string, i: number) => `${i + 1} | ${line}`).join("\n")
724+
})
725+
726+
// Set up the provider state
727+
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
728+
mockedCountFileLines.mockResolvedValue(6000)
729+
730+
// IMPORTANT: Set up tiktoken to reject AFTER other mocks are set
731+
mockedTiktoken.mockRejectedValue(new Error("Token counting failed"))
732+
733+
const argsContent = `<file><path>${testFilePath}</path></file>`
734+
735+
const toolUse: ReadFileToolUse = {
736+
type: "tool_use",
737+
name: "read_file",
738+
params: { args: argsContent },
739+
partial: false,
740+
}
741+
742+
await readFileTool(
743+
mockCline,
744+
toolUse,
745+
mockCline.ask,
746+
vi.fn(),
747+
(result: ToolResponse) => {
748+
toolResult = result
749+
},
750+
(_: ToolParamName, content?: string) => content ?? "",
751+
)
752+
753+
// Verify safeguard was applied despite token counting failure
754+
expect(mockedTiktoken).toHaveBeenCalled()
755+
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
756+
757+
// Verify the result contains the safeguard notice (without token count)
758+
expect(toolResult).toContain("<notice>This file contains 6000 lines")
759+
expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space")
760+
expect(toolResult).toContain(`<content lines="1-2000">`)
761+
})
762+
763+
it("should not apply safeguard when maxReadFileLine is not -1", async () => {
764+
// Setup
765+
const fileContent = Array(2000).fill("This is a line of text").join("\n")
766+
mockedExtractTextFromFile.mockResolvedValue(fileContent)
767+
768+
// Execute with maxReadFileLine = 500 (not -1)
769+
const result = await executeReadFileTool(
770+
{},
771+
{
772+
maxReadFileLine: 500,
773+
totalLines: 2000,
774+
tokenCount: 100000,
775+
},
776+
)
777+
778+
// Verify tiktoken was NOT called
779+
expect(mockedTiktoken).not.toHaveBeenCalled()
780+
781+
// The normal maxReadFileLine logic should apply
782+
expect(mockedReadLines).toHaveBeenCalled()
783+
})
784+
785+
it("should handle line ranges correctly with safeguard", async () => {
786+
// When line ranges are specified, safeguard should not apply
787+
const rangeContent = "Line 100\nLine 101\nLine 102"
788+
mockedReadLines.mockResolvedValue(rangeContent)
789+
790+
const argsContent = `<file><path>${testFilePath}</path><line_range>100-102</line_range></file>`
791+
792+
const toolUse: ReadFileToolUse = {
793+
type: "tool_use",
794+
name: "read_file",
795+
params: { args: argsContent },
796+
partial: false,
797+
}
798+
799+
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
800+
mockedCountFileLines.mockResolvedValue(10000)
801+
802+
await readFileTool(
803+
mockCline,
804+
toolUse,
805+
mockCline.ask,
806+
vi.fn(),
807+
(result: ToolResponse) => {
808+
toolResult = result
809+
},
810+
(_: ToolParamName, content?: string) => content ?? "",
811+
)
812+
813+
// Verify tiktoken was NOT called for range reads
814+
expect(mockedTiktoken).not.toHaveBeenCalled()
815+
expect(toolResult).toContain(`<content lines="100-102">`)
816+
expect(toolResult).not.toContain("preserve context space")
817+
})
818+
})
819+
820+
describe("safeguard thresholds", () => {
821+
it("should use correct thresholds for line count and token count", async () => {
822+
// Test boundary conditions
823+
824+
// Just below line threshold - no token check
825+
await executeReadFileTool({}, { totalLines: 1000, maxReadFileLine: -1 })
826+
expect(mockedTiktoken).not.toHaveBeenCalled()
827+
828+
// Just above line threshold - token check performed
829+
vi.clearAllMocks()
830+
mockedExtractTextFromFile.mockResolvedValue("content")
831+
await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 40000 })
832+
expect(mockedTiktoken).toHaveBeenCalled()
833+
834+
// Token count just below threshold - no safeguard
835+
expect(toolResult).not.toContain("preserve context space")
836+
837+
// Token count just above threshold - safeguard applied
838+
vi.clearAllMocks()
839+
mockedExtractTextFromFile.mockResolvedValue("content")
840+
mockedReadLines.mockResolvedValue("partial content")
841+
await executeReadFileTool({}, { totalLines: 1001, maxReadFileLine: -1, tokenCount: 50001 })
842+
expect(mockedReadLines).toHaveBeenCalled()
843+
expect(toolResult).toContain("preserve context space")
844+
})
845+
})
846+
})

0 commit comments

Comments
 (0)