Skip to content

Commit 89790bc

Browse files
committed
refactor: use file size instead of line count for large file detection
- Replace line count threshold with file size threshold (100KB) - Files larger than 100KB trigger token count check - Files larger than 1MB automatically apply safeguard if token counting fails - Update tests to reflect new file size-based approach - This better handles files with large amounts of content on single lines As requested by @cte in PR comment
1 parent 9fb7392 commit 89790bc

File tree

2 files changed

+59
-40
lines changed

2 files changed

+59
-40
lines changed

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// npx vitest src/core/tools/__tests__/readFileTool.spec.ts
22

33
import * as path from "path"
4+
import { stat } from "fs/promises"
45

56
import { countFileLines } from "../../../integrations/misc/line-counter"
67
import { readLines } from "../../../integrations/misc/read-lines"
@@ -25,6 +26,7 @@ vi.mock("fs/promises", () => ({
2526
mkdir: vi.fn().mockResolvedValue(undefined),
2627
writeFile: vi.fn().mockResolvedValue(undefined),
2728
readFile: vi.fn().mockResolvedValue("{}"),
29+
stat: vi.fn().mockResolvedValue({ size: 1024 }), // Default 1KB file
2830
}))
2931

3032
vi.mock("isbinaryfile")
@@ -561,6 +563,7 @@ describe("read_file tool with large file safeguard", () => {
561563
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
562564
const mockedPathResolve = vi.mocked(path.resolve)
563565
const mockedTiktoken = vi.mocked(tiktoken)
566+
const mockedStat = vi.mocked(stat)
564567

565568
const mockCline: any = {}
566569
let mockProvider: any
@@ -609,15 +612,18 @@ describe("read_file tool with large file safeguard", () => {
609612
maxReadFileLine?: number
610613
totalLines?: number
611614
tokenCount?: number
615+
fileSize?: number
612616
} = {},
613617
): Promise<ToolResponse | undefined> {
614618
const maxReadFileLine = options.maxReadFileLine ?? -1
615619
const totalLines = options.totalLines ?? 5
616620
const tokenCount = options.tokenCount ?? 100
621+
const fileSize = options.fileSize ?? 1024 // Default 1KB
617622

618623
mockProvider.getState.mockResolvedValue({ maxReadFileLine })
619624
mockedCountFileLines.mockResolvedValue(totalLines)
620625
mockedTiktoken.mockResolvedValue(tokenCount)
626+
mockedStat.mockResolvedValue({ size: fileSize } as any)
621627

622628
const argsContent = `<file><path>${testFilePath}</path></file>`
623629

@@ -642,7 +648,7 @@ describe("read_file tool with large file safeguard", () => {
642648
return toolResult
643649
}
644650

645-
describe("when file has many lines and high token count", () => {
651+
describe("when file has large size and high token count", () => {
646652
it("should apply safeguard and read only first 2000 lines", async () => {
647653
// Setup - large file with high token count
648654
const largeFileContent = Array(15000).fill("This is a line of text").join("\n")
@@ -666,13 +672,14 @@ describe("read_file tool with large file safeguard", () => {
666672
}),
667673
}
668674

669-
// Execute with high line count and token count
675+
// Execute with large file size and high token count
670676
const result = await executeReadFileTool(
671677
{},
672678
{
673679
maxReadFileLine: -1,
674680
totalLines: 15000,
675681
tokenCount: 60000, // Above threshold
682+
fileSize: 200 * 1024, // 200KB - above threshold
676683
},
677684
)
678685

@@ -681,7 +688,7 @@ describe("read_file tool with large file safeguard", () => {
681688
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
682689

683690
// Verify the result contains the safeguard notice
684-
expect(result).toContain("<notice>This file contains 15000 lines and approximately 60,000 tokens")
691+
expect(result).toContain("<notice>This file is 200KB and contains approximately 60,000 tokens")
685692
expect(result).toContain("Showing only the first 2000 lines to preserve context space")
686693
expect(result).toContain(`<content lines="1-2000">`)
687694
})
@@ -705,13 +712,14 @@ describe("read_file tool with large file safeguard", () => {
705712
}),
706713
}
707714

708-
// Execute with high line count but low token count
715+
// Execute with large file size but low token count
709716
const result = await executeReadFileTool(
710717
{},
711718
{
712719
maxReadFileLine: -1,
713720
totalLines: 15000,
714721
tokenCount: 30000, // Below threshold
722+
fileSize: 200 * 1024, // 200KB - above threshold
715723
},
716724
)
717725

@@ -725,9 +733,9 @@ describe("read_file tool with large file safeguard", () => {
725733
expect(result).toContain(`<content lines="1-15000">`)
726734
})
727735

728-
it("should not apply safeguard for files under 10000 lines", async () => {
729-
// Setup - file with less than 10000 lines
730-
const fileContent = Array(9999).fill("This is a line of text").join("\n")
736+
it("should not apply safeguard for small files", async () => {
737+
// Setup - small file
738+
const fileContent = Array(999).fill("This is a line of text").join("\n")
731739
const numberedContent = fileContent
732740
.split("\n")
733741
.map((line, i) => `${i + 1} | ${line}`)
@@ -744,13 +752,14 @@ describe("read_file tool with large file safeguard", () => {
744752
}),
745753
}
746754

747-
// Execute
755+
// Execute with small file size
748756
const result = await executeReadFileTool(
749757
{},
750758
{
751759
maxReadFileLine: -1,
752-
totalLines: 9999,
760+
totalLines: 999,
753761
tokenCount: 100000, // Even with high token count
762+
fileSize: 50 * 1024, // 50KB - below threshold
754763
},
755764
)
756765

@@ -761,7 +770,7 @@ describe("read_file tool with large file safeguard", () => {
761770

762771
// Verify no safeguard notice
763772
expect(result).not.toContain("preserve context space")
764-
expect(result).toContain(`<content lines="1-9999">`)
773+
expect(result).toContain(`<content lines="1-999">`)
765774
})
766775

767776
it("should apply safeguard for very large files even if token counting fails", async () => {
@@ -788,7 +797,8 @@ describe("read_file tool with large file safeguard", () => {
788797

789798
// Set up the provider state
790799
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
791-
mockedCountFileLines.mockResolvedValue(60000)
800+
mockedCountFileLines.mockResolvedValue(6000)
801+
mockedStat.mockResolvedValue({ size: 2 * 1024 * 1024 } as any) // 2MB file
792802

793803
// IMPORTANT: Set up tiktoken to reject AFTER other mocks are set
794804
mockedTiktoken.mockRejectedValue(new Error("Token counting failed"))
@@ -818,7 +828,7 @@ describe("read_file tool with large file safeguard", () => {
818828
expect(mockedReadLines).toHaveBeenCalledWith(absoluteFilePath, 1999, 0)
819829

820830
// Verify the result contains the safeguard notice (without token count)
821-
expect(toolResult).toContain("<notice>This file contains 60000 lines")
831+
expect(toolResult).toContain("<notice>This file is 2048KB")
822832
expect(toolResult).toContain("Showing only the first 2000 lines to preserve context space")
823833
expect(toolResult).toContain(`<content lines="1-2000">`)
824834
})
@@ -844,6 +854,7 @@ describe("read_file tool with large file safeguard", () => {
844854
maxReadFileLine: 500,
845855
totalLines: 20000,
846856
tokenCount: 100000,
857+
fileSize: 2 * 1024 * 1024, // 2MB
847858
},
848859
)
849860

@@ -878,7 +889,8 @@ describe("read_file tool with large file safeguard", () => {
878889
}
879890

880891
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
881-
mockedCountFileLines.mockResolvedValue(100000)
892+
mockedCountFileLines.mockResolvedValue(10000)
893+
mockedStat.mockResolvedValue({ size: 10 * 1024 * 1024 } as any) // 10MB file
882894

883895
await readFileTool(
884896
mockCline,
@@ -899,7 +911,7 @@ describe("read_file tool with large file safeguard", () => {
899911
})
900912

901913
describe("safeguard thresholds", () => {
902-
it("should use correct thresholds for line count and token count", async () => {
914+
it("should use correct thresholds for file size and token count", async () => {
903915
// Mock the api.getModel() to return a model with context window
904916
mockCline.api = {
905917
getModel: vi.fn().mockReturnValue({
@@ -911,11 +923,11 @@ describe("read_file tool with large file safeguard", () => {
911923

912924
// Test boundary conditions
913925

914-
// Just below line threshold - no token check
915-
await executeReadFileTool({}, { totalLines: 10000, maxReadFileLine: -1 })
926+
// Just below size threshold - no token check
927+
await executeReadFileTool({}, { fileSize: 100 * 1024 - 1, maxReadFileLine: -1 }) // Just under 100KB
916928
expect(mockedTiktoken).not.toHaveBeenCalled()
917929

918-
// Just above line threshold - token check performed
930+
// Just above size threshold - token check performed
919931
vi.clearAllMocks()
920932
// Re-mock the api.getModel() after clearAllMocks
921933
mockCline.api = {
@@ -926,7 +938,7 @@ describe("read_file tool with large file safeguard", () => {
926938
}),
927939
}
928940
mockedExtractTextFromFile.mockResolvedValue("content")
929-
await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 40000 })
941+
await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 40000 }) // Just over 100KB
930942
expect(mockedTiktoken).toHaveBeenCalled()
931943

932944
// Token count just below threshold - no safeguard
@@ -944,7 +956,7 @@ describe("read_file tool with large file safeguard", () => {
944956
}
945957
mockedExtractTextFromFile.mockResolvedValue("content")
946958
mockedReadLines.mockResolvedValue("partial content")
947-
await executeReadFileTool({}, { totalLines: 10001, maxReadFileLine: -1, tokenCount: 50001 })
959+
await executeReadFileTool({}, { fileSize: 100 * 1024 + 1, maxReadFileLine: -1, tokenCount: 50001 })
948960
expect(mockedReadLines).toHaveBeenCalled()
949961
expect(toolResult).toContain("preserve context space")
950962
})

src/core/tools/readFileTool.ts

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import path from "path"
22
import { isBinaryFile } from "isbinaryfile"
3+
import { stat } from "fs/promises"
34

45
import { Task } from "../task/Task"
56
import { ClineSayTool } from "../../shared/ExtensionMessage"
@@ -519,7 +520,8 @@ export async function readFileTool(
519520

520521
// Handle normal file read with safeguard for large files
521522
// Define thresholds for the safeguard
522-
const LARGE_FILE_LINE_THRESHOLD = 10000 // Consider files with more than 10000 lines as "large"
523+
const LARGE_FILE_SIZE_THRESHOLD = 100 * 1024 // 100KB - files larger than this will be checked for token count
524+
const VERY_LARGE_FILE_SIZE = 1024 * 1024 // 1MB - apply safeguard automatically
523525
const FALLBACK_MAX_LINES = 2000 // Default number of lines to read when applying safeguard
524526

525527
// Get the actual context window size from the model
@@ -531,26 +533,31 @@ export async function readFileTool(
531533
let safeguardNotice = ""
532534
let linesToRead = totalLines
533535

534-
if (maxReadFileLine === -1 && totalLines > LARGE_FILE_LINE_THRESHOLD) {
535-
// File has many lines and we're trying to read the full file
536-
// Perform token count check
537-
try {
538-
const fullContent = await extractTextFromFile(fullPath)
539-
const tokenCount = await tiktoken([{ type: "text", text: fullContent }])
540-
541-
if (tokenCount > MAX_TOKEN_THRESHOLD) {
542-
shouldApplySafeguard = true
543-
linesToRead = FALLBACK_MAX_LINES
544-
safeguardNotice = `<notice>This file contains ${totalLines} lines and approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
545-
}
546-
} catch (error) {
547-
// If token counting fails, apply safeguard based on line count alone
548-
console.warn(`Failed to count tokens for large file ${relPath}:`, error)
549-
if (totalLines > LARGE_FILE_LINE_THRESHOLD * 5) {
550-
// For very large files (>50000 lines), apply safeguard anyway
551-
shouldApplySafeguard = true
552-
linesToRead = FALLBACK_MAX_LINES
553-
safeguardNotice = `<notice>This file contains ${totalLines} lines, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
536+
if (maxReadFileLine === -1) {
537+
// Get file size
538+
const fileStats = await stat(fullPath)
539+
const fileSizeKB = Math.round(fileStats.size / 1024)
540+
541+
if (fileStats.size > LARGE_FILE_SIZE_THRESHOLD) {
542+
// File is large enough to warrant token count check
543+
try {
544+
const fullContent = await extractTextFromFile(fullPath)
545+
const tokenCount = await tiktoken([{ type: "text", text: fullContent }])
546+
547+
if (tokenCount > MAX_TOKEN_THRESHOLD) {
548+
shouldApplySafeguard = true
549+
linesToRead = FALLBACK_MAX_LINES
550+
safeguardNotice = `<notice>This file is ${fileSizeKB}KB and contains approximately ${tokenCount.toLocaleString()} tokens, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
551+
}
552+
} catch (error) {
553+
// If token counting fails, apply safeguard based on file size alone
554+
console.warn(`Failed to count tokens for large file ${relPath}:`, error)
555+
if (fileStats.size > VERY_LARGE_FILE_SIZE) {
556+
// For very large files (>1MB), apply safeguard anyway
557+
shouldApplySafeguard = true
558+
linesToRead = FALLBACK_MAX_LINES
559+
safeguardNotice = `<notice>This file is ${fileSizeKB}KB, which could consume a significant portion of the context window. Showing only the first ${FALLBACK_MAX_LINES} lines to preserve context space. Use line_range if you need to read specific sections.</notice>\n`
560+
}
554561
}
555562
}
556563
}

0 commit comments

Comments
 (0)