|
| 1 | +// npx vitest run integrations/misc/__tests__/extract-text-token-based.spec.ts |
| 2 | + |
| 3 | +import { describe, it, expect, vi, beforeEach, Mock } from "vitest" |
| 4 | +import * as fs from "fs/promises" |
| 5 | +import { Anthropic } from "@anthropic-ai/sdk" |
| 6 | +import { extractTextFromFile } from "../extract-text" |
| 7 | +import { countFileLines } from "../line-counter" |
| 8 | +import { readLines } from "../read-lines" |
| 9 | +import { isBinaryFile } from "isbinaryfile" |
| 10 | +import { countTokens } from "../../../utils/countTokens" |
| 11 | + |
// Mock the modules this spec stubs out. These must remain literal, top-level
// vi.mock() calls: vitest hoists them above the import statements so that the
// imported bindings already refer to the mocked modules.
vi.mock("fs/promises")
vi.mock("../line-counter")
vi.mock("../read-lines")
vi.mock("isbinaryfile")
vi.mock("../../../utils/countTokens")
| 18 | + |
/**
 * Token-based truncation behaviour of `extractTextFromFile`.
 *
 * The tests call `extractTextFromFile(path, maxReadFileLine, maxReadFileTokens)`
 * against mocked file contents and a mocked `countTokens`, then inspect the
 * returned text for numbered lines and the "[File truncated" notice.
 */
describe("extractTextFromFile - Token-based Truncation", () => {
	// Typed handles onto the mocked modules
	const mockedFs = vi.mocked(fs)
	const mockedCountFileLines = vi.mocked(countFileLines)
	// Not referenced by the tests below; kept as the typed handle for the mocked read-lines module.
	const mockedReadLines = vi.mocked(readLines)
	const mockedIsBinaryFile = vi.mocked(isBinaryFile)
	const mockedCountTokens = vi.mocked(countTokens)

	/** Concatenates the text of every "text" block, ignoring all other block types. */
	const joinTextBlocks = (content: Anthropic.Messages.ContentBlockParam[]): string =>
		content
			.filter((block): block is Anthropic.Messages.TextBlockParam => block.type === "text")
			.map((block) => block.text)
			.join("")

	/**
	 * Makes the mocked countTokens charge a fixed price per line: the result is
	 * (number of "\n"-separated lines in the text blocks) * tokensPerLine.
	 */
	const mockTokensPerLine = (tokensPerLine: number): void => {
		mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
			return joinTextBlocks(content).split("\n").length * tokensPerLine
		})
	}

	/** Counts the output lines that look like numbered content lines ("<n> | ..."). */
	const countNumberedLines = (output: string): number =>
		output.split("\n").filter((line) => /^\s*\d+\s*\|/.test(line)).length

	beforeEach(() => {
		vi.clearAllMocks()
		// Set default mock behavior: the file is accessible and is not binary
		mockedFs.access.mockResolvedValue(undefined)
		mockedIsBinaryFile.mockResolvedValue(false)

		// Default countTokens: a predictable estimate of 1.5 tokens per whitespace-separated word.
		// Note: "".split(/\s+/) yields [""], so empty text still counts as one word (1 token).
		mockedCountTokens.mockImplementation(async (content: Anthropic.Messages.ContentBlockParam[]) => {
			const words = joinTextBlocks(content).split(/\s+/).length
			return Math.floor(words * 1.5)
		})
	})

	it("should truncate files based on token count when maxReadFileTokens is provided", async () => {
		const fileContent = Array(100)
			.fill(null)
			.map((_, i) => `Line ${i + 1}: This is a test line with some content that has multiple words`)
			.join("\n")

		mockedFs.readFile.mockResolvedValue(fileContent as any)

		// Each line costs 15 tokens, so the 750-token budget corresponds to 50 of the 100 lines
		mockTokensPerLine(15)

		const result = await extractTextFromFile("/test/large-file.ts", -1, 750)

		// Should truncate based on tokens, not lines
		expect(result).toContain("1 | Line 1:")
		expect(result).toContain("[File truncated")
		expect(result).toMatch(/\d+ of ~?\d+ tokens/)
	})

	it("should not truncate when token count is within limit", async () => {
		const fileContent = Array(10)
			.fill(null)
			.map((_, i) => `Line ${i + 1}: Short content`)
			.join("\n")

		mockedFs.readFile.mockResolvedValue(fileContent as any)

		// Mock token counting to stay well under the 10000-token limit passed below
		mockedCountTokens.mockResolvedValue(100)

		const result = await extractTextFromFile("/test/small-file.ts", -1, 10000)

		// Should include all content
		expect(result).toContain(" 1 | Line 1: Short content")
		expect(result).toContain("10 | Line 10: Short content")
		expect(result).not.toContain("[File truncated")
	})

	it("should prioritize token-based truncation over line-based when both limits are set", async () => {
		const fileContent = Array(200)
			.fill(null)
			.map((_, i) => `Line ${i + 1}: This line has many words to increase token count significantly`)
			.join("\n")

		mockedCountFileLines.mockResolvedValue(200)
		mockedFs.readFile.mockResolvedValue(fileContent as any)

		// 20 tokens per line: the 500-token budget is reached at 25 lines (25 * 20 = 500),
		// well before the 100-line limit
		mockTokensPerLine(20)

		// maxReadFileLine=100, maxReadFileTokens=500
		const result = await extractTextFromFile("/test/file.ts", 100, 500)

		// Should truncate based on tokens (500), not lines (100)
		expect(result).toContain("[File truncated")
		expect(result).toMatch(/\d+ of ~?\d+ tokens/)

		// Should have stopped before reaching line limit
		expect(countNumberedLines(result)).toBeLessThan(100)
	})

	it("should handle maxReadFileTokens of 0 by throwing an error", async () => {
		await expect(extractTextFromFile("/test/file.ts", -1, 0)).rejects.toThrow(
			"Invalid maxReadFileTokens: 0. Must be a positive integer or -1 for unlimited.",
		)
	})

	it("should handle negative maxReadFileTokens by throwing an error", async () => {
		await expect(extractTextFromFile("/test/file.ts", -1, -100)).rejects.toThrow(
			"Invalid maxReadFileTokens: -100. Must be a positive integer or -1 for unlimited.",
		)
	})

	it("should work with both line and token limits disabled", async () => {
		const fileContent = "Line 1\nLine 2\nLine 3"
		mockedFs.readFile.mockResolvedValue(fileContent as any)

		const result = await extractTextFromFile("/test/file.ts", -1, undefined)

		// Should include all content
		expect(result).toContain("1 | Line 1")
		expect(result).toContain("2 | Line 2")
		expect(result).toContain("3 | Line 3")
		expect(result).not.toContain("[File truncated")
	})

	it("should handle empty files with token-based truncation", async () => {
		mockedFs.readFile.mockResolvedValue("" as any)
		mockedCountTokens.mockResolvedValue(0)

		const result = await extractTextFromFile("/test/empty.ts", -1, 1000)

		expect(result).toBe("")
	})

	it("should efficiently handle very large token counts", async () => {
		// Simulate a file that would have millions of tokens
		const hugeContent = Array(10000)
			.fill(null)
			.map((_, i) => `Line ${i + 1}: ${Array(100).fill("word").join(" ")}`)
			.join("\n")

		mockedFs.readFile.mockResolvedValue(hugeContent as any)

		// Each line costs 150 tokens
		mockTokensPerLine(150)

		const result = await extractTextFromFile("/test/huge.ts", -1, 5000)

		// Should truncate early based on tokens
		expect(result).toContain("[File truncated")
		expect(result).toMatch(/\d+ of ~?\d+ tokens/)

		// Should have stopped processing early
		expect(countNumberedLines(result)).toBeLessThan(50) // Should stop around 33 lines (5000/150)
	})
})
0 commit comments