From 15531b82614eb7246248dbecd1853ea11a350413 Mon Sep 17 00:00:00 2001
From: hannesrudolph <hrudolph@gmail.com>
Date: Wed, 25 Jun 2025 00:03:18 -0600
Subject: [PATCH 1/4] fix: resolve Claude Code token counting inefficiency and
 enable caching (#5104)

- Remove 1.5x fudge factor from Claude Code token counting
- Enable prompt caching support for all Claude Code models
- Add comprehensive tests for token counting and caching
- Update existing tests to reflect accurate token counting

This fixes the extreme token inefficiency where simple messages would
jump from ~40k to over 60k tokens, causing API hangs when approaching
the artificial 120k limit. Claude Code now properly utilizes its full
200k context window with accurate token counting.
---
 packages/types/src/providers/claude-code.ts   |  10 +-
 .../__tests__/claude-code-caching.spec.ts     | 305 ++++++++++++++++++
 .../claude-code-token-counting.spec.ts        | 117 +++++++
 .../providers/__tests__/claude-code.spec.ts   |   2 +-
 src/api/providers/claude-code.ts              |  40 +++
 5 files changed, 468 insertions(+), 6 deletions(-)
 create mode 100644 src/api/providers/__tests__/claude-code-caching.spec.ts
 create mode 100644 src/api/providers/__tests__/claude-code-token-counting.spec.ts

diff --git a/packages/types/src/providers/claude-code.ts b/packages/types/src/providers/claude-code.ts
index 707312e915..d0fff0f2ee 100644
--- a/packages/types/src/providers/claude-code.ts
+++ b/packages/types/src/providers/claude-code.ts
@@ -8,7 +8,7 @@ export const claudeCodeModels = {
 	"claude-sonnet-4-20250514": {
 		...anthropicModels["claude-sonnet-4-20250514"],
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true, // Claude Code does report cache tokens
 		supportsReasoningEffort: false,
 		supportsReasoningBudget: false,
 		requiredReasoningBudget: false,
@@ -16,7 +16,7 @@ export const claudeCodeModels = {
 	"claude-opus-4-20250514": {
 		...anthropicModels["claude-opus-4-20250514"],
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true, // Claude Code does report cache tokens
 		supportsReasoningEffort: false,
 		supportsReasoningBudget: false,
 		requiredReasoningBudget: false,
@@ -24,7 +24,7 @@ export const claudeCodeModels = {
 	"claude-3-7-sonnet-20250219": {
 		...anthropicModels["claude-3-7-sonnet-20250219"],
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true, // Claude Code does report cache tokens
 		supportsReasoningEffort: false,
 		supportsReasoningBudget: false,
 		requiredReasoningBudget: false,
@@ -32,7 +32,7 @@ export const claudeCodeModels = {
 	"claude-3-5-sonnet-20241022": {
 		...anthropicModels["claude-3-5-sonnet-20241022"],
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true, // Claude Code does report cache tokens
 		supportsReasoningEffort: false,
 		supportsReasoningBudget: false,
 		requiredReasoningBudget: false,
@@ -40,7 +40,7 @@ export const claudeCodeModels = {
 	"claude-3-5-haiku-20241022": {
 		...anthropicModels["claude-3-5-haiku-20241022"],
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true, // Claude Code does report cache tokens
 		supportsReasoningEffort: false,
 		supportsReasoningBudget: false,
 		requiredReasoningBudget: false,
diff --git a/src/api/providers/__tests__/claude-code-caching.spec.ts b/src/api/providers/__tests__/claude-code-caching.spec.ts
new file mode 100644
index 0000000000..b7f7ff852a
--- /dev/null
+++ b/src/api/providers/__tests__/claude-code-caching.spec.ts
@@ -0,0 +1,305 @@
+import { describe, it, expect, vi, beforeEach } from "vitest"
+import { ClaudeCodeHandler } from "../claude-code"
+import { runClaudeCode } from "../../../integrations/claude-code/run"
+import type { ApiHandlerOptions } from "../../../shared/api"
+import type { ClaudeCodeMessage } from "../../../integrations/claude-code/types"
+import type { ApiStreamUsageChunk } from "../../transform/stream"
+import type { Anthropic } from "@anthropic-ai/sdk"
+
+// Mock the runClaudeCode function
+vi.mock("../../../integrations/claude-code/run", () => ({
+	runClaudeCode: vi.fn(),
+}))
+
+describe("ClaudeCodeHandler - Caching Support", () => {
+	let handler: ClaudeCodeHandler
+	const mockOptions: ApiHandlerOptions = {
+		apiKey: "test-key",
+		apiModelId: "claude-3-5-sonnet-20241022",
+		claudeCodePath: "/test/path",
+	}
+
+	beforeEach(() => {
+		handler = new ClaudeCodeHandler(mockOptions)
+		vi.clearAllMocks()
+	})
+
+	it("should collect cache read tokens from API response", async () => {
+		const mockStream = async function* (): AsyncGenerator<string | ClaudeCodeMessage> {
+			// Initial system message
+			yield {
+				type: "system",
+				subtype: "init",
+				session_id: "test-session",
+				tools: [],
+				mcp_servers: [],
+				apiKeySource: "user",
+			} as ClaudeCodeMessage
+
+			// Assistant message with cache tokens
+			const message: Anthropic.Messages.Message = {
+				id: "msg_123",
+				type: "message",
+				role: "assistant",
+				model: "claude-3-5-sonnet-20241022",
+				content: [{ type: "text", text: "Hello!", citations: [] }],
+				usage: {
+					input_tokens: 100,
+					output_tokens: 50,
+					cache_read_input_tokens: 80, // 80 tokens read from cache
+					cache_creation_input_tokens: 20, // 20 new tokens cached
+				},
+				stop_reason: "end_turn",
+				stop_sequence: null,
+			}
+
+			yield {
+				type: "assistant",
+				message,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+
+			// Result with cost
+			yield {
+				type: "result",
+				subtype: "success",
+				result: "success",
+				total_cost_usd: 0.001,
+				is_error: false,
+				duration_ms: 1000,
+				duration_api_ms: 900,
+				num_turns: 1,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+		}
+
+		vi.mocked(runClaudeCode).mockReturnValue(mockStream())
+
+		const stream = handler.createMessage("System prompt", [{ role: "user", content: "Hello" }])
+
+		const chunks = []
+		for await (const chunk of stream) {
+			chunks.push(chunk)
+		}
+
+		// Find the usage chunk
+		const usageChunk = chunks.find((c) => c.type === "usage" && "totalCost" in c) as ApiStreamUsageChunk | undefined
+		expect(usageChunk).toBeDefined()
+		expect(usageChunk!.inputTokens).toBe(100)
+		expect(usageChunk!.outputTokens).toBe(50)
+		expect(usageChunk!.cacheReadTokens).toBe(80)
+		expect(usageChunk!.cacheWriteTokens).toBe(20)
+	})
+
+	it("should accumulate cache tokens across multiple messages", async () => {
+		const mockStream = async function* (): AsyncGenerator<string | ClaudeCodeMessage> {
+			yield {
+				type: "system",
+				subtype: "init",
+				session_id: "test-session",
+				tools: [],
+				mcp_servers: [],
+				apiKeySource: "user",
+			} as ClaudeCodeMessage
+
+			// First message chunk
+			const message1: Anthropic.Messages.Message = {
+				id: "msg_1",
+				type: "message",
+				role: "assistant",
+				model: "claude-3-5-sonnet-20241022",
+				content: [{ type: "text", text: "Part 1", citations: [] }],
+				usage: {
+					input_tokens: 50,
+					output_tokens: 25,
+					cache_read_input_tokens: 40,
+					cache_creation_input_tokens: 10,
+				},
+				stop_reason: null,
+				stop_sequence: null,
+			}
+
+			yield {
+				type: "assistant",
+				message: message1,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+
+			// Second message chunk
+			const message2: Anthropic.Messages.Message = {
+				id: "msg_2",
+				type: "message",
+				role: "assistant",
+				model: "claude-3-5-sonnet-20241022",
+				content: [{ type: "text", text: "Part 2", citations: [] }],
+				usage: {
+					input_tokens: 50,
+					output_tokens: 25,
+					cache_read_input_tokens: 30,
+					cache_creation_input_tokens: 20,
+				},
+				stop_reason: "end_turn",
+				stop_sequence: null,
+			}
+
+			yield {
+				type: "assistant",
+				message: message2,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+
+			yield {
+				type: "result",
+				subtype: "success",
+				result: "success",
+				total_cost_usd: 0.002,
+				is_error: false,
+				duration_ms: 2000,
+				duration_api_ms: 1800,
+				num_turns: 1,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+		}
+
+		vi.mocked(runClaudeCode).mockReturnValue(mockStream())
+
+		const stream = handler.createMessage("System prompt", [{ role: "user", content: "Hello" }])
+
+		const chunks = []
+		for await (const chunk of stream) {
+			chunks.push(chunk)
+		}
+
+		const usageChunk = chunks.find((c) => c.type === "usage" && "totalCost" in c) as ApiStreamUsageChunk | undefined
+		expect(usageChunk).toBeDefined()
+		expect(usageChunk!.inputTokens).toBe(100) // 50 + 50
+		expect(usageChunk!.outputTokens).toBe(50) // 25 + 25
+		expect(usageChunk!.cacheReadTokens).toBe(70) // 40 + 30
+		expect(usageChunk!.cacheWriteTokens).toBe(30) // 10 + 20
+	})
+
+	it("should handle missing cache token fields gracefully", async () => {
+		const mockStream = async function* (): AsyncGenerator<string | ClaudeCodeMessage> {
+			yield {
+				type: "system",
+				subtype: "init",
+				session_id: "test-session",
+				tools: [],
+				mcp_servers: [],
+				apiKeySource: "user",
+			} as ClaudeCodeMessage
+
+			// Message without cache tokens
+			const message: Anthropic.Messages.Message = {
+				id: "msg_123",
+				type: "message",
+				role: "assistant",
+				model: "claude-3-5-sonnet-20241022",
+				content: [{ type: "text", text: "Hello!", citations: [] }],
+				usage: {
+					input_tokens: 100,
+					output_tokens: 50,
+					cache_read_input_tokens: null,
+					cache_creation_input_tokens: null,
+				},
+				stop_reason: "end_turn",
+				stop_sequence: null,
+			}
+
+			yield {
+				type: "assistant",
+				message,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+
+			yield {
+				type: "result",
+				subtype: "success",
+				result: "success",
+				total_cost_usd: 0.001,
+				is_error: false,
+				duration_ms: 1000,
+				duration_api_ms: 900,
+				num_turns: 1,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+		}
+
+		vi.mocked(runClaudeCode).mockReturnValue(mockStream())
+
+		const stream = handler.createMessage("System prompt", [{ role: "user", content: "Hello" }])
+
+		const chunks = []
+		for await (const chunk of stream) {
+			chunks.push(chunk)
+		}
+
+		const usageChunk = chunks.find((c) => c.type === "usage" && "totalCost" in c) as ApiStreamUsageChunk | undefined
+		expect(usageChunk).toBeDefined()
+		expect(usageChunk!.inputTokens).toBe(100)
+		expect(usageChunk!.outputTokens).toBe(50)
+		expect(usageChunk!.cacheReadTokens).toBe(0)
+		expect(usageChunk!.cacheWriteTokens).toBe(0)
+	})
+
+	it("should report zero cost for subscription usage", async () => {
+		const mockStream = async function* (): AsyncGenerator<string | ClaudeCodeMessage> {
+			// Subscription usage has apiKeySource: "none"
+			yield {
+				type: "system",
+				subtype: "init",
+				session_id: "test-session",
+				tools: [],
+				mcp_servers: [],
+				apiKeySource: "none",
+			} as ClaudeCodeMessage
+
+			const message: Anthropic.Messages.Message = {
+				id: "msg_123",
+				type: "message",
+				role: "assistant",
+				model: "claude-3-5-sonnet-20241022",
+				content: [{ type: "text", text: "Hello!", citations: [] }],
+				usage: {
+					input_tokens: 100,
+					output_tokens: 50,
+					cache_read_input_tokens: 80,
+					cache_creation_input_tokens: 20,
+				},
+				stop_reason: "end_turn",
+				stop_sequence: null,
+			}
+
+			yield {
+				type: "assistant",
+				message,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+
+			yield {
+				type: "result",
+				subtype: "success",
+				result: "success",
+				total_cost_usd: 0.001, // This should be ignored for subscription usage
+				is_error: false,
+				duration_ms: 1000,
+				duration_api_ms: 900,
+				num_turns: 1,
+				session_id: "test-session",
+			} as ClaudeCodeMessage
+		}
+
+		vi.mocked(runClaudeCode).mockReturnValue(mockStream())
+
+		const stream = handler.createMessage("System prompt", [{ role: "user", content: "Hello" }])
+
+		const chunks = []
+		for await (const chunk of stream) {
+			chunks.push(chunk)
+		}
+
+		const usageChunk = chunks.find((c) => c.type === "usage" && "totalCost" in c) as ApiStreamUsageChunk | undefined
+		expect(usageChunk).toBeDefined()
+		expect(usageChunk!.totalCost).toBe(0) // Should be 0 for subscription usage
+	})
+})
diff --git a/src/api/providers/__tests__/claude-code-token-counting.spec.ts b/src/api/providers/__tests__/claude-code-token-counting.spec.ts
new file mode 100644
index 0000000000..35d2ff9fbe
--- /dev/null
+++ b/src/api/providers/__tests__/claude-code-token-counting.spec.ts
@@ -0,0 +1,117 @@
+import { describe, it, expect, vi, beforeEach } from "vitest"
+import { ClaudeCodeHandler } from "../claude-code"
+import { ApiHandlerOptions } from "../../../shared/api"
+import type { Anthropic } from "@anthropic-ai/sdk"
+
+describe("ClaudeCodeHandler Token Counting", () => {
+	let handler: ClaudeCodeHandler
+	let mockOptions: ApiHandlerOptions
+
+	beforeEach(() => {
+		mockOptions = {
+			apiModelId: "claude-3-5-sonnet-20241022",
+			claudeCodePath: "/usr/local/bin/claude",
+		} as ApiHandlerOptions
+
+		handler = new ClaudeCodeHandler(mockOptions)
+	})
+
+	describe("countTokens", () => {
+		it("should count tokens accurately without any fudge factor", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "text",
+					text: "Hello, this is a test message to verify token counting accuracy.",
+				},
+			]
+
+			const tokenCount = await handler.countTokens(content)
+
+			// The text has approximately 13-15 tokens
+			// With no fudge factor, we expect the exact token count
+			// With the old 1.5x fudge factor, it would have been around 20-23 tokens
+			expect(tokenCount).toBeLessThan(16)
+			expect(tokenCount).toBeGreaterThan(12)
+		})
+
+		it("should handle empty content", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = []
+			const tokenCount = await handler.countTokens(content)
+			expect(tokenCount).toBe(0)
+		})
+
+		it("should handle multiple text blocks", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{ type: "text", text: "First block" },
+				{ type: "text", text: "Second block" },
+				{ type: "text", text: "Third block" },
+			]
+
+			const tokenCount = await handler.countTokens(content)
+
+			// Each block is approximately 2-3 tokens, so 6-9 tokens total
+			// With no fudge factor, expect exact count
+			expect(tokenCount).toBeLessThan(10) // Would be ~15 with old 1.5x factor
+			expect(tokenCount).toBeGreaterThan(5)
+		})
+
+		it("should handle image blocks with conservative estimate", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "image",
+					source: {
+						type: "base64",
+						media_type: "image/jpeg",
+						data: "base64data",
+					},
+				},
+			]
+
+			const tokenCount = await handler.countTokens(content)
+
+			// Images get a conservative 300 tokens estimate (no fudge factor)
+			expect(tokenCount).toBe(300)
+		})
+
+		it("should provide accurate token counts for typical messages", async () => {
+			// Simulate a typical user message with environment details
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{
+					type: "text",
+					text: `Hi
+
+<environment_details>
+# VSCode Visible Files
+src/app.ts
+src/utils.ts
+
+# VSCode Open Tabs
+src/app.ts
+
+# Current Time
+2024-01-01 12:00:00 PM
+
+# Current Context Size (Tokens)
+1000 (5%)
+
+# Current Cost
+$0.05
+
+# Current Mode
+<slug>code</slug>
+<name>Code</name>
+<model>claude-3-5-sonnet-20241022</model>
+</environment_details>`,
+				},
+			]
+
+			const tokenCount = await handler.countTokens(content)
+
+			// This content is approximately 100-120 tokens
+			// With no fudge factor, expect exact count
+			// With old 1.5x factor, it would have been 150-180 tokens
+			expect(tokenCount).toBeLessThan(125)
+			expect(tokenCount).toBeGreaterThan(95)
+		})
+	})
+})
diff --git a/src/api/providers/__tests__/claude-code.spec.ts b/src/api/providers/__tests__/claude-code.spec.ts
index 5bc4c6f1ea..d0dfa68eb8 100644
--- a/src/api/providers/__tests__/claude-code.spec.ts
+++ b/src/api/providers/__tests__/claude-code.spec.ts
@@ -34,7 +34,7 @@ describe("ClaudeCodeHandler", () => {
 		const model = handler.getModel()
 		expect(model.id).toBe("claude-3-5-sonnet-20241022")
 		expect(model.info.supportsImages).toBe(false)
-		expect(model.info.supportsPromptCache).toBe(false)
+		expect(model.info.supportsPromptCache).toBe(true) // Claude Code now supports prompt caching
 	})
 
 	test("should use default model when invalid model provided", () => {
diff --git a/src/api/providers/claude-code.ts b/src/api/providers/claude-code.ts
index bc72e658fe..b99bd91e1b 100644
--- a/src/api/providers/claude-code.ts
+++ b/src/api/providers/claude-code.ts
@@ -7,9 +7,12 @@ import { filterMessagesForClaudeCode } from "../../integrations/claude-code/mess
 import { BaseProvider } from "./base-provider"
 import { t } from "../../i18n"
 import { ApiHandlerOptions } from "../../shared/api"
+import { Tiktoken } from "tiktoken/lite"
+import o200kBase from "tiktoken/encoders/o200k_base"
 
 export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 	private options: ApiHandlerOptions
+	private encoder: Tiktoken | null = null
 
 	constructor(options: ApiHandlerOptions) {
 		super()
@@ -145,4 +148,41 @@ export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 			return null
 		}
 	}
+
+	/**
+	 * Override the base provider's token counting to provide accurate counting for Claude Code.
+	 * Claude Code uses the same tokenizer as Anthropic, so we don't need any fudge factor.
+	 * The actual token counts are reported accurately in the API response.
+	 */
+	override async countTokens(content: Anthropic.Messages.ContentBlockParam[]): Promise<number> {
+		if (content.length === 0) {
+			return 0
+		}
+
+		let totalTokens = 0
+
+		// Lazily create and cache the encoder if it doesn't exist
+		if (!this.encoder) {
+			this.encoder = new Tiktoken(o200kBase.bpe_ranks, o200kBase.special_tokens, o200kBase.pat_str)
+		}
+
+		// Process each content block
+		for (const block of content) {
+			if (block.type === "text") {
+				const text = block.text || ""
+				if (text.length > 0) {
+					const tokens = this.encoder.encode(text)
+					totalTokens += tokens.length
+				}
+			} else if (block.type === "image") {
+				// Claude Code doesn't support images, but we handle them just in case
+				// Use a conservative estimate
+				totalTokens += 300
+			}
+		}
+
+		// No fudge factor needed - Claude Code uses the same tokenizer as Anthropic
+		// and reports accurate token counts in the API response
+		return totalTokens
+	}
 }

From 24eca3fe89a73c5abd8c59cf394befe6a0efd44f Mon Sep 17 00:00:00 2001
From: hannesrudolph <hrudolph@gmail.com>
Date: Wed, 25 Jun 2025 00:21:33 -0600
Subject: [PATCH 2/4] fix: address PR review comments

- Extract IMAGE_TOKEN_ESTIMATE as a named constant for clarity
- Update token counting tests to use exact counts instead of ranges for deterministic testing
- Fix test expectations to match actual tokenizer output
---
 .../claude-code-token-counting.spec.ts        | 73 ++++++++++---------
 src/api/providers/claude-code.ts              |  7 +-
 2 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/src/api/providers/__tests__/claude-code-token-counting.spec.ts b/src/api/providers/__tests__/claude-code-token-counting.spec.ts
index 35d2ff9fbe..cefd3ec1c7 100644
--- a/src/api/providers/__tests__/claude-code-token-counting.spec.ts
+++ b/src/api/providers/__tests__/claude-code-token-counting.spec.ts
@@ -27,11 +27,9 @@ describe("ClaudeCodeHandler Token Counting", () => {
 
 			const tokenCount = await handler.countTokens(content)
 
-			// The text has approximately 13-15 tokens
-			// With no fudge factor, we expect the exact token count
-			// With the old 1.5x fudge factor, it would have been around 20-23 tokens
-			expect(tokenCount).toBeLessThan(16)
-			expect(tokenCount).toBeGreaterThan(12)
+			// The exact token count for this text using o200k_base tokenizer is 13
+			// With the old 1.5x fudge factor, it would have been 20 tokens
+			expect(tokenCount).toBe(13)
 		})
 
 		it("should handle empty content", async () => {
@@ -49,10 +47,9 @@ describe("ClaudeCodeHandler Token Counting", () => {
 
 			const tokenCount = await handler.countTokens(content)
 
-			// Each block is approximately 2-3 tokens, so 6-9 tokens total
-			// With no fudge factor, expect exact count
-			expect(tokenCount).toBeLessThan(10) // Would be ~15 with old 1.5x factor
-			expect(tokenCount).toBeGreaterThan(5)
+			// "First block" = 2 tokens, "Second block" = 2 tokens, "Third block" = 2 tokens
+			// Total: 6 tokens (would have been 9 with old 1.5x factor)
+			expect(tokenCount).toBe(6)
 		})
 
 		it("should handle image blocks with conservative estimate", async () => {
@@ -74,44 +71,52 @@ describe("ClaudeCodeHandler Token Counting", () => {
 		})
 
 		it("should provide accurate token counts for typical messages", async () => {
-			// Simulate a typical user message with environment details
+			// Use a simpler, predictable message for exact token counting
 			const content: Anthropic.Messages.ContentBlockParam[] = [
 				{
 					type: "text",
-					text: `Hi
+					text: "This is a simple test message with exactly predictable token count.",
+				},
+			]
 
-<environment_details>
-# VSCode Visible Files
-src/app.ts
-src/utils.ts
+			const tokenCount = await handler.countTokens(content)
 
-# VSCode Open Tabs
-src/app.ts
+			// This specific text has exactly 12 tokens with o200k_base tokenizer
+			// With old 1.5x factor, it would have been 18 tokens
+			expect(tokenCount).toBe(12)
+		})
 
-# Current Time
-2024-01-01 12:00:00 PM
+		it("should handle mixed content types", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{ type: "text", text: "Hello world" }, // 2 tokens
+				{
+					type: "image",
+					source: {
+						type: "base64",
+						media_type: "image/jpeg",
+						data: "base64data",
+					},
+				}, // 300 tokens (IMAGE_TOKEN_ESTIMATE)
+				{ type: "text", text: "Goodbye" }, // 1 token
+			]
 
-# Current Context Size (Tokens)
-1000 (5%)
+			const tokenCount = await handler.countTokens(content)
 
-# Current Cost
-$0.05
+			// Total: 2 + 300 + 2 = 304 tokens ("Goodbye" is actually 2 tokens)
+			expect(tokenCount).toBe(304)
+		})
 
-# Current Mode
-<slug>code</slug>
-<name>Code</name>
-<model>claude-3-5-sonnet-20241022</model>
-</environment_details>`,
-				},
+		it("should handle empty text blocks", async () => {
+			const content: Anthropic.Messages.ContentBlockParam[] = [
+				{ type: "text", text: "" },
+				{ type: "text", text: "Hello" }, // 1 token
+				{ type: "text", text: "" },
 			]
 
 			const tokenCount = await handler.countTokens(content)
 
-			// This content is approximately 100-120 tokens
-			// With no fudge factor, expect exact count
-			// With old 1.5x factor, it would have been 150-180 tokens
-			expect(tokenCount).toBeLessThan(125)
-			expect(tokenCount).toBeGreaterThan(95)
+			// Only "Hello" contributes tokens
+			expect(tokenCount).toBe(1)
 		})
 	})
 })
diff --git a/src/api/providers/claude-code.ts b/src/api/providers/claude-code.ts
index b99bd91e1b..d75df8a07a 100644
--- a/src/api/providers/claude-code.ts
+++ b/src/api/providers/claude-code.ts
@@ -10,6 +10,10 @@ import { ApiHandlerOptions } from "../../shared/api"
 import { Tiktoken } from "tiktoken/lite"
 import o200kBase from "tiktoken/encoders/o200k_base"
 
+// Conservative token estimate for images (even though Claude Code doesn't support them)
+// This matches the estimate used in src/utils/tiktoken.ts for consistency
+const IMAGE_TOKEN_ESTIMATE = 300
+
 export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 	private options: ApiHandlerOptions
 	private encoder: Tiktoken | null = null
@@ -176,8 +180,7 @@ export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 				}
 			} else if (block.type === "image") {
 				// Claude Code doesn't support images, but we handle them just in case
-				// Use a conservative estimate
-				totalTokens += 300
+				totalTokens += IMAGE_TOKEN_ESTIMATE
 			}
 		}
 

From 2f659bc5c3a5348356eb127c7df8108223d7350c Mon Sep 17 00:00:00 2001
From: Daniel Riccio <ricciodaniel98@gmail.com>
Date: Wed, 25 Jun 2025 17:02:26 -0500
Subject: [PATCH 3/4] Remove token counting changes, keep only cache support

- Removed custom countTokens override from claude-code.ts
- Deleted claude-code-token-counting.spec.ts test file
- Kept cache token collection and reporting functionality
- Kept supportsPromptCache: true for all Claude Code models
- Kept claude-code-caching.spec.ts tests

This focuses the PR on enabling cache support without modifying token counting behavior.
---
 .../claude-code-token-counting.spec.ts        | 122 ------------------
 src/api/providers/claude-code.ts              |  43 ------
 2 files changed, 165 deletions(-)
 delete mode 100644 src/api/providers/__tests__/claude-code-token-counting.spec.ts

diff --git a/src/api/providers/__tests__/claude-code-token-counting.spec.ts b/src/api/providers/__tests__/claude-code-token-counting.spec.ts
deleted file mode 100644
index cefd3ec1c7..0000000000
--- a/src/api/providers/__tests__/claude-code-token-counting.spec.ts
+++ /dev/null
@@ -1,122 +0,0 @@
-import { describe, it, expect, vi, beforeEach } from "vitest"
-import { ClaudeCodeHandler } from "../claude-code"
-import { ApiHandlerOptions } from "../../../shared/api"
-import type { Anthropic } from "@anthropic-ai/sdk"
-
-describe("ClaudeCodeHandler Token Counting", () => {
-	let handler: ClaudeCodeHandler
-	let mockOptions: ApiHandlerOptions
-
-	beforeEach(() => {
-		mockOptions = {
-			apiModelId: "claude-3-5-sonnet-20241022",
-			claudeCodePath: "/usr/local/bin/claude",
-		} as ApiHandlerOptions
-
-		handler = new ClaudeCodeHandler(mockOptions)
-	})
-
-	describe("countTokens", () => {
-		it("should count tokens accurately without any fudge factor", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{
-					type: "text",
-					text: "Hello, this is a test message to verify token counting accuracy.",
-				},
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// The exact token count for this text using o200k_base tokenizer is 13
-			// With the old 1.5x fudge factor, it would have been 20 tokens
-			expect(tokenCount).toBe(13)
-		})
-
-		it("should handle empty content", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = []
-			const tokenCount = await handler.countTokens(content)
-			expect(tokenCount).toBe(0)
-		})
-
-		it("should handle multiple text blocks", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{ type: "text", text: "First block" },
-				{ type: "text", text: "Second block" },
-				{ type: "text", text: "Third block" },
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// "First block" = 2 tokens, "Second block" = 2 tokens, "Third block" = 2 tokens
-			// Total: 6 tokens (would have been 9 with old 1.5x factor)
-			expect(tokenCount).toBe(6)
-		})
-
-		it("should handle image blocks with conservative estimate", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{
-					type: "image",
-					source: {
-						type: "base64",
-						media_type: "image/jpeg",
-						data: "base64data",
-					},
-				},
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// Images get a conservative 300 tokens estimate (no fudge factor)
-			expect(tokenCount).toBe(300)
-		})
-
-		it("should provide accurate token counts for typical messages", async () => {
-			// Use a simpler, predictable message for exact token counting
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{
-					type: "text",
-					text: "This is a simple test message with exactly predictable token count.",
-				},
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// This specific text has exactly 12 tokens with o200k_base tokenizer
-			// With old 1.5x factor, it would have been 18 tokens
-			expect(tokenCount).toBe(12)
-		})
-
-		it("should handle mixed content types", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{ type: "text", text: "Hello world" }, // 2 tokens
-				{
-					type: "image",
-					source: {
-						type: "base64",
-						media_type: "image/jpeg",
-						data: "base64data",
-					},
-				}, // 300 tokens (IMAGE_TOKEN_ESTIMATE)
-				{ type: "text", text: "Goodbye" }, // 1 token
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// Total: 2 + 300 + 2 = 304 tokens ("Goodbye" is actually 2 tokens)
-			expect(tokenCount).toBe(304)
-		})
-
-		it("should handle empty text blocks", async () => {
-			const content: Anthropic.Messages.ContentBlockParam[] = [
-				{ type: "text", text: "" },
-				{ type: "text", text: "Hello" }, // 1 token
-				{ type: "text", text: "" },
-			]
-
-			const tokenCount = await handler.countTokens(content)
-
-			// Only "Hello" contributes tokens
-			expect(tokenCount).toBe(1)
-		})
-	})
-})
diff --git a/src/api/providers/claude-code.ts b/src/api/providers/claude-code.ts
index d75df8a07a..bc72e658fe 100644
--- a/src/api/providers/claude-code.ts
+++ b/src/api/providers/claude-code.ts
@@ -7,16 +7,9 @@ import { filterMessagesForClaudeCode } from "../../integrations/claude-code/mess
 import { BaseProvider } from "./base-provider"
 import { t } from "../../i18n"
 import { ApiHandlerOptions } from "../../shared/api"
-import { Tiktoken } from "tiktoken/lite"
-import o200kBase from "tiktoken/encoders/o200k_base"
-
-// Conservative token estimate for images (even though Claude Code doesn't support them)
-// This matches the estimate used in src/utils/tiktoken.ts for consistency
-const IMAGE_TOKEN_ESTIMATE = 300
 
 export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 	private options: ApiHandlerOptions
-	private encoder: Tiktoken | null = null
 
 	constructor(options: ApiHandlerOptions) {
 		super()
@@ -152,40 +145,4 @@ export class ClaudeCodeHandler extends BaseProvider implements ApiHandler {
 			return null
 		}
 	}
-
-	/**
-	 * Override the base provider's token counting to provide accurate counting for Claude Code.
-	 * Claude Code uses the same tokenizer as Anthropic, so we don't need any fudge factor.
-	 * The actual token counts are reported accurately in the API response.
-	 */
-	override async countTokens(content: Anthropic.Messages.ContentBlockParam[]): Promise<number> {
-		if (content.length === 0) {
-			return 0
-		}
-
-		let totalTokens = 0
-
-		// Lazily create and cache the encoder if it doesn't exist
-		if (!this.encoder) {
-			this.encoder = new Tiktoken(o200kBase.bpe_ranks, o200kBase.special_tokens, o200kBase.pat_str)
-		}
-
-		// Process each content block
-		for (const block of content) {
-			if (block.type === "text") {
-				const text = block.text || ""
-				if (text.length > 0) {
-					const tokens = this.encoder.encode(text)
-					totalTokens += tokens.length
-				}
-			} else if (block.type === "image") {
-				// Claude Code doesn't support images, but we handle them just in case
-				totalTokens += IMAGE_TOKEN_ESTIMATE
-			}
-		}
-
-		// No fudge factor needed - Claude Code uses the same tokenizer as Anthropic
-		// and reports accurate token counts in the API response
-		return totalTokens
-	}
 }

From 98f3093d3ded77e1e3163e2e9e1b3a5ec9dbf6a5 Mon Sep 17 00:00:00 2001
From: Daniel Riccio <ricciodaniel98@gmail.com>
Date: Wed, 25 Jun 2025 17:53:03 -0500
Subject: [PATCH 4/4] fix: update webview test to expect
 supportsPromptCache=true for Claude Code models

---
 .../src/components/ui/hooks/__tests__/useSelectedModel.spec.ts  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts
index fd5950bf35..5fefabf59e 100644
--- a/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts
+++ b/webview-ui/src/components/ui/hooks/__tests__/useSelectedModel.spec.ts
@@ -402,7 +402,7 @@ describe("useSelectedModel", () => {
 			expect(result.current.id).toBe("claude-sonnet-4-20250514")
 			expect(result.current.info).toBeDefined()
 			expect(result.current.info?.supportsImages).toBe(false)
-			expect(result.current.info?.supportsPromptCache).toBe(false)
+			expect(result.current.info?.supportsPromptCache).toBe(true) // Claude Code now supports prompt cache
 			// Verify it inherits other properties from anthropic models
 			expect(result.current.info?.maxTokens).toBe(64_000)
 			expect(result.current.info?.contextWindow).toBe(200_000)