
Commit d6184e9

OpenAI & DeepSeek cost calculation (RooCodeInc#1864)
* Add OpenAI-compatible cost calculation
* Requesty: Prepare for correct price calculation
* Native OpenAI: Update model caching info. According to [OpenAI's website](https://platform.openai.com/docs/guides/prompt-caching), gpt-4o, gpt-4o-mini, o1-preview and o1-mini support caching. For gpt-4o, even though gpt-4o-2024-05-13 and chatgpt-4o-latest do not support caching, users will simply see zero cached tokens, which helps avoid confusion.
* Native OpenAI: Call getModel once
* Native OpenAI: Extract yield usage into a method
* Native OpenAI: Add caching and cost info to task header
* DeepSeek: Add cost info to task header
* Add changeset
1 parent d01b994 commit d6184e9

File tree

9 files changed: +210 -67 lines changed
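The hunks below call two helpers from src/utils/cost.ts, calculateApiCostAnthropic and calculateApiCostOpenAI, but that file is not among the diffs shown on this page. Judging from the call sites, the two differ only in how the input-token count is interpreted: Anthropic-style usage reports input tokens separately from cache reads and writes, while OpenAI-style usage folds cache reads and writes into prompt_tokens. A minimal sketch of that split, assuming prices are USD per million tokens as in src/shared/api.ts (the helper names come from the diff; the bodies and the ModelPricing type are illustrative assumptions, not the project's actual implementation):

// Sketch only: src/utils/cost.ts is not part of the hunks shown here.
interface ModelPricing {
	inputPrice?: number
	outputPrice?: number
	cacheWritesPrice?: number
	cacheReadsPrice?: number
}

function costFromParts(
	info: ModelPricing,
	nonCachedInputTokens: number,
	outputTokens: number,
	cacheWriteTokens: number,
	cacheReadTokens: number,
): number {
	return (
		((info.inputPrice || 0) / 1_000_000) * nonCachedInputTokens +
		((info.outputPrice || 0) / 1_000_000) * outputTokens +
		((info.cacheWritesPrice || 0) / 1_000_000) * cacheWriteTokens +
		((info.cacheReadsPrice || 0) / 1_000_000) * cacheReadTokens
	)
}

// Anthropic-style usage: inputTokens already excludes cache reads/writes.
export function calculateApiCostAnthropic(
	info: ModelPricing,
	inputTokens: number,
	outputTokens: number,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
): number {
	return costFromParts(info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens)
}

// OpenAI-style usage: prompt_tokens includes the cached portion, so subtract
// cache reads/writes before applying the regular input price.
export function calculateApiCostOpenAI(
	info: ModelPricing,
	inputTokens: number,
	outputTokens: number,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
): number {
	const nonCachedInput = Math.max(0, inputTokens - cacheWriteTokens - cacheReadTokens)
	return costFromParts(info, nonCachedInput, outputTokens, cacheWriteTokens, cacheReadTokens)
}

Under this assumption the existing Anthropic code path is unchanged by the rename, and OpenAI-compatible providers can pass their raw prompt_tokens without double-counting cached input.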

.changeset/bright-horses-double.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"claude-dev": patch
+---
+
+Add correct cost and tokens info to Native OpenAI and DeepSeek providers

src/api/providers/deepseek.ts

Lines changed: 33 additions & 9 deletions
@@ -3,6 +3,7 @@ import OpenAI from "openai"
 import { withRetry } from "../retry"
 import { ApiHandler } from "../"
 import { ApiHandlerOptions, DeepSeekModelId, ModelInfo, deepSeekDefaultModelId, deepSeekModels } from "../../shared/api"
+import { calculateApiCostOpenAI } from "../../utils/cost"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStream } from "../transform/stream"
 import { convertToR1Format } from "../transform/r1-format"
@@ -19,6 +20,37 @@ export class DeepSeekHandler implements ApiHandler {
 		})
 	}

+	private async *yieldUsage(info: ModelInfo, usage: OpenAI.Completions.CompletionUsage | undefined): ApiStream {
+		// DeepSeek reports total input AND cache reads/writes
+		// (see context caching: https://api-docs.deepseek.com/guides/kv_cache),
+		// where the input tokens are the sum of the cache hits/misses, just like OpenAI.
+		// This affects:
+		// 1) the context management truncation algorithm, and
+		// 2) cost calculation.
+
+		// DeepSeek usage includes extra fields.
+		// Safely cast the prompt token details section to the appropriate structure.
+		interface DeepSeekUsage extends OpenAI.CompletionUsage {
+			prompt_cache_hit_tokens?: number
+			prompt_cache_miss_tokens?: number
+		}
+		const deepUsage = usage as DeepSeekUsage
+
+		const inputTokens = deepUsage?.prompt_tokens || 0
+		const outputTokens = deepUsage?.completion_tokens || 0
+		const cacheReadTokens = deepUsage?.prompt_cache_hit_tokens || 0
+		const cacheWriteTokens = deepUsage?.prompt_cache_miss_tokens || 0
+		const totalCost = calculateApiCostOpenAI(info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens)
+		yield {
+			type: "usage",
+			inputTokens: inputTokens,
+			outputTokens: outputTokens,
+			cacheWriteTokens: cacheWriteTokens,
+			cacheReadTokens: cacheReadTokens,
+			totalCost: totalCost,
+		}
+	}
+
 	@withRetry()
 	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		const model = this.getModel()
@@ -61,15 +93,7 @@ export class DeepSeekHandler implements ApiHandler {
 			}

 			if (chunk.usage) {
-				yield {
-					type: "usage",
-					inputTokens: chunk.usage.prompt_tokens || 0, // (deepseek reports total input AND cache reads/writes, see context caching: https://api-docs.deepseek.com/guides/kv_cache) where the input tokens is the sum of the cache hits/misses, while anthropic reports them as separate tokens. This is important to know for 1) context management truncation algorithm, and 2) cost calculation (NOTE: we report both input and cache stats but for now set input price to 0 since all the cost calculation will be done using cache hits/misses)
-					outputTokens: chunk.usage.completion_tokens || 0,
-					// @ts-ignore-next-line
-					cacheReadTokens: chunk.usage.prompt_cache_hit_tokens || 0,
-					// @ts-ignore-next-line
-					cacheWriteTokens: chunk.usage.prompt_cache_miss_tokens || 0,
-				}
+				yield* this.yieldUsage(model.info, chunk.usage)
 			}
 		}
 	}
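To make the new yieldUsage concrete, here is a hypothetical usage payload and the cost it would produce with the deepseek-chat prices added in src/shared/api.ts below. The token counts are invented, and the arithmetic assumes calculateApiCostOpenAI bills only the non-cached remainder of prompt_tokens at inputPrice (see the sketch near the top of this page):

// Hypothetical deepseek-chat request (numbers invented):
//   prompt_tokens            = 12_000  -> inputTokens (cache hits + misses)
//   prompt_cache_hit_tokens  = 10_000  -> cacheReadTokens
//   prompt_cache_miss_tokens =  2_000  -> cacheWriteTokens
//   completion_tokens        =    500  -> outputTokens
//
// With the deepseek-chat prices below (per 1M tokens: input 0.27, output 1.1,
// cache writes 0.27, cache reads 0.07), the non-cached input is
// 12_000 - 10_000 - 2_000 = 0, so the whole prompt is billed via cache
// reads and writes:
//   cost ≈ 2_000 * 0.27/1M + 10_000 * 0.07/1M + 500 * 1.1/1M ≈ $0.0018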

src/api/providers/openai-native.ts

Lines changed: 30 additions & 21 deletions
@@ -10,6 +10,7 @@ import {
 	openAiNativeModels,
 } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
+import { calculateApiCostOpenAI } from "../../utils/cost"
 import { ApiStream } from "../transform/stream"
 import { ChatCompletionReasoningEffort } from "openai/resources/chat/completions.mjs"

@@ -24,31 +25,47 @@ export class OpenAiNativeHandler implements ApiHandler {
 		})
 	}

+	private async *yieldUsage(info: ModelInfo, usage: OpenAI.Completions.CompletionUsage | undefined): ApiStream {
+		const inputTokens = usage?.prompt_tokens || 0
+		const outputTokens = usage?.completion_tokens || 0
+		const cacheReadTokens = usage?.prompt_tokens_details?.cached_tokens || 0
+		const cacheWriteTokens = 0
+		const totalCost = calculateApiCostOpenAI(info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens)
+		yield {
+			type: "usage",
+			inputTokens: inputTokens,
+			outputTokens: outputTokens,
+			cacheWriteTokens: cacheWriteTokens,
+			cacheReadTokens: cacheReadTokens,
+			totalCost: totalCost,
+		}
+	}
+
 	@withRetry()
 	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
-		switch (this.getModel().id) {
+		const model = this.getModel()
+
+		switch (model.id) {
 			case "o1":
 			case "o1-preview":
 			case "o1-mini": {
 				// o1 doesnt support streaming, non-1 temp, or system prompt
 				const response = await this.client.chat.completions.create({
-					model: this.getModel().id,
+					model: model.id,
 					messages: [{ role: "user", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
 				})
 				yield {
 					type: "text",
 					text: response.choices[0]?.message.content || "",
 				}
-				yield {
-					type: "usage",
-					inputTokens: response.usage?.prompt_tokens || 0,
-					outputTokens: response.usage?.completion_tokens || 0,
-				}
+
+				yield* this.yieldUsage(model.info, response.usage)
+
 				break
 			}
 			case "o3-mini": {
 				const stream = await this.client.chat.completions.create({
-					model: this.getModel().id,
+					model: model.id,
 					messages: [{ role: "developer", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
 					stream: true,
 					stream_options: { include_usage: true },
@@ -63,18 +80,15 @@ export class OpenAiNativeHandler implements ApiHandler {
 						}
 					}
 					if (chunk.usage) {
-						yield {
-							type: "usage",
-							inputTokens: chunk.usage.prompt_tokens || 0,
-							outputTokens: chunk.usage.completion_tokens || 0,
-						}
+						// Only last chunk contains usage
+						yield* this.yieldUsage(model.info, chunk.usage)
 					}
 				}
 				break
 			}
 			default: {
 				const stream = await this.client.chat.completions.create({
-					model: this.getModel().id,
+					model: model.id,
 					// max_completion_tokens: this.getModel().info.maxTokens,
 					temperature: 0,
 					messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
@@ -90,14 +104,9 @@ export class OpenAiNativeHandler implements ApiHandler {
 							text: delta.content,
 						}
 					}
-
-					// contains a null value except for the last chunk which contains the token usage statistics for the entire request
 					if (chunk.usage) {
-						yield {
-							type: "usage",
-							inputTokens: chunk.usage.prompt_tokens || 0,
-							outputTokens: chunk.usage.completion_tokens || 0,
-						}
+						// Only last chunk contains usage
+						yield* this.yieldUsage(model.info, chunk.usage)
 					}
 				}
 			}
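For the native OpenAI path, prompt_tokens in the usage payload already includes the cached portion reported under prompt_tokens_details.cached_tokens, and the payload exposes no cache-write counter, which is presumably why cacheWriteTokens is hard-coded to 0 in yieldUsage. A hypothetical final stream chunk's usage (field names follow OpenAI's CompletionUsage; the numbers are invented):

// Hypothetical usage from the last chunk of a gpt-4o streaming response:
const usage = {
	prompt_tokens: 8_000, // includes the cached portion below
	completion_tokens: 400,
	total_tokens: 8_400,
	prompt_tokens_details: { cached_tokens: 6_000 },
}
// yieldUsage above would report inputTokens 8_000, outputTokens 400,
// cacheReadTokens 6_000, and cacheWriteTokens 0.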

src/api/providers/requesty.ts

Lines changed: 12 additions & 5 deletions
@@ -1,6 +1,7 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import OpenAI from "openai"
 import { withRetry } from "../retry"
+import { calculateApiCostOpenAI } from "../../utils/cost"
 import { ApiHandlerOptions, ModelInfo, openAiModelInfoSaneDefaults } from "../../shared/api"
 import { ApiHandler } from "../index"
 import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -69,13 +70,19 @@ export class RequestyHandler implements ApiHandler {

 			if (chunk.usage) {
 				const usage = chunk.usage as RequestyUsage
+				const inputTokens = usage.prompt_tokens || 0
+				const outputTokens = usage.completion_tokens || 0
+				const cacheWriteTokens = usage.prompt_tokens_details?.caching_tokens || undefined
+				const cacheReadTokens = usage.prompt_tokens_details?.cached_tokens || undefined
+				const totalCost = 0 // TODO: Replace with calculateApiCostOpenAI(model.info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens)
+
 				yield {
 					type: "usage",
-					inputTokens: usage.prompt_tokens || 0,
-					outputTokens: usage.completion_tokens || 0,
-					cacheWriteTokens: usage.prompt_tokens_details?.caching_tokens || undefined,
-					cacheReadTokens: usage.prompt_tokens_details?.cached_tokens || undefined,
-					totalCost: usage.total_cost || undefined,
+					inputTokens: inputTokens,
+					outputTokens: outputTokens,
+					cacheWriteTokens: cacheWriteTokens,
+					cacheReadTokens: cacheReadTokens,
+					totalCost: totalCost,
 				}
 			}
 		}
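RequestyUsage is cast to here but declared elsewhere in requesty.ts, outside these hunks. Based on the fields the handler reads, it presumably extends OpenAI's usage type along these lines (an assumption for illustration, not the actual declaration):

import OpenAI from "openai"

// Assumed shape, inferred from the fields accessed above.
interface RequestyUsage extends OpenAI.CompletionUsage {
	prompt_tokens_details?: {
		cached_tokens?: number // cache reads
		caching_tokens?: number // cache writes
	}
	total_cost?: number // provider-reported cost; no longer forwarded after this change
}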

src/api/providers/vscode-lm.ts

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 import { Anthropic } from "@anthropic-ai/sdk"
 import * as vscode from "vscode"
 import { ApiHandler, SingleCompletionHandler } from "../"
-import { calculateApiCost } from "../../utils/cost"
+import { calculateApiCostAnthropic } from "../../utils/cost"
 import { ApiStream } from "../transform/stream"
 import { convertToVsCodeLmMessages } from "../transform/vscode-lm-format"
 import { SELECTOR_SEPARATOR, stringifyVsCodeLmModelSelector } from "../../shared/vsCodeSelectorUtils"
@@ -525,7 +525,7 @@ export class VsCodeLmHandler implements ApiHandler, SingleCompletionHandler {
 				type: "usage",
 				inputTokens: totalInputTokens,
 				outputTokens: totalOutputTokens,
-				totalCost: calculateApiCost(this.getModel().info, totalInputTokens, totalOutputTokens),
+				totalCost: calculateApiCostAnthropic(this.getModel().info, totalInputTokens, totalOutputTokens),
 			}
 		} catch (error: unknown) {
 			this.ensureCleanState()

src/core/Cline.ts

Lines changed: 8 additions & 2 deletions
@@ -47,7 +47,7 @@ import {
 import { getApiMetrics } from "../shared/getApiMetrics"
 import { HistoryItem } from "../shared/HistoryItem"
 import { ClineAskResponse, ClineCheckpointRestore } from "../shared/WebviewMessage"
-import { calculateApiCost } from "../utils/cost"
+import { calculateApiCostAnthropic } from "../utils/cost"
 import { fileExistsAtPath } from "../utils/fs"
 import { arePathsEqual, getReadablePath } from "../utils/path"
 import { fixModelHtmlEscaping, removeInvalidChars } from "../utils/string"
@@ -3115,7 +3115,13 @@ export class Cline {
 				cacheReads: cacheReadTokens,
 				cost:
 					totalCost ??
-					calculateApiCost(this.api.getModel().info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens),
+					calculateApiCostAnthropic(
+						this.api.getModel().info,
+						inputTokens,
+						outputTokens,
+						cacheWriteTokens,
+						cacheReadTokens,
+					),
 				cancelReason,
 				streamingFailedMessage,
 			} satisfies ClineApiReqInfo)
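One consequence of this fallback worth noting: ?? only falls through on null or undefined, so a provider that yields an explicit totalCost of 0, as the Requesty handler above now does, keeps that 0 rather than triggering the Anthropic-style calculation. A small illustration with invented values:

// Behaviour of the nullish-coalescing fallback (illustrative values only):
const fromProvider: number | undefined = 0 // e.g. Requesty's placeholder totalCost
const fallback = 0.0123 // what calculateApiCostAnthropic would return

const cost = fromProvider ?? fallback
// cost === 0: an explicit 0 from the provider is kept (unlike with ||), so
// Requesty requests will show a zero cost until its TODO is replaced with a
// real calculateApiCostOpenAI call; providers that yield no totalCost at all
// (undefined) still get the Anthropic-style fallback.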

src/shared/api.ts

Lines changed: 15 additions & 9 deletions
@@ -408,9 +408,10 @@ export const openAiNativeModels = {
 		maxTokens: 100_000,
 		contextWindow: 200_000,
 		supportsImages: false,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 1.1,
 		outputPrice: 4.4,
+		cacheReadsPrice: 0.55,
 	},
 	// don't support tool use yet
 	o1: {
@@ -420,38 +421,43 @@ export const openAiNativeModels = {
 		supportsPromptCache: false,
 		inputPrice: 15,
 		outputPrice: 60,
+		cacheReadsPrice: 7.5,
 	},
 	"o1-preview": {
 		maxTokens: 32_768,
 		contextWindow: 128_000,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 15,
 		outputPrice: 60,
+		cacheReadsPrice: 7.5,
 	},
 	"o1-mini": {
 		maxTokens: 65_536,
 		contextWindow: 128_000,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 1.1,
 		outputPrice: 4.4,
+		cacheReadsPrice: 0.55,
 	},
 	"gpt-4o": {
 		maxTokens: 4_096,
 		contextWindow: 128_000,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 2.5,
 		outputPrice: 10,
+		cacheReadsPrice: 1.25,
 	},
 	"gpt-4o-mini": {
 		maxTokens: 16_384,
 		contextWindow: 128_000,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 0.15,
 		outputPrice: 0.6,
+		cacheReadsPrice: 0.075,
 	},
 	"gpt-4.5-preview": {
 		maxTokens: 16_384,
@@ -477,8 +483,8 @@ export const deepSeekModels = {
 		maxTokens: 8_000,
 		contextWindow: 64_000,
 		supportsImages: false,
-		supportsPromptCache: true, // supports context caching, but not in the way anthropic does it (deepseek reports input tokens and reads/writes in the same usage report) FIXME: we need to show users cache stats how deepseek does it
-		inputPrice: 0, // technically there is no input price, it's all either a cache hit or miss (ApiOptions will not show this)
+		supportsPromptCache: true,
+		inputPrice: 0.27,
 		outputPrice: 1.1,
 		cacheWritesPrice: 0.27,
 		cacheReadsPrice: 0.07,
@@ -487,8 +493,8 @@ export const deepSeekModels = {
 		maxTokens: 8_000,
 		contextWindow: 64_000,
 		supportsImages: false,
-		supportsPromptCache: true, // supports context caching, but not in the way anthropic does it (deepseek reports input tokens and reads/writes in the same usage report) FIXME: we need to show users cache stats how deepseek does it
-		inputPrice: 0, // technically there is no input price, it's all either a cache hit or miss (ApiOptions will not show this)
+		supportsPromptCache: true,
+		inputPrice: 0.55,
 		outputPrice: 2.19,
 		cacheWritesPrice: 0.55,
 		cacheReadsPrice: 0.14,
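As a check on the new prices, here is a cached gpt-4o request priced the way the diff suggests: cacheReadsPrice at half of inputPrice, and calculateApiCostOpenAI assumed to subtract cached tokens from prompt_tokens before applying inputPrice (token counts invented):

// Hypothetical gpt-4o request: 8_000 prompt tokens, of which 6_000 were
// cache reads, plus 400 completion tokens.
const inputTokens = 8_000
const cacheReadTokens = 6_000
const outputTokens = 400

// Prices per 1M tokens from the gpt-4o entry above.
const inputPrice = 2.5
const cacheReadsPrice = 1.25
const outputPrice = 10

// Assuming the non-cached remainder is billed at inputPrice:
const cost =
	((inputTokens - cacheReadTokens) * inputPrice +
		cacheReadTokens * cacheReadsPrice +
		outputTokens * outputPrice) /
	1_000_000
// cost === 0.0165, i.e. about 1.7 cents; without caching the same request
// would be (8_000 * 2.5 + 400 * 10) / 1_000_000 === 0.024.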
