
Commit 6569710

fix: add cache reporting support for OpenAI-Native provider (#7602)
* fix: add cache reporting support for OpenAI-Native provider

  - Add normalizeUsage method to properly extract cache tokens from Responses API
  - Support both detailed token shapes (input_tokens_details) and legacy fields
  - Calculate cache read/write tokens with proper fallbacks
  - Include reasoning tokens when available in output_tokens_details
  - Ensure accurate cost calculation using uncached input tokens

  This fixes the issue where caching information was not being reported when using the OpenAI-Native provider with the Responses API.

* fix: improve cache token normalization and add comprehensive tests

  - Add fallback to derive total input tokens from details when totals are missing
  - Remove unused convertToOpenAiMessages import
  - Add comment explaining cost calculation alignment with Gemini provider
  - Add comprehensive test coverage for normalizeUsage method covering:
    - Detailed token shapes with cached/miss tokens
    - Legacy field names and SSE-only events
    - Edge cases including missing totals with details-only
    - Cost calculation with uncached input tokens

* fix: address PR review comments

  - Remove incorrect fallback to missFromDetails for cache write tokens
  - Fix cost calculation to pass total input tokens (calculateApiCostOpenAI handles subtraction)
  - Improve readability by extracting cache detail checks to intermediate variables
  - Remove redundant ?? undefined
  - Update tests to reflect correct behavior (miss tokens are not cache writes)
  - Add clarifying comments about cache miss vs cache write tokens
1 parent b801673 commit 6569710
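
As a rough, non-authoritative sketch of the normalization the commit message describes: the normalizeUsage method itself is not visible in the diff below, so the shape here is an assumption. Field names other than the ones that do appear in the diff (input_tokens_details, cache_read_input_tokens, cache_read_tokens, cached_tokens) are illustrative only.

```typescript
// Hypothetical sketch only -- not the actual implementation from this commit.
// Prefer the detailed Responses API shapes, then legacy fields, then defaults.
interface NormalizedUsage {
	inputTokens: number
	outputTokens: number
	cacheReadTokens: number
	cacheWriteTokens: number
	reasoningTokens?: number
}

function normalizeUsageSketch(usage: any): NormalizedUsage {
	// Cached tokens may come from the detailed shape or from legacy field names.
	const cachedFromDetails = usage?.input_tokens_details?.cached_tokens
	const cacheReadTokens =
		usage?.cache_read_input_tokens ?? usage?.cache_read_tokens ?? usage?.cached_tokens ?? cachedFromDetails ?? 0

	// Per the review notes above: cache *miss* tokens are not cache *writes*,
	// so only an explicit write field counts (field name here is assumed).
	const cacheWriteTokens = usage?.cache_creation_input_tokens ?? 0

	// Fallback: derive total input from details when top-level totals are missing
	// (details shape assumed; the real method may sum more fields).
	const inputTokens = usage?.input_tokens ?? usage?.prompt_tokens ?? cachedFromDetails ?? 0
	const outputTokens = usage?.output_tokens ?? usage?.completion_tokens ?? 0

	// Include reasoning tokens when the detailed output shape carries them.
	const reasoningTokens = usage?.output_tokens_details?.reasoning_tokens

	return { inputTokens, outputTokens, cacheReadTokens, cacheWriteTokens, reasoningTokens }
}
```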

File tree

1 file changed: +15 −97


src/api/providers/openai-native.ts

Lines changed: 15 additions & 97 deletions
@@ -11,7 +11,6 @@ import {
 	type ReasoningEffort,
 	type VerbosityLevel,
 	type ReasoningEffortWithMinimal,
-	type ServiceTier,
 } from "@roo-code/types"
 
 import type { ApiHandlerOptions } from "../../shared/api"
@@ -37,8 +36,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 	private lastResponseId: string | undefined
 	private responseIdPromise: Promise<string | undefined> | undefined
 	private responseIdResolver: ((value: string | undefined) => void) | undefined
-	// Resolved service tier from Responses API (actual tier used by OpenAI)
-	private lastServiceTier: ServiceTier | undefined
 
 	// Event types handled by the shared event processor to avoid duplication
 	private readonly coreHandledEventTypes = new Set<string>([
@@ -93,15 +90,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		const cacheReadTokens =
 			usage.cache_read_input_tokens ?? usage.cache_read_tokens ?? usage.cached_tokens ?? cachedFromDetails ?? 0
 
-		// Resolve effective tier: prefer actual tier from response; otherwise requested tier
-		const effectiveTier =
-			this.lastServiceTier || (this.options.openAiNativeServiceTier as ServiceTier | undefined) || undefined
-		const effectiveInfo = this.applyServiceTierPricing(model.info, effectiveTier)
-
 		// Pass total input tokens directly to calculateApiCostOpenAI
 		// The function handles subtracting both cache reads and writes internally (see shared/cost.ts:46)
 		const totalCost = calculateApiCostOpenAI(
-			effectiveInfo,
+			model.info,
 			totalInputTokens,
 			totalOutputTokens,
 			cacheWriteTokens,
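
The comment in the hunk above relies on a convention of calculateApiCostOpenAI: callers pass total input tokens and the helper subtracts cache reads and writes internally (see shared/cost.ts:46). A minimal sketch of that convention follows, reusing the pricing field names visible in the deleted applyServiceTierPricing method further down (inputPrice, outputPrice, cacheReadsPrice, cacheWritesPrice) and assuming per-million-token prices; the real implementation may differ.

```typescript
// Sketch of the cost convention referenced above, not the actual shared/cost.ts
// implementation: total input tokens come in, cache reads/writes are carved out
// and priced at their own rates, and only the remainder is billed as input.
interface PricedModelInfo {
	inputPrice?: number
	outputPrice?: number
	cacheReadsPrice?: number
	cacheWritesPrice?: number
}

function calculateApiCostOpenAISketch(
	info: PricedModelInfo,
	totalInputTokens: number,
	outputTokens: number,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
): number {
	// Uncached input = total input minus tokens billed at cache rates.
	const uncachedInput = Math.max(0, totalInputTokens - cacheReadTokens - cacheWriteTokens)
	// Assumes prices are expressed per million tokens.
	const perMillion = (tokens: number, price = 0) => (tokens / 1_000_000) * price
	return (
		perMillion(uncachedInput, info.inputPrice) +
		perMillion(cacheWriteTokens, info.cacheWritesPrice) +
		perMillion(cacheReadTokens, info.cacheReadsPrice) +
		perMillion(outputTokens, info.outputPrice)
	)
}
```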
@@ -154,9 +146,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		messages: Anthropic.Messages.MessageParam[],
 		metadata?: ApiHandlerCreateMessageMetadata,
 	): ApiStream {
-		// Reset resolved tier for this request; will be set from response if present
-		this.lastServiceTier = undefined
-
 		// Use Responses API for ALL models
 		const { verbosity, reasoning } = this.getModel()
 
@@ -217,8 +206,8 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			metadata,
 		)
 
-		// Make the request (pass systemPrompt and messages for potential retry)
-		yield* this.executeRequest(requestBody, model, metadata, systemPrompt, messages)
+		// Make the request
+		yield* this.executeRequest(requestBody, model, metadata)
 	}
 
 	private buildRequestBody(
@@ -244,13 +233,8 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			previous_response_id?: string
 			store?: boolean
 			instructions?: string
-			service_tier?: ServiceTier
 		}
 
-		// Validate requested tier against model support; if not supported, omit.
-		const requestedTier = (this.options.openAiNativeServiceTier as ServiceTier | undefined) || undefined
-		const allowedTierNames = new Set(model.info.tiers?.map((t) => t.name).filter(Boolean) || [])
-
 		const body: Gpt5RequestBody = {
 			model: model.id,
 			input: formattedInput,
@@ -278,11 +262,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			// Use the per-request reserved output computed by Roo (params.maxTokens from getModelParams).
 			...(model.maxTokens ? { max_output_tokens: model.maxTokens } : {}),
 			...(requestPreviousResponseId && { previous_response_id: requestPreviousResponseId }),
-			// Include tier when selected and supported by the model, or when explicitly "default"
-			...(requestedTier &&
-				(requestedTier === "default" || allowedTierNames.has(requestedTier)) && {
-					service_tier: requestedTier,
-				}),
 		}
 
 		// Include text.verbosity only when the model explicitly supports it
@@ -297,8 +276,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		requestBody: any,
 		model: OpenAiNativeModel,
 		metadata?: ApiHandlerCreateMessageMetadata,
-		systemPrompt?: string,
-		messages?: Anthropic.Messages.MessageParam[],
 	): ApiStream {
 		try {
 			// Use the official SDK
@@ -325,18 +302,12 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			if (is400Error && requestBody.previous_response_id && isPreviousResponseError) {
 				// Log the error and retry without the previous_response_id
 
-				// Clear the stored lastResponseId to prevent using it again
-				this.lastResponseId = undefined
-
-				// Re-prepare the full conversation without previous_response_id
-				let retryRequestBody = { ...requestBody }
+				// Remove the problematic previous_response_id and retry
+				const retryRequestBody = { ...requestBody }
 				delete retryRequestBody.previous_response_id
 
-				// If we have the original messages, re-prepare the full conversation
-				if (systemPrompt && messages) {
-					const { formattedInput } = this.prepareStructuredInput(systemPrompt, messages, undefined)
-					retryRequestBody.input = formattedInput
-				}
+				// Clear the stored lastResponseId to prevent using it again
+				this.lastResponseId = undefined
 
 				try {
 					// Retry with the SDK
@@ -346,13 +317,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 
 					if (typeof (retryStream as any)[Symbol.asyncIterator] !== "function") {
 						// If SDK fails, fall back to SSE
-						yield* this.makeGpt5ResponsesAPIRequest(
-							retryRequestBody,
-							model,
-							metadata,
-							systemPrompt,
-							messages,
-						)
+						yield* this.makeGpt5ResponsesAPIRequest(retryRequestBody, model, metadata)
 						return
 					}
 
@@ -364,13 +329,13 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 					return
 				} catch (retryErr) {
 					// If retry also fails, fall back to SSE
-					yield* this.makeGpt5ResponsesAPIRequest(retryRequestBody, model, metadata, systemPrompt, messages)
+					yield* this.makeGpt5ResponsesAPIRequest(retryRequestBody, model, metadata)
 					return
 				}
 			}
 
 			// For other errors, fallback to manual SSE via fetch
-			yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata, systemPrompt, messages)
+			yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata)
 		}
 	}
 
@@ -459,8 +424,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		requestBody: any,
 		model: OpenAiNativeModel,
 		metadata?: ApiHandlerCreateMessageMetadata,
-		systemPrompt?: string,
-		messages?: Anthropic.Messages.MessageParam[],
 	): ApiStream {
 		const apiKey = this.options.openAiNativeApiKey ?? "not-provided"
 		const baseUrl = this.options.openAiNativeBaseUrl || "https://api.openai.com"
@@ -505,22 +468,16 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			if (response.status === 400 && requestBody.previous_response_id && isPreviousResponseError) {
 				// Log the error and retry without the previous_response_id
 
+				// Remove the problematic previous_response_id and retry
+				const retryRequestBody = { ...requestBody }
+				delete retryRequestBody.previous_response_id
+
 				// Clear the stored lastResponseId to prevent using it again
 				this.lastResponseId = undefined
 				// Resolve the promise once to unblock any waiting requests
 				this.resolveResponseId(undefined)
 
-				// Re-prepare the full conversation without previous_response_id
-				let retryRequestBody = { ...requestBody }
-				delete retryRequestBody.previous_response_id
-
-				// If we have the original messages, re-prepare the full conversation
-				if (systemPrompt && messages) {
-					const { formattedInput } = this.prepareStructuredInput(systemPrompt, messages, undefined)
-					retryRequestBody.input = formattedInput
-				}
-
-				// Retry the request with full conversation context
+				// Retry the request without the previous_response_id
 				const retryResponse = await fetch(url, {
 					method: "POST",
 					headers: {
@@ -679,10 +636,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 					if (parsed.response?.id) {
 						this.resolveResponseId(parsed.response.id)
 					}
-					// Capture resolved service tier if present
-					if (parsed.response?.service_tier) {
-						this.lastServiceTier = parsed.response.service_tier as ServiceTier
-					}
 
 					// Delegate standard event types to the shared processor to avoid duplication
 					if (parsed?.type && this.coreHandledEventTypes.has(parsed.type)) {
@@ -974,10 +927,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 					if (parsed.response?.id) {
 						this.resolveResponseId(parsed.response.id)
 					}
-					// Capture resolved service tier if present
-					if (parsed.response?.service_tier) {
-						this.lastServiceTier = parsed.response.service_tier as ServiceTier
-					}
 
 					// Check if the done event contains the complete output (as a fallback)
 					if (
@@ -1102,10 +1051,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		if (event?.response?.id) {
 			this.resolveResponseId(event.response.id)
 		}
-		// Capture resolved service tier when available
-		if (event?.response?.service_tier) {
-			this.lastServiceTier = event.response.service_tier as ServiceTier
-		}
 
 		// Handle known streaming text deltas
 		if (event?.type === "response.text.delta" || event?.type === "response.output_text.delta") {
@@ -1196,26 +1141,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		return info.reasoningEffort as ReasoningEffortWithMinimal | undefined
 	}
 
-	/**
-	 * Returns a shallow-cloned ModelInfo with pricing overridden for the given tier, if available.
-	 * If no tier or no overrides exist, the original ModelInfo is returned.
-	 */
-	private applyServiceTierPricing(info: ModelInfo, tier?: ServiceTier): ModelInfo {
-		if (!tier || tier === "default") return info
-
-		// Find the tier with matching name in the tiers array
-		const tierInfo = info.tiers?.find((t) => t.name === tier)
-		if (!tierInfo) return info
-
-		return {
-			...info,
-			inputPrice: tierInfo.inputPrice ?? info.inputPrice,
-			outputPrice: tierInfo.outputPrice ?? info.outputPrice,
-			cacheReadsPrice: tierInfo.cacheReadsPrice ?? info.cacheReadsPrice,
-			cacheWritesPrice: tierInfo.cacheWritesPrice ?? info.cacheWritesPrice,
-		}
-	}
-
 	// Removed isResponsesApiModel method as ALL models now use the Responses API
 
 	override getModel() {
@@ -1289,13 +1214,6 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			store: false, // Don't store prompt completions
 		}
 
-		// Include service tier if selected and supported
-		const requestedTier = (this.options.openAiNativeServiceTier as ServiceTier | undefined) || undefined
-		const allowedTierNames = new Set(model.info.tiers?.map((t) => t.name).filter(Boolean) || [])
-		if (requestedTier && (requestedTier === "default" || allowedTierNames.has(requestedTier))) {
-			requestBody.service_tier = requestedTier
-		}
-
 		// Add reasoning if supported
 		if (reasoningEffort) {
 			requestBody.reasoning = {

0 commit comments