
Commit fe1a8df

hustxiayang, yuzisun, mathetake, and aabchoo authored
fix: make usage chunk in stream mode of gemini compatible with openai (#1503)
**Description**

Users found that simply using the "usage" information does not work for streaming responses of Gemini models.

![image](https://github.com/user-attachments/assets/334a5515-a57b-4f83-9bab-f296422409b8)

This is because for OpenAI models, the usage arrives as a separate, final chunk. For example, here is a response from gpt-4o:

```
...
chunk=ChatCompletionChunk(id='chatcmpl-CYv6DPkWfT1xrsS2ySOoRztQKnZDg', choices=[Choice(delta=ChoiceDelta(content=None, function_call=None, refusal=None, role=None, tool_calls=None), finish_reason='stop', index=0, logprobs=None, content_filter_result={'error': {'code': 'content_filter_error', 'message': 'The contents are not filtered'}})], created=1762438677, model='azure.gpt-4o', object='chat.completion.chunk', service_tier=None, system_fingerprint='fp_4a331a0222', usage=None, obfuscation='2xP')

chunk=ChatCompletionChunk(id='chatcmpl-CYv6DPkWfT1xrsS2ySOoRztQKnZDg', choices=[], created=1762438677, model='azure.gpt-4o', object='chat.completion.chunk', service_tier=None, system_fingerprint='fp_4a331a0222', usage=CompletionUsage(completion_tokens=17, prompt_tokens=12, total_tokens=29, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)), obfuscation='xY40HsJr')
```

There is a finish_reason chunk, and then a separate usage chunk. This change makes the Gemini translation follow the same convention so that it is compatible with OpenAI. (The Anthropic translation is already compatible.)

---------

Signed-off-by: yxia216 <[email protected]>
Signed-off-by: Takeshi Yoneda <[email protected]>
Co-authored-by: Dan Sun <[email protected]>
Co-authored-by: Takeshi Yoneda <[email protected]>
Co-authored-by: Aaron Choo <[email protected]>
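For context, below is a minimal sketch of how an OpenAI-compatible client would pick the usage out of such a stream: it scans the SSE events and treats the chunk with an empty `choices` array and a non-null `usage` as the usage chunk. The types and helper names here are illustrative, not part of this repository.

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"strings"
)

// usage mirrors a subset of the OpenAI "usage" object.
type usage struct {
	PromptTokens     int `json:"prompt_tokens"`
	CompletionTokens int `json:"completion_tokens"`
	TotalTokens      int `json:"total_tokens"`
}

// chunk is a minimal view of a chat.completion.chunk payload.
type chunk struct {
	Choices []json.RawMessage `json:"choices"`
	Usage   *usage            `json:"usage"`
}

// readUsage scans an SSE stream and returns the usage reported by the
// trailing usage-only chunk (empty "choices", non-null "usage").
func readUsage(r io.Reader) (*usage, error) {
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, "data: ") {
			continue // skip blank separators between SSE events
		}
		payload := strings.TrimPrefix(line, "data: ")
		if payload == "[DONE]" {
			break
		}
		var c chunk
		if err := json.Unmarshal([]byte(payload), &c); err != nil {
			return nil, err
		}
		if len(c.Choices) == 0 && c.Usage != nil {
			return c.Usage, nil
		}
	}
	return nil, fmt.Errorf("stream ended without a usage chunk")
}

func main() {
	stream := "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"Hi\"}}],\"object\":\"chat.completion.chunk\"}\n\n" +
		"data: {\"choices\":[],\"object\":\"chat.completion.chunk\",\"usage\":{\"prompt_tokens\":10,\"completion_tokens\":7,\"total_tokens\":17}}\n\n" +
		"data: [DONE]\n"
	u, err := readUsage(strings.NewReader(stream))
	if err != nil {
		panic(err)
	}
	fmt.Printf("prompt=%d completion=%d total=%d\n", u.PromptTokens, u.CompletionTokens, u.TotalTokens)
}
```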
1 parent d78bbb9 · commit fe1a8df

File tree: 3 files changed (+47 -23 lines)


internal/translator/openai_gcpvertexai.go

Lines changed: 38 additions & 20 deletions
```diff
@@ -194,8 +194,41 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
 		// Convert GCP chunk to OpenAI chunk.
 		openAIChunk := o.convertGCPChunkToOpenAI(chunk)
 
-		// Extract token usage if present in this chunk (typically in the last chunk).
-		if chunk.UsageMetadata != nil {
+		// Serialize to SSE format as expected by OpenAI API.
+		err := serializeOpenAIChatCompletionChunk(*openAIChunk, &newBody)
+		if err != nil {
+			return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
+		}
+
+		if span != nil {
+			span.RecordResponseChunk(openAIChunk)
+		}
+
+		// Extract token usage only in the last chunk.
+		if chunk.UsageMetadata != nil && chunk.UsageMetadata.PromptTokenCount > 0 {
+			// Convert usage to pointer if available.
+			usage := ptr.To(geminiUsageToOpenAIUsage(chunk.UsageMetadata))
+
+			usageChunk := openai.ChatCompletionResponseChunk{
+				ID:      chunk.ResponseID,
+				Created: openai.JSONUNIXTime(chunk.CreateTime),
+				Object:  "chat.completion.chunk",
+				Choices: []openai.ChatCompletionResponseChunkChoice{},
+				// usage is nil for all chunks other than the last chunk
+				Usage: usage,
+				Model: o.requestModel,
+			}
+
+			// Serialize to SSE format as expected by OpenAI API.
+			err := serializeOpenAIChatCompletionChunk(usageChunk, &newBody)
+			if err != nil {
+				return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
+			}
+
+			if span != nil {
+				span.RecordResponseChunk(&usageChunk)
+			}
+
 			if chunk.UsageMetadata.PromptTokenCount >= 0 {
 				tokenUsage.SetInputTokens(uint32(chunk.UsageMetadata.PromptTokenCount)) //nolint:gosec
 			}
@@ -209,16 +242,6 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) handleStreamingResponse(
 				tokenUsage.SetCachedInputTokens(uint32(chunk.UsageMetadata.CachedContentTokenCount)) //nolint:gosec
 			}
 		}
-
-		// Serialize to SSE format as expected by OpenAI API.
-		err := serializeOpenAIChatCompletionChunk(*openAIChunk, &newBody)
-		if err != nil {
-			return nil, nil, metrics.TokenUsage{}, "", fmt.Errorf("error marshaling OpenAI chunk: %w", err)
-		}
-
-		if span != nil {
-			span.RecordResponseChunk(openAIChunk)
-		}
 	}
 
 	if endOfStream {
@@ -381,19 +404,14 @@ func (o *openAIToGCPVertexAITranslatorV1ChatCompletion) convertGCPChunkToOpenAI(
 		choices = []openai.ChatCompletionResponseChunkChoice{}
 	}
 
-	// Convert usage to pointer if available.
-	var usage *openai.Usage
-	if chunk.UsageMetadata != nil {
-		usage = ptr.To(geminiUsageToOpenAIUsage(chunk.UsageMetadata))
-	}
-
 	return &openai.ChatCompletionResponseChunk{
 		ID:      chunk.ResponseID,
 		Created: openai.JSONUNIXTime(chunk.CreateTime),
 		Object:  "chat.completion.chunk",
 		Choices: choices,
-		Usage:   usage,
-		Model:   o.requestModel,
+		// usage is nil for all chunks other than the last chunk
+		Usage: nil,
+		Model: o.requestModel,
 	}
 }
```
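The new code path relies on `serializeOpenAIChatCompletionChunk` to frame each chunk as a server-sent event. As a rough sketch of what that framing looks like (the repository's actual helper may differ in signature and details), assuming the standard `data: <json>\n\n` convention that the tests below assert on:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// writeSSEChunk is a hypothetical stand-in for serializeOpenAIChatCompletionChunk:
// it JSON-encodes a chunk and appends it to the buffer with SSE framing.
func writeSSEChunk(buf *bytes.Buffer, chunk any) error {
	payload, err := json.Marshal(chunk)
	if err != nil {
		return fmt.Errorf("error marshaling OpenAI chunk: %w", err)
	}
	buf.WriteString("data: ")
	buf.Write(payload)
	buf.WriteString("\n\n")
	return nil
}

func main() {
	var buf bytes.Buffer
	_ = writeSSEChunk(&buf, map[string]string{"object": "chat.completion.chunk"})
	fmt.Print(buf.String()) // data: {"object":"chat.completion.chunk"}
}
```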

internal/translator/openai_gcpvertexai_test.go

Lines changed: 6 additions & 2 deletions
```diff
@@ -1019,7 +1019,9 @@ func TestOpenAIToGCPVertexAITranslatorV1ChatCompletion_ResponseBody(t *testing.T
 			endOfStream:   true,
 			wantError:     false,
 			wantHeaderMut: nil,
-			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk"}
+
+data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `),
@@ -1228,7 +1230,9 @@ data: {"candidates":[{"content":{"parts":[{"text":"Hello"}]}}],"usageMetadata":{
 			wantHeaderMut: nil,
 			wantBodyMut: []byte(`data: {"choices":[{"index":0,"delta":{"role":"assistant","reasoning_content":{"text":"let me think step by step and reply you."}}}],"object":"chat.completion.chunk"}
 
-data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+data: {"choices":[{"index":0,"delta":{"content":"Hello","role":"assistant"}}],"object":"chat.completion.chunk"}
+
+data: {"object":"chat.completion.chunk","usage":{"prompt_tokens":5,"completion_tokens":3,"total_tokens":8,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `),
```

tests/extproc/testupstream_test.go

Lines changed: 3 additions & 1 deletion
```diff
@@ -579,7 +579,9 @@ data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":" you","role":"as
 
 data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":" today","role":"assistant"}}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk"}
 
-data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":"?","role":"assistant"},"finish_reason":"stop"}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk","usage":{"prompt_tokens":10,"completion_tokens":7,"total_tokens":17,"completion_tokens_details":{},"prompt_tokens_details":{}}}
+data: {"id":"msg_123","choices":[{"index":0,"delta":{"content":"?","role":"assistant"},"finish_reason":"stop"}],"created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk"}
+
+data: {"id":"msg_123","created":123,"model":"gemini-1.5-pro","object":"chat.completion.chunk","usage":{"prompt_tokens":10,"completion_tokens":7,"total_tokens":17,"completion_tokens_details":{},"prompt_tokens_details":{}}}
 
 data: [DONE]
 `,
```
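To observe the new framing end to end, one can replay a streaming request against a running gateway and dump the raw SSE lines. The URL, port, and model below are placeholders, and `stream_options.include_usage` is the OpenAI-side opt-in rather than something this diff inspects. With this fix, a Gemini-backed stream should end with a finish_reason chunk, then a usage-only chunk, then `data: [DONE]`.

```go
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strings"
)

func main() {
	// Placeholder endpoint and model; adjust to your gateway deployment.
	body := `{"model":"gemini-1.5-pro","stream":true,` +
		`"stream_options":{"include_usage":true},` +
		`"messages":[{"role":"user","content":"Hi"}]}`
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", strings.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Print each SSE line as-is; the usage-only chunk arrives just before [DONE].
	sc := bufio.NewScanner(resp.Body)
	for sc.Scan() {
		fmt.Println(sc.Text())
	}
}
```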
