60 changes: 47 additions & 13 deletions internal/extproc/messages_processor.go
@@ -6,10 +6,12 @@
package extproc

import (
"bytes"
"cmp"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"

corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
@@ -147,6 +149,7 @@ type messagesProcessorUpstreamFilter struct {
stream bool
metrics metrics.ChatCompletionMetrics
costs translator.LLMTokenUsage
gzipBuffer []byte
}

// selectTranslator selects the translator based on the output schema.
@@ -266,22 +269,53 @@ func (c *messagesProcessorUpstreamFilter) ProcessResponseBody(ctx context.Contex
}
}()

// Decompress the body if needed using common utility.
decodingResult, err := decodeContentIfNeeded(body.Body, c.responseEncoding)
if err != nil {
return nil, err
}
var headerMutation *extprocv3.HeaderMutation
var bodyMutation *extprocv3.BodyMutation
var tokenUsage translator.LLMTokenUsage
var responseModel internalapi.ResponseModel

// headerMutation, bodyMutation, tokenUsage, err := c.translator.ResponseBody(c.responseHeaders, br, body.EndOfStream).
headerMutation, bodyMutation, tokenUsage, responseModel, err := c.translator.ResponseBody(c.responseHeaders, decodingResult.reader, body.EndOfStream)
if err != nil {
return nil, fmt.Errorf("failed to transform response: %w", err)
}
if c.stream && !body.EndOfStream {
// For streaming intermediate chunks: buffer data and check if decompression succeeded
decodingResult, err := decodeContentWithBuffering(body.Body, c.responseEncoding, &c.gzipBuffer, body.EndOfStream)
if err != nil {
return nil, err
}

// Check if we got decompressed data (successful buffering completion)
data, _ := io.ReadAll(decodingResult.reader)
if len(data) > 0 {
// Decompression succeeded! Process the complete response
decodingResult.reader = bytes.NewReader(data)
headerMutation, bodyMutation, tokenUsage, responseModel, err = c.translator.ResponseBody(c.responseHeaders, decodingResult.reader, c.stream)
if err != nil {
return nil, fmt.Errorf("failed to transform response: %w", err)
}
c.metrics.SetResponseModel(responseModel)
} else {
// Still buffering incomplete data - pass through with no mutations
headerMutation, bodyMutation = nil, nil
tokenUsage = translator.LLMTokenUsage{}
}
} else {
// For non-streaming OR final streaming chunk: decompress and translate
decodingResult, err := decodeContentWithBuffering(body.Body, c.responseEncoding, &c.gzipBuffer, body.EndOfStream)
if err != nil {
return nil, err
}

c.metrics.SetResponseModel(responseModel)
// Process the decompressed data
data, _ := io.ReadAll(decodingResult.reader)
decodingResult.reader = bytes.NewReader(data)
headerMutation, bodyMutation, tokenUsage, responseModel, err = c.translator.ResponseBody(c.responseHeaders, decodingResult.reader, c.stream)
if err != nil {
return nil, fmt.Errorf("failed to transform response: %w", err)
}

c.metrics.SetResponseModel(responseModel)

// Remove the content-encoding header if the original body was encoded but mutated in the processor.
headerMutation = removeContentEncodingIfNeeded(headerMutation, bodyMutation, decodingResult.isEncoded)
// Remove the content-encoding header if the original body was encoded but mutated in the processor.
headerMutation = removeContentEncodingIfNeeded(headerMutation, bodyMutation, decodingResult.isEncoded)
}

resp := &extprocv3.ProcessingResponse{
Response: &extprocv3.ProcessingResponse_ResponseBody{
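To make the new contract concrete, here is a minimal sketch of how a caller drives the buffering loop (hypothetical helper, assuming it lives inside the extproc package next to decodeContentWithBuffering): intermediate chunks come back as an empty reader until the accumulated gzip stream decompresses cleanly or the stream ends.

```go
package extproc

import "io"

// exampleBufferedDecode illustrates the buffering contract: feed gzip chunks
// until decodeContentWithBuffering returns a non-empty reader, which signals
// that decompression (or the end-of-stream passthrough) produced usable bytes.
func exampleBufferedDecode(chunks [][]byte) ([]byte, error) {
	var gzipBuffer []byte
	for i, chunk := range chunks {
		endOfStream := i == len(chunks)-1
		result, err := decodeContentWithBuffering(chunk, "gzip", &gzipBuffer, endOfStream)
		if err != nil {
			return nil, err
		}
		data, err := io.ReadAll(result.reader)
		if err != nil {
			return nil, err
		}
		if len(data) > 0 {
			// Decompressed body, or raw buffered passthrough at end of stream.
			return data, nil
		}
		// Empty reader: the gzip stream is still incomplete, keep feeding chunks.
	}
	return nil, nil
}
```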
34 changes: 14 additions & 20 deletions internal/extproc/translator/anthropic_gcpanthropic.go
@@ -90,7 +90,7 @@ func (a *anthropicToGCPAnthropicTranslator) ResponseHeaders(_ map[string]string)

// ResponseBody implements [AnthropicMessagesTranslator.ResponseBody] for Anthropic to GCP Anthropic.
// This is essentially a passthrough since both use the same Anthropic response format.
func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, body io.Reader, endOfStream bool) (
func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, body io.Reader, isStreaming bool) (
Contributor:
Why rename this? endOfStream is different from isStreaming.

Contributor Author:
Good question, but I do mean isStreaming rather than endOfStream. We basically have two cases:

  1. translation for streaming
  2. translation for a regular request

This boolean tells us which one to use.

Since you raised the question, let me elaborate: previously we did the translation while streaming; now, because we accumulate the chunks, we no longer care about endOfStream. Let me know if that makes sense.

Contributor:
Can you leave a comment so other maintainers are aware of the difference?

headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, tokenUsage LLMTokenUsage, responseModel string, err error,
) {
// Read the response body for both streaming and non-streaming.
@@ -99,8 +99,8 @@ func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, bo
return nil, nil, LLMTokenUsage{}, "", fmt.Errorf("failed to read response body: %w", err)
}

// For streaming chunks, parse SSE format to extract token usage.
if !endOfStream {
// For streaming requests, parse SSE format to extract token usage.
if isStreaming {
// Parse SSE format - split by lines and look for data: lines.
for line := range bytes.Lines(bodyBytes) {
line = bytes.TrimSpace(line)
Expand All @@ -117,14 +117,16 @@ func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, bo
switch eventType {
case "message_start":
// Extract input tokens from message.usage.
// Now handles complete response with potentially multiple events.
if messageData, ok := eventData["message"].(map[string]any); ok {
if usageData, ok := messageData["usage"].(map[string]any); ok {
if inputTokens, ok := usageData["input_tokens"].(float64); ok {
tokenUsage.InputTokens = uint32(inputTokens) //nolint:gosec
// Accumulate input tokens (though typically only one message_start per conversation)
tokenUsage.InputTokens += uint32(inputTokens) //nolint:gosec
}
// Some message_start events may include initial output tokens.
if outputTokens, ok := usageData["output_tokens"].(float64); ok && outputTokens > 0 {
tokenUsage.OutputTokens = uint32(outputTokens) //nolint:gosec
tokenUsage.OutputTokens += uint32(outputTokens) //nolint:gosec
}
tokenUsage.TotalTokens = tokenUsage.InputTokens + tokenUsage.OutputTokens
}
@@ -143,18 +145,16 @@ func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, bo
}
}

return nil, &extprocv3.BodyMutation{
Mutation: &extprocv3.BodyMutation_Body{Body: bodyBytes},
}, tokenUsage, a.requestModel, nil
// For streaming responses, we only extract token usage, don't modify the body
// Return nil bodyMutation to pass through original data (potentially gzipped)
return nil, nil, tokenUsage, a.requestModel, nil
}

// Parse the Anthropic response to extract token usage.
var anthropicResp anthropic.Message
if err = json.Unmarshal(bodyBytes, &anthropicResp); err != nil {
// If we can't parse as Anthropic format, pass through as-is.
return nil, &extprocv3.BodyMutation{
Mutation: &extprocv3.BodyMutation_Body{Body: bodyBytes},
}, LLMTokenUsage{}, a.requestModel, nil
// If we can't parse as Anthropic format, pass through as-is without modification.
return nil, nil, LLMTokenUsage{}, a.requestModel, nil
}

// Extract token usage from the response.
Expand All @@ -164,12 +164,6 @@ func (a *anthropicToGCPAnthropicTranslator) ResponseBody(_ map[string]string, bo
TotalTokens: uint32(anthropicResp.Usage.InputTokens + anthropicResp.Usage.OutputTokens), //nolint:gosec
}

// Pass through the response body unchanged since both input and output are Anthropic format.
headerMutation = &extprocv3.HeaderMutation{}
setContentLength(headerMutation, bodyBytes)
bodyMutation = &extprocv3.BodyMutation{
Mutation: &extprocv3.BodyMutation_Body{Body: bodyBytes},
}

return headerMutation, bodyMutation, tokenUsage, a.requestModel, nil
// Pass through the response body unchanged - don't create body mutation to preserve original encoding.
return nil, nil, tokenUsage, a.requestModel, nil
}
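A hedged sketch of the doc comment the reviewer asked for above (suggested wording only, not committed in this PR):

```go
// ResponseBody implements [AnthropicMessagesTranslator.ResponseBody] for
// Anthropic to GCP Anthropic.
//
// Note: the final parameter is isStreaming, not endOfStream. It selects one
// of two translation modes: when true, body holds accumulated SSE events and
// only token usage is extracted; when false, body is a single JSON response.
// Because chunks are buffered upstream, the translator no longer needs to
// know where the stream ends.
```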
12 changes: 6 additions & 6 deletions internal/extproc/translator/anthropic_gcpanthropic_test.go
@@ -569,12 +569,12 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingTokenUsage(t *t
bodyReader := bytes.NewReader([]byte(tt.chunk))
respHeaders := map[string]string{"content-type": "application/json"}

headerMutation, bodyMutation, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, tt.endOfStream)
// These tests are for streaming SSE chunks, so use isStreaming=true
headerMutation, bodyMutation, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, true)

require.NoError(t, err)
require.Nil(t, headerMutation)
require.NotNil(t, bodyMutation)
require.Equal(t, tt.expectedBody, string(bodyMutation.GetBody()))
require.Nil(t, bodyMutation) // No body mutation to preserve original encoding
require.Equal(t, tt.expectedUsage, tokenUsage)
})
}
@@ -649,12 +649,12 @@ func TestAnthropicToGCPAnthropicTranslator_ResponseBody_StreamingEdgeCases(t *te
bodyReader := bytes.NewReader([]byte(tt.chunk))
respHeaders := map[string]string{"content-type": "application/json"}

headerMutation, bodyMutation, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, false)
// These are streaming edge case tests, so use isStreaming=true
headerMutation, bodyMutation, tokenUsage, _, err := translator.ResponseBody(respHeaders, bodyReader, true)

require.NoError(t, err)
require.Nil(t, headerMutation)
require.NotNil(t, bodyMutation)
require.Equal(t, tt.chunk, string(bodyMutation.GetBody()))
require.Nil(t, bodyMutation) // No body mutation to preserve original encoding
require.Equal(t, tt.expectedUsage, tokenUsage)
})
}
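For context, the chunks these tests feed are Anthropic-style SSE events; a representative message_start event (field values illustrative, not taken from the tests) looks like:

```text
event: message_start
data: {"type":"message_start","message":{"usage":{"input_tokens":25,"output_tokens":1}}}
```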
77 changes: 77 additions & 0 deletions internal/extproc/util.go
@@ -10,6 +10,7 @@ import (
"compress/gzip"
"fmt"
"io"
"log/slog"

extprocv3 "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
)
@@ -42,6 +43,82 @@ func decodeContentIfNeeded(body []byte, contentEncoding string) (contentDecoding
}
}

// decodeContentWithBuffering decompresses response body with buffering support for streaming.
// Accumulates chunks in the provided buffer until complete gzip data is available.
// Returns a reader for the (potentially decompressed) body and metadata about the encoding.
func decodeContentWithBuffering(body []byte, contentEncoding string, gzipBuffer *[]byte, endOfStream bool) (contentDecodingResult, error) {
switch contentEncoding {
case "gzip":

// Accumulate chunks in buffer
*gzipBuffer = append(*gzipBuffer, body...)

// Try to decompress the accumulated buffer
if len(*gzipBuffer) > 0 {
gzipReader, err := gzip.NewReader(bytes.NewReader(*gzipBuffer))
if err != nil {
Contributor:
Do we want to check for a specific error like "unexpected EOF"?

Contributor Author:
Not sure; I'm not that familiar with gzip errors, so I don't know what the other errors are or whether we can keep buffering through them.

// If it's not endOfStream, keep buffering
if !endOfStream {
return contentDecodingResult{
reader: bytes.NewReader(nil), // Empty reader to signal buffering in progress
isEncoded: true,
}, nil
}
// If endOfStream and still can't read, pass through buffered data
slog.Info("gzip buffering: invalid header at end of stream, passing through buffered data",
"error", err,
"buffer_size", len(*gzipBuffer))
result := contentDecodingResult{
reader: bytes.NewReader(*gzipBuffer),
isEncoded: true,
}
*gzipBuffer = nil // Clear buffer
return result, nil
}
defer gzipReader.Close()

decompressedBody, err := io.ReadAll(gzipReader)
if err != nil {
// If it's not endOfStream, keep buffering
if !endOfStream {
return contentDecodingResult{
reader: bytes.NewReader(nil), // Empty reader to signal buffering in progress
isEncoded: true,
}, nil
}
// If endOfStream and decompression failed, pass through buffered data
slog.Info("gzip buffering: decompression failed at end of stream, passing through buffered data",
"error", err,
"buffer_size", len(*gzipBuffer))
result := contentDecodingResult{
reader: bytes.NewReader(*gzipBuffer),
isEncoded: true,
}
*gzipBuffer = nil // Clear buffer
return result, nil
}

// Successfully decompressed!
*gzipBuffer = nil // Clear buffer
return contentDecodingResult{
reader: bytes.NewReader(decompressedBody),
isEncoded: true,
}, nil
}

// Empty buffer, return empty
return contentDecodingResult{
reader: bytes.NewReader(nil), // Empty reader for empty buffer
isEncoded: true,
}, nil
default:
return contentDecodingResult{
reader: bytes.NewReader(body),
isEncoded: false,
}, nil
}
}

// removeContentEncodingIfNeeded removes the content-encoding header if the body was modified and was encoded.
// This is needed when the transformation modifies the body content but the response was originally compressed.
func removeContentEncodingIfNeeded(headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, isEncoded bool) *extprocv3.HeaderMutation {
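On the open question above: the standard library does export error sentinels that could let the buffering path distinguish a truncated stream from corrupt data. A minimal sketch of one possible refinement (hypothetical helper, not part of this PR):

```go
package extproc

import (
	"compress/gzip"
	"errors"
	"io"
)

// isRecoverableGzipError reports whether a decompression error could be
// resolved by buffering more bytes, versus data that will never decompress.
func isRecoverableGzipError(err error) bool {
	switch {
	case errors.Is(err, io.ErrUnexpectedEOF):
		// The stream is valid so far but truncated; later chunks may complete it.
		return true
	case errors.Is(err, gzip.ErrHeader), errors.Is(err, gzip.ErrChecksum):
		// The data is malformed; no amount of buffering will fix it.
		return false
	default:
		// Unknown errors (e.g. corrupt deflate data) are treated as unrecoverable.
		return false
	}
}
```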