fix(pricing): pass request context to processor for usage extraction

CaddyGlow · CaddyGlow · commit ecbeaa749e9f · 2025-08-26T21:58:05.000+02:00
diff --git a/ccproxy/services/http/plugin_handler.py b/ccproxy/services/http/plugin_handler.py
@@ -186,6 +186,7 @@ async def _handle_regular_request(
                 headers=dict(response.headers),
                 status_code=response.status_code,
                 handler_config=handler_config,
+                request_context=request_context,
             )
 
             result = Response(
diff --git a/ccproxy/services/http/processor.py b/ccproxy/services/http/processor.py
@@ -95,6 +95,7 @@ async def process_response(
         status_code: int,
         handler_config: HandlerConfig,
         request_headers: dict[str, str] | None = None,
+        request_context: Any | None = None,
     ) -> tuple[bytes, dict[str, str]]:
         """Process response through adapters and transformers.
 
@@ -104,10 +105,15 @@ async def process_response(
             status_code: HTTP status code
             handler_config: Handler configuration
             request_headers: Original request headers for CORS processing
+            request_context: Optional request context for storing extracted data
 
         Returns:
             Tuple of (processed_body, processed_headers)
         """
+        # Extract usage from original response BEFORE format conversion
+        if request_context and status_code < 400:
+            self._extract_usage_before_conversion(body, request_context)
+
         # Apply response adapter for successful responses
         processed_body = body
         if handler_config.response_adapter and status_code < 400:
@@ -200,6 +206,69 @@ async def _apply_response_adapter(
             )
             return body
 
+    def _extract_usage_before_conversion(
+        self, body: bytes, request_context: Any
+    ) -> None:
+        """Copy token usage from an Anthropic-format response into the request context.
+
+        Args:
+            body: Response body in Anthropic format, parsed here with json.loads
+            request_context: Context whose ``metadata`` mapping receives the token counts
+        """
+        try:
+            # Parse the body as JSON and stop early when it carries no usage data
+            response_data = json.loads(body)
+            usage = response_data.get("usage", {})
+
+            if not usage:
+                return
+
+            # Anthropic-specific usage fields; absent counters default to 0
+            tokens_input = usage.get("input_tokens", 0)
+            tokens_output = usage.get("output_tokens", 0)
+            cache_read_tokens = usage.get("cache_read_input_tokens", 0)
+
+            # Handle both old and new cache creation formats: start from the flat counter
+            cache_write_tokens = usage.get("cache_creation_input_tokens", 0)
+
+            # New format has cache_creation as nested object; it replaces the flat counter
+            if "cache_creation" in usage and isinstance(usage["cache_creation"], dict):
+                cache_creation = usage["cache_creation"]
+                # Sum the 5m and 1h ephemeral tier counters
+                cache_write_tokens = cache_creation.get(
+                    "ephemeral_5m_input_tokens", 0
+                ) + cache_creation.get("ephemeral_1h_input_tokens", 0)
+
+            # Store usage only when the context exposes a metadata attribute
+            if hasattr(request_context, "metadata"):
+                request_context.metadata.update(
+                    {
+                        "tokens_input": tokens_input,
+                        "tokens_output": tokens_output,
+                        "tokens_total": tokens_input + tokens_output,
+                        "cache_read_tokens": cache_read_tokens,
+                        "cache_write_tokens": cache_write_tokens,
+                        # Note: cost calculation happens in the adapter with pricing service
+                    }
+                )
+
+                self.logger.debug(
+                    "usage_extracted_before_conversion",
+                    tokens_input=tokens_input,
+                    tokens_output=tokens_output,
+                    cache_read_tokens=cache_read_tokens,
+                    cache_write_tokens=cache_write_tokens,
+                    source="processor",
+                )
+
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            # Body is not decodable JSON: skip silently - usage extraction is non-critical
+            pass
+        except Exception as e:
+            self.logger.debug(
+                "usage_extraction_failed", error=str(e), source="processor"
+            )
+
     def _filter_internal_headers(self, headers: dict[str, str]) -> dict[str, str]:
         """Filter out internal headers that shouldn't be sent upstream.
 
diff --git a/plugins/claude_api/adapter.py b/plugins/claude_api/adapter.py
@@ -347,6 +347,10 @@ async def _execute_request(
             request_context=request_context,  # Pass the actual RequestContext object
         )
 
+        # For non-streaming responses, calculate cost based on usage already extracted in processor
+        if not is_streaming and request_context:
+            await self._calculate_cost_for_usage(request_context)
+
         # For deferred streaming responses, return directly (metrics collector already has cost calculation)
         if isinstance(response, DeferredStreaming):
             return response
@@ -355,131 +359,89 @@ async def _execute_request(
         if is_streaming and isinstance(response, StreamingResponse):
             return await self._wrap_streaming_response(response, request_context)
 
-        # For non-streaming responses, extract usage data if available
-        if not is_streaming and hasattr(response, "body"):
-            # Get response body (might be bytes or memoryview)
-            response_body = response.body
-            if isinstance(response_body, memoryview):
-                response_body = bytes(response_body)
-            await self._extract_usage_from_response(response_body, request_context)
-
         return response
 
-    async def _extract_usage_from_response(
-        self, body: bytes | str, request_context: "RequestContext"
+    async def _calculate_cost_for_usage(
+        self, request_context: "RequestContext"
     ) -> None:
-        """Extract usage data from response body and update context.
-
-        Common function used by both streaming and non-streaming responses.
+        """Compute cost_usd from token counts already stored in the context metadata.
 
         Args:
-            body: Response body (bytes or string)
-            request_context: Request context to update with usage data
+            request_context: Context whose metadata holds the token counts and receives cost_usd
         """
-        try:
-            import json
-
-            # Convert body to string if needed
-            body_str = body
-            if isinstance(body_str, bytes):
-                body_str = body_str.decode("utf-8")
-
-            # Parse response to extract usage
-            response_data = json.loads(body_str)
-            usage = response_data.get("usage", {})
+        # Read token counts from metadata; a missing counter defaults to 0
+        metadata = request_context.metadata
+        tokens_input = metadata.get("tokens_input", 0)
+        tokens_output = metadata.get("tokens_output", 0)
 
-            if not usage:
-                return
+        # Skip when both input and output counts are zero or missing
+        if not (tokens_input or tokens_output):
+            return
 
-            # Extract Claude-specific usage fields
-            tokens_input = usage.get("input_tokens", 0)
-            tokens_output = usage.get("output_tokens", 0)
-            cache_read_tokens = usage.get("cache_read_input_tokens", 0)
-            cache_write_tokens = usage.get("cache_creation_input_tokens", 0)
+        # Without a pricing service, return and leave cost_usd unset
+        pricing_service = self._get_pricing_service()
+        if not pricing_service:
+            return
 
-            # Calculate cost using pricing service if available
-            cost_usd = None
-            pricing_service = self._get_pricing_service()
-            self.logger.debug(
-                "pricing_service_check",
-                has_pricing_service=pricing_service is not None,
-                source="non_streaming",
+        try:
+            model = metadata.get("model", "claude-3-5-sonnet-20241022")
+            cache_read_tokens = metadata.get("cache_read_tokens", 0)
+            cache_write_tokens = metadata.get("cache_write_tokens", 0)
+
+            # NOTE(review): imported inside the try - if this import raises, the except clauses below reference unbound names
+            from plugins.pricing.exceptions import (
+                ModelPricingNotFoundError,
+                PricingDataNotLoadedError,
+                PricingServiceDisabledError,
             )
-            if pricing_service:
-                try:
-                    model = request_context.metadata.get(
-                        "model", "claude-3-5-sonnet-20241022"
-                    )
-                    # Import pricing exceptions
-                    from plugins.pricing.exceptions import (
-                        ModelPricingNotFoundError,
-                        PricingDataNotLoadedError,
-                        PricingServiceDisabledError,
-                    )
-
-                    cost_decimal = await pricing_service.calculate_cost(
-                        model_name=model,
-                        input_tokens=tokens_input,
-                        output_tokens=tokens_output,
-                        cache_read_tokens=cache_read_tokens,
-                        cache_write_tokens=cache_write_tokens,
-                    )
-                    cost_usd = float(cost_decimal)
-                    self.logger.debug(
-                        "cost_calculated",
-                        model=model,
-                        cost_usd=cost_usd,
-                        tokens_input=tokens_input,
-                        tokens_output=tokens_output,
-                    )
-                except ModelPricingNotFoundError as e:
-                    self.logger.warning(
-                        "model_pricing_not_found",
-                        model=model,
-                        message=str(e),
-                        tokens_input=tokens_input,
-                        tokens_output=tokens_output,
-                    )
-                except PricingDataNotLoadedError as e:
-                    self.logger.warning(
-                        "pricing_data_not_loaded",
-                        model=model,
-                        message=str(e),
-                    )
-                except PricingServiceDisabledError as e:
-                    self.logger.debug(
-                        "pricing_service_disabled",
-                        message=str(e),
-                    )
-                except Exception as e:
-                    self.logger.debug(
-                        "cost_calculation_failed", error=str(e), model=model
-                    )
 
-            # Update request context with usage data
-            request_context.metadata.update(
-                {
-                    "tokens_input": tokens_input,
-                    "tokens_output": tokens_output,
-                    "tokens_total": tokens_input + tokens_output,
-                    "cache_read_tokens": cache_read_tokens,
-                    "cache_write_tokens": cache_write_tokens,
-                    "cost_usd": cost_usd or 0.0,
-                }
+            cost_decimal = await pricing_service.calculate_cost(
+                model_name=model,
+                input_tokens=tokens_input,
+                output_tokens=tokens_output,
+                cache_read_tokens=cache_read_tokens,
+                cache_write_tokens=cache_write_tokens,
             )
+            cost_usd = float(cost_decimal)
+
+            # Store the cost as a float in the context metadata
+            metadata["cost_usd"] = cost_usd
 
             self.logger.debug(
-                "usage_extracted",
+                "cost_calculated",
+                model=model,
+                cost_usd=cost_usd,
                 tokens_input=tokens_input,
                 tokens_output=tokens_output,
                 cache_read_tokens=cache_read_tokens,
                 cache_write_tokens=cache_write_tokens,
-                cost_usd=cost_usd,
-                source="response_body",
+                source="non_streaming",
+            )
+        except ModelPricingNotFoundError as e:
+            self.logger.warning(
+                "model_pricing_not_found",
+                model=model,
+                message=str(e),
+                tokens_input=tokens_input,
+                tokens_output=tokens_output,
+            )
+        except PricingDataNotLoadedError as e:
+            self.logger.warning(
+                "pricing_data_not_loaded",
+                model=model,
+                message=str(e),
+            )
+        except PricingServiceDisabledError as e:
+            self.logger.debug(
+                "pricing_service_disabled",
+                message=str(e),
             )
-
         except Exception as e:
-            self.logger.debug("usage_extraction_failed", error=str(e))
+            self.logger.debug(
+                "cost_calculation_failed",
+                error=str(e),
+                model=metadata.get("model"),
+            )
 
     async def _wrap_streaming_response(
         self, response: StreamingResponse, request_context: "RequestContext"
@@ -638,7 +600,9 @@ async def wrapped_iterator() -> AsyncIterator[bytes]:
                                         model=model,
                                         message=str(e),
                                         tokens_input=usage_metrics.get("tokens_input"),
-                                        tokens_output=usage_metrics.get("tokens_output"),
+                                        tokens_output=usage_metrics.get(
+                                            "tokens_output"
+                                        ),
                                         category="pricing",
                                     )
                                 except PricingDataNotLoadedError as e:
diff --git a/tests/unit/plugins/test_claude_api_pricing.py b/tests/unit/plugins/test_claude_api_pricing.py
@@ -102,7 +102,7 @@ def test_get_pricing_service_with_missing_runtime(self):
     async def test_extract_usage_with_pricing(
         self, adapter_with_pricing, mock_pricing_service
     ):
-        """Test that usage extraction uses pricing service for cost calculation."""
+        """Test that cost calculation uses pricing service when available."""
         import time
 
         from ccproxy.observability.context import RequestContext
@@ -113,14 +113,19 @@ async def test_extract_usage_with_pricing(
         )
         request_context.metadata["model"] = "claude-3-5-sonnet-20241022"
 
-        # Mock response body with usage data
-        response_body = b'{"usage": {"input_tokens": 1000, "output_tokens": 500}}'
-
-        # Extract usage from response
-        await adapter_with_pricing._extract_usage_from_response(
-            response_body, request_context
+        # Simulate usage data already extracted in processor
+        request_context.metadata.update(
+            {
+                "tokens_input": 1000,
+                "tokens_output": 500,
+                "cache_read_tokens": 0,
+                "cache_write_tokens": 0,
+            }
         )
 
+        # Calculate cost with pricing service
+        await adapter_with_pricing._calculate_cost_for_usage(request_context)
+
         # Verify pricing service was called
         mock_pricing_service.calculate_cost.assert_called_once_with(
             model_name="claude-3-5-sonnet-20241022",
@@ -156,16 +161,22 @@ async def test_extract_usage_without_pricing(self):
         )
         request_context.metadata["model"] = "claude-3-5-sonnet-20241022"
 
-        # Mock response body with usage data
-        response_body = b'{"usage": {"input_tokens": 1000, "output_tokens": 500}}'
+        # Simulate usage data already extracted in processor
+        request_context.metadata.update(
+            {
+                "tokens_input": 1000,
+                "tokens_output": 500,
+                "tokens_total": 1500,
+            }
+        )
 
-        # Extract usage from response (should not fail)
-        await adapter._extract_usage_from_response(response_body, request_context)
+        # Calculate cost (should not fail even without pricing service)
+        await adapter._calculate_cost_for_usage(request_context)
 
-        # Verify tokens were extracted even without pricing
+        # Verify tokens are still in metadata
         assert request_context.metadata["tokens_input"] == 1000
         assert request_context.metadata["tokens_output"] == 500
         assert request_context.metadata["tokens_total"] == 1500
 
-        # Cost should be 0 when pricing service is not available
-        assert request_context.metadata["cost_usd"] == 0.0
+        # Cost should not be set when pricing service is not available
+        assert "cost_usd" not in request_context.metadata

Original file line number	Diff line number	Diff line change
`@@ -186,6 +186,7 @@ async def _handle_regular_request(`
`186`	`186`	`headers=dict(response.headers),`
`187`	`187`	`status_code=response.status_code,`
`188`	`188`	`handler_config=handler_config,`
	`189`	`+ request_context=request_context,`
`189`	`190`	`)`
`190`	`191`
`191`	`192`	`result = Response(`