Commit 85405c2

feat: support "CachedInputToken" type in "llmRequestCosts" (#1315)
**Description**

Many AI providers now support prompt caching on the provider side, and cached tokens are priced significantly below regular token processing; OpenAI, for example, lists cached tokens at roughly a tenth of the regular token price [1]. Envoy AI Gateway should therefore take the cached token count into account when calculating `llmRequestCosts` on an `AIGatewayRoute`. For self-hosted LLMs, cached tokens can also drastically reduce GPU usage, so users in that situation likewise want `llmRequestCosts` to reflect cached token usage.

1: https://openai.com/api/pricing/

---------

Signed-off-by: Shingo Omura <[email protected]>
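The discount this enables can be expressed in a CEL cost expression via the `cached_input_tokens` variable added in `api/v1alpha1/shared_types.go` below. As a minimal illustration of the arithmetic only — this sketch is not part of the gateway code, and the 10% cached-token rate and token counts are hypothetical:

```go
package main

import "fmt"

// effectiveCost mirrors the CEL example added by this commit: uncached input
// tokens and output tokens are billed at full rate, while cached input tokens
// are billed at a hypothetical 10% rate.
func effectiveCost(inputTokens, cachedInputTokens, outputTokens uint32) float64 {
	uncached := float64(inputTokens - cachedInputTokens)
	return uncached + float64(cachedInputTokens)*0.1 + float64(outputTokens)
}

func main() {
	// 1000 input tokens, 800 of which hit the provider-side prompt cache,
	// plus 200 output tokens: 200 + 80 + 200 = 480.
	fmt.Println(effectiveCost(1000, 800, 200))
}
```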
1 parent 90ba3e9 commit 85405c2

33 files changed (+223, -106 lines)

api/v1alpha1/ai_gateway_route.go

Lines changed: 2 additions & 0 deletions
@@ -103,6 +103,8 @@ type AIGatewayRouteSpec struct {
     //   type: OutputToken
     // - metadataKey: llm_total_token
     //   type: TotalToken
+    // - metadataKey: llm_cached_input_token
+    //   type: CachedInputToken
     // ```
     // Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
     // rate limit buckets for each unique x-user-id header value. One bucket is for the input token,

api/v1alpha1/shared_types.go

Lines changed: 5 additions & 1 deletion
@@ -80,7 +80,7 @@ type LLMRequestCost struct {
     // and it uses "output token" as the cost. The other types are "InputToken", "TotalToken",
     // and "CEL".
     //
-    // +kubebuilder:validation:Enum=OutputToken;InputToken;TotalToken;CEL
+    // +kubebuilder:validation:Enum=OutputToken;InputToken;CachedInputToken;TotalToken;CEL
     Type LLMRequestCostType `json:"type"`
     // CEL is the CEL expression to calculate the cost of the request.
     // The CEL expression must return a signed or unsigned integer. If the
@@ -91,13 +91,15 @@ type LLMRequestCost struct {
     // * model: the model name extracted from the request content. Type: string.
     // * backend: the backend name in the form of "name.namespace". Type: string.
     // * input_tokens: the number of input tokens. Type: unsigned integer.
+    // * cached_input_tokens: the number of cached input tokens. Type: unsigned integer.
     // * output_tokens: the number of output tokens. Type: unsigned integer.
     // * total_tokens: the total number of tokens. Type: unsigned integer.
     //
     // For example, the following expressions are valid:
     //
     // * "model == 'llama' ? input_tokens + output_token * 0.5 : total_tokens"
     // * "backend == 'foo.default' ? input_tokens + output_tokens : total_tokens"
+    // * "backend == 'bar.default' ? (input_tokens - cached_input_tokens) + cached_input_tokens * 0.1 + output_tokens : total_tokens"
     // * "input_tokens + output_tokens + total_tokens"
     // * "input_tokens * output_tokens"
     //
@@ -111,6 +113,8 @@ type LLMRequestCostType string
 const (
     // LLMRequestCostTypeInputToken is the cost type of the input token.
     LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
+    // LLMRequestCostTypeCachedInputToken is the cost type of the cached input token.
+    LLMRequestCostTypeCachedInputToken LLMRequestCostType = "CachedInputToken"
     // LLMRequestCostTypeOutputToken is the cost type of the output token.
     LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
     // LLMRequestCostTypeTotalToken is the cost type of the total token.
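As a usage sketch, the new enum value is referenced from Go the same way the existing cost types are, for example when building a cost list programmatically. This assumes the API package imports as `aigv1a1 "github.com/envoyproxy/ai-gateway/api/v1alpha1"` (the alias used elsewhere in this diff); the metadata keys are hypothetical examples:

```go
package main

import (
	"fmt"

	aigv1a1 "github.com/envoyproxy/ai-gateway/api/v1alpha1"
)

func main() {
	costs := []aigv1a1.LLMRequestCost{
		{MetadataKey: "llm_input_token", Type: aigv1a1.LLMRequestCostTypeInputToken},
		// New in this commit: expose the cached-input-token count under its own key.
		{MetadataKey: "llm_cached_input_token", Type: aigv1a1.LLMRequestCostTypeCachedInputToken},
	}
	for _, c := range costs {
		fmt.Printf("%s -> %s\n", c.MetadataKey, c.Type)
	}
}
```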

examples/token_ratelimit/token_ratelimit.yaml

Lines changed: 22 additions & 0 deletions
@@ -49,6 +49,8 @@ spec:
   llmRequestCosts:
   - metadataKey: llm_input_token
     type: InputToken
+  - metadataKey: llm_cached_input_token
+    type: CachedInputToken
   - metadataKey: llm_output_token
     type: OutputToken
   - metadataKey: llm_total_token
@@ -164,6 +166,26 @@ spec:
             namespace: io.envoy.ai_gateway
             key: llm_total_token
 
+    # Repeat the same configuration for a different token type.
+    # This configures the cached input token limit, and it has a different budget than others,
+    # so it will be rate limited separately.
+    - clientSelectors:
+      - headers:
+        - name: x-user-id
+          type: Distinct
+      limit:
+        requests: 100
+        unit: Hour
+      cost:
+        request:
+          from: Number
+          number: 0
+        response:
+          from: Metadata
+          metadata:
+            namespace: io.envoy.ai_gateway
+            key: llm_cached_input_token
+
     # Repeat the same configuration for a different token type.
     # This configures the token limit based on the CEL expression.
     - clientSelectors:

internal/controller/gateway.go

Lines changed: 2 additions & 0 deletions
@@ -260,6 +260,8 @@ func (c *GatewayController) reconcileFilterConfigSecret(
         switch cost.Type {
         case aigv1a1.LLMRequestCostTypeInputToken:
             fc.Type = filterapi.LLMRequestCostTypeInputToken
+        case aigv1a1.LLMRequestCostTypeCachedInputToken:
+            fc.Type = filterapi.LLMRequestCostTypeCachedInputToken
         case aigv1a1.LLMRequestCostTypeOutputToken:
             fc.Type = filterapi.LLMRequestCostTypeOutputToken
         case aigv1a1.LLMRequestCostTypeTotalToken:

internal/controller/gateway_test.go

Lines changed: 5 additions & 3 deletions
@@ -194,6 +194,7 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) {
                 {MetadataKey: "foo", Type: aigv1a1.LLMRequestCostTypeInputToken},
                 {MetadataKey: "bar", Type: aigv1a1.LLMRequestCostTypeOutputToken},
                 {MetadataKey: "baz", Type: aigv1a1.LLMRequestCostTypeTotalToken},
+                {MetadataKey: "qux", Type: aigv1a1.LLMRequestCostTypeCachedInputToken},
             },
         },
     },
@@ -267,12 +268,13 @@ func TestGatewayController_reconcileFilterConfigSecret(t *testing.T) {
         require.True(t, ok)
         var fc filterapi.Config
         require.NoError(t, yaml.Unmarshal([]byte(configStr), &fc))
-        require.Len(t, fc.LLMRequestCosts, 4)
+        require.Len(t, fc.LLMRequestCosts, 5)
         require.Equal(t, filterapi.LLMRequestCostTypeInputToken, fc.LLMRequestCosts[0].Type)
         require.Equal(t, filterapi.LLMRequestCostTypeOutputToken, fc.LLMRequestCosts[1].Type)
         require.Equal(t, filterapi.LLMRequestCostTypeTotalToken, fc.LLMRequestCosts[2].Type)
-        require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[3].Type)
-        require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[3].CEL)
+        require.Equal(t, filterapi.LLMRequestCostTypeCachedInputToken, fc.LLMRequestCosts[3].Type)
+        require.Equal(t, filterapi.LLMRequestCostTypeCEL, fc.LLMRequestCosts[4].Type)
+        require.Equal(t, `backend == 'foo.default' ? input_tokens + output_tokens : total_tokens`, fc.LLMRequestCosts[4].CEL)
         require.Len(t, fc.Models, 1)
         require.Equal(t, "mymodel", fc.Models[0].Name)

internal/extproc/chatcompletion_processor.go

Lines changed: 5 additions & 2 deletions
@@ -410,13 +410,13 @@ func (c *chatCompletionProcessorUpstreamFilter) ProcessResponseBody(ctx context.
         c.metrics.RecordTokenLatency(ctx, tokenUsage.OutputTokens, body.EndOfStream, c.requestHeaders)
         // Emit usage once at end-of-stream using final totals.
         if body.EndOfStream {
-            c.metrics.RecordTokenUsage(ctx, c.costs.InputTokens, c.costs.OutputTokens, c.requestHeaders)
+            c.metrics.RecordTokenUsage(ctx, c.costs.InputTokens, c.costs.CachedInputTokens, c.costs.OutputTokens, c.requestHeaders)
         }
         // TODO: if c.forcedStreamOptionIncludeUsage is true, we should not include usage in the response body since
         // that's what the clients would expect. However, it is a little bit tricky as we simply just reading the streaming
         // chunk by chunk, we only want to drop a specific line before the last chunk.
     } else {
-        c.metrics.RecordTokenUsage(ctx, tokenUsage.InputTokens, tokenUsage.OutputTokens, c.requestHeaders)
+        c.metrics.RecordTokenUsage(ctx, tokenUsage.InputTokens, tokenUsage.CachedInputTokens, tokenUsage.OutputTokens, c.requestHeaders)
     }
 
     if body.EndOfStream && len(c.config.requestCosts) > 0 {
@@ -536,6 +536,8 @@ func buildDynamicMetadata(config *processorConfig, costs *translator.LLMTokenUsa
         switch rc.Type {
         case filterapi.LLMRequestCostTypeInputToken:
             cost = costs.InputTokens
+        case filterapi.LLMRequestCostTypeCachedInputToken:
+            cost = costs.CachedInputTokens
         case filterapi.LLMRequestCostTypeOutputToken:
             cost = costs.OutputTokens
         case filterapi.LLMRequestCostTypeTotalToken:
@@ -546,6 +548,7 @@ func buildDynamicMetadata(config *processorConfig, costs *translator.LLMTokenUsa
             requestHeaders[internalapi.ModelNameHeaderKeyDefault],
             backendName,
             costs.InputTokens,
+            costs.CachedInputTokens,
             costs.OutputTokens,
             costs.TotalTokens,
         )

internal/extproc/chatcompletion_processor_test.go

Lines changed: 6 additions & 2 deletions
@@ -259,7 +259,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T
         mt := &mockTranslator{
             t: t, expResponseBody: inBody,
             retBodyMutation: expBodyMut, retHeaderMutation: expHeadMut,
-            retUsedToken: translator.LLMTokenUsage{OutputTokens: 123, InputTokens: 1},
+            retUsedToken: translator.LLMTokenUsage{OutputTokens: 123, InputTokens: 1, CachedInputTokens: 1},
         }
 
         celProgInt, err := llmcostcel.NewProgram("54321")
@@ -275,6 +275,7 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T
             requestCosts: []processorConfigRequestCost{
                 {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeOutputToken, MetadataKey: "output_token_usage"}},
                 {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeInputToken, MetadataKey: "input_token_usage"}},
+                {LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCachedInputToken, MetadataKey: "cached_input_token_usage"}},
                 {
                     celProg: celProgInt,
                     LLMRequestCost: &filterapi.LLMRequestCost{Type: filterapi.LLMRequestCostTypeCEL, MetadataKey: "cel_int"},
@@ -304,6 +305,8 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T
             GetStructValue().Fields["output_token_usage"].GetNumberValue())
         require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace].
             GetStructValue().Fields["input_token_usage"].GetNumberValue())
+        require.Equal(t, float64(1), md.Fields[internalapi.AIGatewayFilterMetadataNamespace].
+            GetStructValue().Fields["cached_input_token_usage"].GetNumberValue())
         require.Equal(t, float64(54321), md.Fields[internalapi.AIGatewayFilterMetadataNamespace].
             GetStructValue().Fields["cel_int"].GetNumberValue())
         require.Equal(t, float64(9999), md.Fields[internalapi.AIGatewayFilterMetadataNamespace].
@@ -356,11 +359,12 @@ func Test_chatCompletionProcessorUpstreamFilter_ProcessResponseBody(t *testing.T
         // Final chunk should mark success and record usage once.
         final := &extprocv3.HttpBody{Body: []byte("chunk-final"), EndOfStream: true}
         mt.expResponseBody = final
-        mt.retUsedToken = translator.LLMTokenUsage{InputTokens: 5, OutputTokens: 138, TotalTokens: 143}
+        mt.retUsedToken = translator.LLMTokenUsage{InputTokens: 5, CachedInputTokens: 3, OutputTokens: 138, TotalTokens: 143}
         _, err = p.ProcessResponseBody(t.Context(), final)
         require.NoError(t, err)
         mm.RequireRequestSuccess(t)
         require.Equal(t, 143, mm.tokenUsageCount)       // 5 input + 138 output
+        require.Equal(t, 3, mm.cachedInputCount)        // cached input tokens
         require.Equal(t, 138, mm.streamingOutputTokens) // accumulated output tokens from stream
     })
 }

internal/extproc/messages_processor.go

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ func (c *messagesProcessorUpstreamFilter) ProcessResponseBody(ctx context.Contex
     c.costs.TotalTokens += tokenUsage.TotalTokens
 
     // Update metrics with token usage.
-    c.metrics.RecordTokenUsage(ctx, tokenUsage.InputTokens, tokenUsage.OutputTokens, c.requestHeaders)
+    c.metrics.RecordTokenUsage(ctx, tokenUsage.InputTokens, tokenUsage.CachedInputTokens, tokenUsage.OutputTokens, c.requestHeaders)
     if c.stream {
         c.metrics.RecordTokenLatency(ctx, tokenUsage.OutputTokens, body.EndOfStream, c.requestHeaders)
     }

internal/extproc/mocks_test.go

Lines changed: 3 additions & 1 deletion
@@ -172,6 +172,7 @@ type mockChatCompletionMetrics struct {
     backend               string
     requestSuccessCount   int
     requestErrorCount     int
+    cachedInputCount      int
     tokenUsageCount       int
     // streamingOutputTokens tracks the cumulative output tokens recorded via RecordTokenLatency.
     streamingOutputTokens int
@@ -201,8 +202,9 @@ func (m *mockChatCompletionMetrics) SetResponseModel(responseModel internalapi.R
 func (m *mockChatCompletionMetrics) SetBackend(backend *filterapi.Backend) { m.backend = backend.Name }
 
 // RecordTokenUsage implements [metrics.ChatCompletion].
-func (m *mockChatCompletionMetrics) RecordTokenUsage(_ context.Context, input, output uint32, _ map[string]string) {
+func (m *mockChatCompletionMetrics) RecordTokenUsage(_ context.Context, input, cachedInput, output uint32, _ map[string]string) {
     m.tokenUsageCount += int(input + output)
+    m.cachedInputCount += int(cachedInput)
 }
 
 // RecordTokenLatency implements [metrics.ChatCompletion].

internal/extproc/server_test.go

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ func TestServer_LoadConfig(t *testing.T) {
     require.Equal(t, "1 + 1", s.config.requestCosts[1].CEL)
     prog := s.config.requestCosts[1].celProg
     require.NotNil(t, prog)
-    val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1)
+    val, err := llmcostcel.EvaluateProgram(prog, "", "", 1, 1, 1, 1)
     require.NoError(t, err)
     require.Equal(t, uint64(2), val)
     require.Equal(t, config.Models, s.config.declaredModels)
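The extra `1` passed to `EvaluateProgram` corresponds to the new `cached_input_tokens` CEL variable. Below is a minimal sketch of compiling and evaluating a cached-token-aware expression with this package, as it might live inside the repository itself (the `internal/llmcostcel` import path cannot be used from outside the module, and the argument order — model, backend, input, cached input, output, total tokens — is inferred from the calls in this diff rather than confirmed documentation):

```go
package main

import (
	"fmt"

	"github.com/envoyproxy/ai-gateway/internal/llmcostcel"
)

func main() {
	// Count only the uncached portion of the input tokens plus the output tokens.
	prog, err := llmcostcel.NewProgram("input_tokens - cached_input_tokens + output_tokens")
	if err != nil {
		panic(err)
	}
	// 1000 input tokens (800 of them cached) and 200 output tokens:
	// the expression works out to 1000 - 800 + 200 = 400.
	val, err := llmcostcel.EvaluateProgram(prog, "llama", "foo.default", 1000, 800, 200, 1200)
	if err != nil {
		panic(err)
	}
	fmt.Println(val)
}
```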
