15 changes: 9 additions & 6 deletions config/config.yaml
@@ -56,8 +56,9 @@ model_config:
batch_size: 512.0 # vLLM default batch size
context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
pricing:
prompt_usd_per_1m: 200.0
completion_usd_per_1m: 600.0
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -68,8 +69,9 @@ model_config:
batch_size: 512.0
context_size: 16384.0
pricing:
prompt_usd_per_1m: 500.0
completion_usd_per_1m: 1500.0
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
@@ -79,8 +81,9 @@ model_config:
batch_size: 512.0
context_size: 16384.0
pricing:
prompt_usd_per_1m: 300.0
completion_usd_per_1m: 900.0
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
pii_policy:
allow_by_default: false # Deny all PII by default
pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
25 changes: 16 additions & 9 deletions src/semantic-router/pkg/config/config.go
@@ -193,10 +193,12 @@ type VLLMEndpoint struct {

// ModelParams represents configuration for model-specific parameters
type ModelPricing struct {
// Price in USD per 1M prompt tokens
PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"`
// Price in USD per 1M completion tokens
CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"`
// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
Currency string `yaml:"currency,omitempty"`

// Price per 1M tokens (unit: <currency>/1_000_000 tokens)
PromptPer1M float64 `yaml:"prompt_per_1m,omitempty"`
CompletionPer1M float64 `yaml:"completion_per_1m,omitempty"`
}

type ModelParams struct {
@@ -374,15 +376,20 @@ func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float6
return defaultValue
}

// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion.
func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) {
// GetModelPricing returns pricing per 1M tokens and its currency for the given model.
// The currency indicates the unit of the returned rates (e.g., "USD").
func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
if modelConfig, okc := c.ModelConfig[modelName]; okc {
p := modelConfig.Pricing
if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 {
return p.PromptUSDPer1M, p.CompletionUSDPer1M, true
if p.PromptPer1M != 0 || p.CompletionPer1M != 0 {
cur := p.Currency
if cur == "" {
cur = "USD"
}
return p.PromptPer1M, p.CompletionPer1M, cur, true
}
}
return 0, 0, false
return 0, 0, "", false
}

// GetModelPIIPolicy returns the PII policy for a given model
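To illustrate the default-currency behavior of the new `GetModelPricing` signature, here is a minimal test sketch. It assumes `ModelConfig` is declared as `map[string]ModelParams` and that `ModelParams` exposes the `Pricing` field used above; adjust the construction to the actual types in `config.go`.

```go
package config

import "testing"

// Sketch: pricing configured without an explicit currency should report "USD".
func TestGetModelPricingDefaultsToUSD(t *testing.T) {
	cfg := &RouterConfig{
		// Assumption: ModelConfig is a map[string]ModelParams keyed by model name.
		ModelConfig: map[string]ModelParams{
			"phi4": {Pricing: ModelPricing{PromptPer1M: 0.07, CompletionPer1M: 0.35}},
		},
	}
	prompt, completion, currency, ok := cfg.GetModelPricing("phi4")
	if !ok {
		t.Fatal("expected pricing to be configured for phi4")
	}
	if currency != "USD" {
		t.Fatalf("expected default currency USD, got %q", currency)
	}
	if prompt != 0.07 || completion != 0.35 {
		t.Fatalf("unexpected rates: prompt=%v completion=%v", prompt, completion)
	}
}
```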
15 changes: 10 additions & 5 deletions src/semantic-router/pkg/extproc/response_handler.go
@@ -56,18 +56,22 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response

// Compute and record cost if pricing is configured
if r.Config != nil {
promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel)
promptRatePer1M, completionRatePer1M, currency, ok := r.Config.GetModelPricing(ctx.RequestModel)
if ok {
costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
metrics.RecordModelCostUSD(ctx.RequestModel, costUSD)
costAmount := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0
if currency == "" {
currency = "USD"
}
metrics.RecordModelCost(ctx.RequestModel, currency, costAmount)
observability.LogEvent("llm_usage", map[string]interface{}{
"request_id": ctx.RequestID,
"model": ctx.RequestModel,
"prompt_tokens": promptTokens,
"completion_tokens": completionTokens,
"total_tokens": promptTokens + completionTokens,
"completion_latency_ms": completionLatency.Milliseconds(),
"cost_usd": costUSD,
"cost": costAmount,
"currency": currency,
})
} else {
observability.LogEvent("llm_usage", map[string]interface{}{
@@ -77,7 +81,8 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response
"completion_tokens": completionTokens,
"total_tokens": promptTokens + completionTokens,
"completion_latency_ms": completionLatency.Milliseconds(),
"cost_usd": 0.0,
"cost": 0.0,
"currency": "unknown",
"pricing": "not_configured",
})
}
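For reference, a priced request now logs the cost together with its currency instead of a hard-coded `cost_usd` field. Rendered as JSON, the field map passed to `observability.LogEvent` would look roughly like this (values illustrative; the actual encoding depends on the observability package):

```json
{
  "request_id": "req-123",
  "model": "phi4",
  "prompt_tokens": 1000,
  "completion_tokens": 500,
  "total_tokens": 1500,
  "completion_latency_ms": 420,
  "cost": 0.000245,
  "currency": "USD"
}
```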
21 changes: 12 additions & 9 deletions src/semantic-router/pkg/metrics/metrics.go
@@ -102,13 +102,13 @@ var (
[]string{"model"},
)

// ModelCostUSD tracks the total USD cost attributed to each model
ModelCostUSD = promauto.NewCounterVec(
// ModelCost tracks the total cost attributed to each model by currency
ModelCost = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "llm_model_cost_usd_total",
Help: "The total USD cost attributed to each LLM model",
Name: "llm_model_cost_total",
Help: "The total cost attributed to each LLM model, labeled by currency",
},
[]string{"model"},
[]string{"model", "currency"},
)

// ModelTokens tracks the number of tokens used by each model
@@ -256,12 +256,15 @@ func RecordModelTokens(model string, tokens float64) {
ModelTokens.WithLabelValues(model).Add(tokens)
}

// RecordModelCostUSD adds the dollar cost attributed to a specific model
func RecordModelCostUSD(model string, usd float64) {
if usd < 0 {
// RecordModelCost records the cost attributed to a specific model with a currency label
func RecordModelCost(model string, currency string, amount float64) {
if amount < 0 {
return
}
ModelCostUSD.WithLabelValues(model).Add(usd)
if currency == "" {
currency = "USD"
}
ModelCost.WithLabelValues(model, currency).Add(amount)
}

// RecordRoutingReasonCode increments the counter for a routing decision reason code and model
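With the new `currency` label, the renamed counter appears on the `/metrics` endpoint along these lines (sample value only, computed from the phi4 rates in this PR for a 1,000-prompt/500-completion-token request):

```text
# HELP llm_model_cost_total The total cost attributed to each LLM model, labeled by currency
# TYPE llm_model_cost_total counter
llm_model_cost_total{currency="USD",model="phi4"} 0.000245
```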
29 changes: 18 additions & 11 deletions website/docs/api/router.md
@@ -222,10 +222,11 @@ sum by (family, effort) (

The router exposes additional metrics for cost accounting and routing decisions.

- `llm_model_cost_usd_total{model}`
- Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing).
- `llm_model_cost_total{model, currency}`
- Description: Total accumulated cost attributed to each model (computed from token usage and per-1M pricing), labeled by currency.
- Labels:
- model: model name used for the request
- currency: currency code (e.g., "USD")

- `llm_routing_reason_codes_total{reason_code, model}`
- Description: Count of routing decisions by reason code and selected model.
@@ -236,8 +237,11 @@ The router exposes additional metrics for cost accounting and routing decisions.
Example PromQL:

```prometheus
# Cost by model over the last hour
sum by (model) (increase(llm_model_cost_usd_total[1h]))
# Cost by model and currency over the last hour
sum by (model, currency) (increase(llm_model_cost_total[1h]))

# Or, if you only use USD, a common query is:
sum by (model) (increase(llm_model_cost_total{currency="USD"}[1h]))

# Routing decisions by reason code over the last 15 minutes
sum by (reason_code) (increase(llm_routing_reason_codes_total[15m]))
@@ -251,21 +255,24 @@ Provide per-1M pricing for your models so the router can compute request cost an
model_config:
phi4:
pricing:
prompt_usd_per_1m: 200.0
completion_usd_per_1m: 600.0
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
"mistral-small3.1":
pricing:
prompt_usd_per_1m: 300.0
completion_usd_per_1m: 900.0
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
gemma3:27b:
pricing:
prompt_usd_per_1m: 500.0
completion_usd_per_1m: 1500.0
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
```

Notes:
- Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted.
- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000.
- Cost is computed as: (prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000 (in the configured currency).

## gRPC ExtProc API

19 changes: 11 additions & 8 deletions website/docs/getting-started/configuration.md
@@ -126,25 +126,28 @@ model_config:

### Pricing (Optional)

If you want the router to compute USD cost per request and expose Prometheus cost metrics, add per-1M token pricing under each model in `model_config`.
If you want the router to compute request cost and expose Prometheus cost metrics, add per-1M token pricing and currency under each model in `model_config`.

```yaml
model_config:
phi4:
pricing:
prompt_usd_per_1m: 0.07
completion_usd_per_1m: 0.35
currency: USD
prompt_per_1m: 0.07
completion_per_1m: 0.35
"mistral-small3.1":
pricing:
prompt_usd_per_1m: 0.1
completion_usd_per_1m: 0.3
currency: USD
prompt_per_1m: 0.1
completion_per_1m: 0.3
gemma3:27b:
pricing:
prompt_usd_per_1m: 0.067
completion_usd_per_1m: 0.267
currency: USD
prompt_per_1m: 0.067
completion_per_1m: 0.267
```

- Cost formula: `(prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000`.
- Cost formula: `(prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000` (in the given currency).
- When not configured, the router still reports token and latency metrics; cost is treated as 0.
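- For example, with phi4's rates above, a request that consumes 1,000 prompt tokens and 500 completion tokens costs `(1000 * 0.07 + 500 * 0.35) / 1_000_000 = 0.000245` USD.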

### Classification Models