diff --git a/config/config.yaml b/config/config.yaml index 7db18883..5ad0dbd5 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,258 +1,266 @@ -bert_model: - model_id: sentence-transformers/all-MiniLM-L12-v2 - threshold: 0.6 - use_cpu: true -semantic_cache: - enabled: true - similarity_threshold: 0.8 - max_entries: 1000 - ttl_seconds: 3600 -tools: - enabled: true # Set to true to enable automatic tool selection - top_k: 3 # Number of most relevant tools to select - similarity_threshold: 0.2 # Threshold for tool similarity - tools_db_path: "config/tools_db.json" - fallback_to_empty: true # If true, return no tools on failure; if false, return error -prompt_guard: - enabled: true - use_modernbert: true - model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 - use_cpu: true - jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" -gpu_config: - flops: 312000000000000 # 312e12 fp16 - hbm: 2000000000000 # 2e12 (2 TB/s) - description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf - -# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models -vllm_endpoints: - - name: "endpoint1" - address: "127.0.0.1" - port: 11434 - models: - - "phi4" - - "gemma3:27b" - weight: 1 # Load balancing weight - health_check_path: "/health" # Optional health check endpoint - - name: "endpoint2" - address: "127.0.0.1" - port: 11434 - models: - - "mistral-small3.1" - weight: 1 - health_check_path: "/health" - - name: "endpoint3" - address: "127.0.0.1" - port: 11434 - models: - - "phi4" # Same model can be served by multiple endpoints for redundancy - - "mistral-small3.1" - weight: 2 # Higher weight for more powerful endpoint - -model_config: - phi4: - param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 - batch_size: 512.0 # vLLM default batch size - context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) - preferred_endpoints: ["endpoint1", "endpoint3"] - gemma3:27b: - param_count: 27000000000 # 27B parameters (base version) - batch_size: 512.0 - context_size: 16384.0 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - preferred_endpoints: ["endpoint1"] - "mistral-small3.1": - param_count: 22000000000 - batch_size: 512.0 - context_size: 16384.0 - pii_policy: - allow_by_default: false # Deny all PII by default - pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types - preferred_endpoints: ["endpoint2", "endpoint3"] - -# Classifier configuration for text classification -classifier: - category_model: - model_id: "models/category_classifier_modernbert-base_model" #TODO: Use local model for now before the code can download the entire model from huggingface - use_modernbert: true - threshold: 0.6 - use_cpu: true - category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" - pii_model: - model_id: "models/pii_classifier_modernbert-base_presidio_token_model" #TODO: Use local 
model for now before the code can download the entire model from huggingface - use_modernbert: true - threshold: 0.7 - use_cpu: true - pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" - load_aware: false -categories: -- name: business - use_reasoning: false - reasoning_description: "Business content is typically conversational" - reasoning_effort: low # Business conversations need low reasoning effort - model_scores: - - model: phi4 - score: 0.8 - - model: gemma3:27b - score: 0.4 - - model: mistral-small3.1 - score: 0.2 -- name: law - use_reasoning: false - reasoning_description: "Legal content is typically explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.4 -- name: psychology - use_reasoning: false - reasoning_description: "Psychology content is usually explanatory" - model_scores: - - model: mistral-small3.1 - score: 0.6 - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 -- name: biology - use_reasoning: true - reasoning_description: "Biological processes benefit from structured analysis" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.2 -- name: chemistry - use_reasoning: true - reasoning_description: "Chemical reactions and formulas require systematic thinking" - reasoning_effort: high # Chemistry requires high reasoning effort - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 - - model: phi4 - score: 0.6 -- name: history - use_reasoning: false - reasoning_description: "Historical content is narrative-based" - model_scores: - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.4 -- name: other - use_reasoning: false - reasoning_description: "General content doesn't require reasoning" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.6 - - model: mistral-small3.1 - score: 0.6 -- name: health - use_reasoning: false - reasoning_description: "Health information is typically informational" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: phi4 - score: 0.8 - - model: mistral-small3.1 - score: 0.6 -- name: economics - use_reasoning: false - reasoning_description: "Economic discussions are usually explanatory" - model_scores: - - model: gemma3:27b - score: 0.8 - - model: mistral-small3.1 - score: 0.8 - - model: phi4 - score: 0.0 -- name: math - use_reasoning: true - reasoning_description: "Mathematical problems require step-by-step reasoning" - reasoning_effort: high # Math problems need high reasoning effort - model_scores: - - model: phi4 - score: 1.0 - - model: mistral-small3.1 - score: 0.8 - - model: gemma3:27b - score: 0.6 -- name: physics - use_reasoning: true - reasoning_description: "Physics concepts need logical analysis" - model_scores: - - model: gemma3:27b - score: 0.4 - - model: phi4 - score: 0.4 - - model: mistral-small3.1 - score: 0.4 -- name: computer science - use_reasoning: true - reasoning_description: "Programming and algorithms need logical reasoning" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.0 -- name: philosophy - use_reasoning: false - reasoning_description: "Philosophical discussions are conversational" - model_scores: - - model: phi4 - score: 0.6 - - model: gemma3:27b - score: 0.2 - - model: mistral-small3.1 - score: 0.2 -- name: engineering - 
use_reasoning: true - reasoning_description: "Engineering problems require systematic problem-solving" - model_scores: - - model: gemma3:27b - score: 0.6 - - model: mistral-small3.1 - score: 0.6 - - model: phi4 - score: 0.2 -default_model: mistral-small3.1 -default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) - -# API Configuration -api: - batch_classification: - max_batch_size: 100 # Maximum number of texts in a single batch - concurrency_threshold: 5 # Switch to concurrent processing when batch size > this value - max_concurrency: 8 # Maximum number of concurrent goroutines - - # Metrics configuration for monitoring batch classification performance - metrics: - enabled: true # Enable comprehensive metrics collection - detailed_goroutine_tracking: true # Track individual goroutine lifecycle - high_resolution_timing: false # Use nanosecond precision timing - sample_rate: 1.0 # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%) - - # Histogram buckets for metrics (directly configure what you need) - duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] - size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] +bert_model: + model_id: sentence-transformers/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true +semantic_cache: + enabled: true + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 +tools: + enabled: true # Set to true to enable automatic tool selection + top_k: 3 # Number of most relevant tools to select + similarity_threshold: 0.2 # Threshold for tool similarity + tools_db_path: "config/tools_db.json" + fallback_to_empty: true # If true, return no tools on failure; if false, return error +prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" +gpu_config: + flops: 312000000000000 # 312e12 fp16 + hbm: 2000000000000 # 2e12 (2 TB/s) + description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf + +# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" + port: 11434 + models: + - "phi4" + - "gemma3:27b" + weight: 1 # Load balancing weight + health_check_path: "/health" # Optional health check endpoint + - name: "endpoint2" + address: "127.0.0.1" + port: 11434 + models: + - "mistral-small3.1" + weight: 1 + health_check_path: "/health" + - name: "endpoint3" + address: "127.0.0.1" + port: 11434 + models: + - "phi4" # Same model can be served by multiple endpoints for redundancy + - "mistral-small3.1" + weight: 2 # Higher weight for more powerful endpoint + +model_config: + phi4: + param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 + batch_size: 512.0 # vLLM default batch size + context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 + pricing: + prompt_usd_per_1m: 200.0 + completion_usd_per_1m: 600.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) + preferred_endpoints: ["endpoint1", "endpoint3"] + gemma3:27b: + param_count: 27000000000 # 27B 
parameters (base version) + batch_size: 512.0 + context_size: 16384.0 + pricing: + prompt_usd_per_1m: 500.0 + completion_usd_per_1m: 1500.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + preferred_endpoints: ["endpoint1"] + "mistral-small3.1": + param_count: 22000000000 + batch_size: 512.0 + context_size: 16384.0 + pricing: + prompt_usd_per_1m: 300.0 + completion_usd_per_1m: 900.0 + pii_policy: + allow_by_default: false # Deny all PII by default + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types + preferred_endpoints: ["endpoint2", "endpoint3"] + +# Classifier configuration for text classification +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" #TODO: Use local model for now before the code can download the entire model from huggingface + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" #TODO: Use local model for now before the code can download the entire model from huggingface + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + load_aware: false +categories: +- name: business + use_reasoning: false + reasoning_description: "Business content is typically conversational" + reasoning_effort: low # Business conversations need low reasoning effort + model_scores: + - model: phi4 + score: 0.8 + - model: gemma3:27b + score: 0.4 + - model: mistral-small3.1 + score: 0.2 +- name: law + use_reasoning: false + reasoning_description: "Legal content is typically explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.4 +- name: psychology + use_reasoning: false + reasoning_description: "Psychology content is usually explanatory" + model_scores: + - model: mistral-small3.1 + score: 0.6 + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 +- name: biology + use_reasoning: true + reasoning_description: "Biological processes benefit from structured analysis" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.2 +- name: chemistry + use_reasoning: true + reasoning_description: "Chemical reactions and formulas require systematic thinking" + reasoning_effort: high # Chemistry requires high reasoning effort + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 + - model: phi4 + score: 0.6 +- name: history + use_reasoning: false + reasoning_description: "Historical content is narrative-based" + model_scores: + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.4 +- name: other + use_reasoning: false + reasoning_description: "General content doesn't require reasoning" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.6 + - model: mistral-small3.1 + score: 0.6 +- name: health + use_reasoning: false + reasoning_description: "Health information is typically informational" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: phi4 + score: 0.8 + - model: mistral-small3.1 + score: 0.6 +- name: economics + use_reasoning: 
false + reasoning_description: "Economic discussions are usually explanatory" + model_scores: + - model: gemma3:27b + score: 0.8 + - model: mistral-small3.1 + score: 0.8 + - model: phi4 + score: 0.0 +- name: math + use_reasoning: true + reasoning_description: "Mathematical problems require step-by-step reasoning" + reasoning_effort: high # Math problems need high reasoning effort + model_scores: + - model: phi4 + score: 1.0 + - model: mistral-small3.1 + score: 0.8 + - model: gemma3:27b + score: 0.6 +- name: physics + use_reasoning: true + reasoning_description: "Physics concepts need logical analysis" + model_scores: + - model: gemma3:27b + score: 0.4 + - model: phi4 + score: 0.4 + - model: mistral-small3.1 + score: 0.4 +- name: computer science + use_reasoning: true + reasoning_description: "Programming and algorithms need logical reasoning" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.0 +- name: philosophy + use_reasoning: false + reasoning_description: "Philosophical discussions are conversational" + model_scores: + - model: phi4 + score: 0.6 + - model: gemma3:27b + score: 0.2 + - model: mistral-small3.1 + score: 0.2 +- name: engineering + use_reasoning: true + reasoning_description: "Engineering problems require systematic problem-solving" + model_scores: + - model: gemma3:27b + score: 0.6 + - model: mistral-small3.1 + score: 0.6 + - model: phi4 + score: 0.2 +default_model: mistral-small3.1 +default_reasoning_effort: medium # Default reasoning effort level (low, medium, high) + +# API Configuration +api: + batch_classification: + max_batch_size: 100 # Maximum number of texts in a single batch + concurrency_threshold: 5 # Switch to concurrent processing when batch size > this value + max_concurrency: 8 # Maximum number of concurrent goroutines + + # Metrics configuration for monitoring batch classification performance + metrics: + enabled: true # Enable comprehensive metrics collection + detailed_goroutine_tracking: true # Track individual goroutine lifecycle + high_resolution_timing: false # Use nanosecond precision timing + sample_rate: 1.0 # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%) + # Histogram buckets for metrics (directly configure what you need) + duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 4a8aa580..cc0fb15c 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -192,6 +192,13 @@ type VLLMEndpoint struct { } // ModelParams represents configuration for model-specific parameters +type ModelPricing struct { + // Price in USD per 1M prompt tokens + PromptUSDPer1M float64 `yaml:"prompt_usd_per_1m,omitempty"` + // Price in USD per 1M completion tokens + CompletionUSDPer1M float64 `yaml:"completion_usd_per_1m,omitempty"` +} + type ModelParams struct { // Number of parameters in the model ParamCount float64 `yaml:"param_count"` @@ -207,6 +214,9 @@ type ModelParams struct { // Preferred endpoints for this model (optional) PreferredEndpoints []string `yaml:"preferred_endpoints,omitempty"` + + // Optional pricing used for cost computation + Pricing ModelPricing `yaml:"pricing,omitempty"` } // PIIPolicy represents the PII (Personally Identifiable Information) policy for a model @@ -364,6 +374,17 @@ func (c *RouterConfig) GetModelContextSize(modelName string, 
defaultValue float6 return defaultValue } +// GetModelPricing returns pricing in USD per 1M tokens for prompt and completion. +func (c *RouterConfig) GetModelPricing(modelName string) (promptUSDPer1M float64, completionUSDPer1M float64, ok bool) { + if modelConfig, okc := c.ModelConfig[modelName]; okc { + p := modelConfig.Pricing + if p.PromptUSDPer1M != 0 || p.CompletionUSDPer1M != 0 { + return p.PromptUSDPer1M, p.CompletionUSDPer1M, true + } + } + return 0, 0, false +} + // GetModelPIIPolicy returns the PII policy for a given model // If the model is not found in the config, returns a default policy that allows all PII func (c *RouterConfig) GetModelPIIPolicy(modelName string) PIIPolicy { diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 31db6d7a..0bfa4431 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -14,6 +14,7 @@ import ( "github.com/vllm-project/semantic-router/semantic-router/pkg/cache" "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/semantic-router/pkg/observability" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/http" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii" ) @@ -173,7 +174,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo userContent, nonUserMessages := extractUserAndNonUserContent(openAIRequest) // Perform security checks - if response, shouldReturn := r.performSecurityChecks(userContent, nonUserMessages); shouldReturn { + if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn { return response, nil } @@ -187,7 +188,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } // performSecurityChecks performs PII and jailbreak detection -func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { +func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { // Perform PII classification on all message content allContent := pii.ExtractAllContent(userContent, nonUserMessages) @@ -212,6 +213,13 @@ func (r *OpenAIRouter) performSecurityChecks(userContent string, nonUserMessages log.Printf("JAILBREAK ATTEMPT BLOCKED: %s (confidence: %.3f)", jailbreakType, confidence) // Return immediate jailbreak violation response + // Structured log for security block + observability.LogEvent("security_block", map[string]interface{}{ + "reason_code": "jailbreak_detected", + "jailbreak_type": jailbreakType, + "confidence": confidence, + "request_id": ctx.RequestID, + }) jailbreakResponse := http.CreateJailbreakViolationResponse(jailbreakType, confidence) return jailbreakResponse, true } else { @@ -241,6 +249,13 @@ func (r *OpenAIRouter) handleCaching(ctx *RequestContext) (*ext_proc.ProcessingR if err != nil { log.Printf("Error searching cache: %v", err) } else if found { + // Record and log cache hit + metrics.RecordCacheHit() + observability.LogEvent("cache_hit", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": requestModel, + "query": requestQuery, + }) // Return immediate response from cache response := http.CreateCacheHitResponse(cachedResponse) return response, true @@ -313,6 +328,8 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest 
*openai.ChatCompletionNe // Select the best allowed model from this category matchedModel = r.Classifier.SelectBestModelFromList(allowedModels, categoryName) log.Printf("Selected alternative model %s that passes PII policy", matchedModel) + // Record reason code for selecting alternative due to PII + metrics.RecordRoutingReasonCode("pii_policy_alternative_selected", matchedModel) } else { log.Printf("No models in category %s pass PII policy, using default", categoryName) matchedModel = r.Config.DefaultModel @@ -320,12 +337,24 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe defaultAllowed, defaultDeniedPII, _ := r.PIIChecker.CheckPolicy(matchedModel, detectedPII) if !defaultAllowed { log.Printf("Default model also violates PII policy, returning error") + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied_default_model", + "request_id": ctx.RequestID, + "model": matchedModel, + "denied_pii": defaultDeniedPII, + }) piiResponse := http.CreatePIIViolationResponse(matchedModel, defaultDeniedPII) return piiResponse, nil } } } else { log.Printf("Could not determine category, returning PII violation for model %s", matchedModel) + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied", + "request_id": ctx.RequestID, + "model": matchedModel, + "denied_pii": deniedPII, + }) piiResponse := http.CreatePIIViolationResponse(matchedModel, deniedPII) return piiResponse, nil } @@ -424,6 +453,20 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe } log.Printf("Use new model: %s", matchedModel) + + // Structured log for routing decision (auto) + observability.LogEvent("routing_decision", map[string]interface{}{ + "reason_code": "auto_routing", + "request_id": ctx.RequestID, + "original_model": originalModel, + "selected_model": matchedModel, + "category": categoryName, + "reasoning_enabled": useReasoning, + "reasoning_effort": effortForMetrics, + "selected_endpoint": selectedEndpoint, + "routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(), + }) + metrics.RecordRoutingReasonCode("auto_routing", matchedModel) } } } else if originalModel != "auto" { @@ -438,6 +481,12 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe // Continue with request on error } else if !allowed { log.Printf("Model %s violates PII policy, returning error", originalModel) + observability.LogEvent("routing_block", map[string]interface{}{ + "reason_code": "pii_policy_denied", + "request_id": ctx.RequestID, + "model": originalModel, + "denied_pii": deniedPII, + }) piiResponse := http.CreatePIIViolationResponse(originalModel, deniedPII) return piiResponse, nil } @@ -472,6 +521,19 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe }, }, } + // Structured log for routing decision (explicit model) + observability.LogEvent("routing_decision", map[string]interface{}{ + "reason_code": "model_specified", + "request_id": ctx.RequestID, + "original_model": originalModel, + "selected_model": originalModel, + "category": "", + "reasoning_enabled": false, + "reasoning_effort": "", + "selected_endpoint": selectedEndpoint, + "routing_latency_ms": time.Since(ctx.ProcessingStartTime).Milliseconds(), + }) + metrics.RecordRoutingReasonCode("model_specified", originalModel) } // Save the actual model that will be used for token tracking diff --git a/src/semantic-router/pkg/extproc/response_handler.go 
b/src/semantic-router/pkg/extproc/response_handler.go index 7015acc6..1fdbe7bf 100644 --- a/src/semantic-router/pkg/extproc/response_handler.go +++ b/src/semantic-router/pkg/extproc/response_handler.go @@ -9,6 +9,7 @@ import ( "github.com/openai/openai-go" "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics" + "github.com/vllm-project/semantic-router/semantic-router/pkg/observability" ) // handleResponseHeaders processes the response headers @@ -52,6 +53,35 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response ) metrics.RecordModelCompletionLatency(ctx.RequestModel, completionLatency.Seconds()) r.Classifier.DecrementModelLoad(ctx.RequestModel) + + // Compute and record cost if pricing is configured + if r.Config != nil { + promptRatePer1M, completionRatePer1M, ok := r.Config.GetModelPricing(ctx.RequestModel) + if ok { + costUSD := (float64(promptTokens)*promptRatePer1M + float64(completionTokens)*completionRatePer1M) / 1_000_000.0 + metrics.RecordModelCostUSD(ctx.RequestModel, costUSD) + observability.LogEvent("llm_usage", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": ctx.RequestModel, + "prompt_tokens": promptTokens, + "completion_tokens": completionTokens, + "total_tokens": promptTokens + completionTokens, + "completion_latency_ms": completionLatency.Milliseconds(), + "cost_usd": costUSD, + }) + } else { + observability.LogEvent("llm_usage", map[string]interface{}{ + "request_id": ctx.RequestID, + "model": ctx.RequestModel, + "prompt_tokens": promptTokens, + "completion_tokens": completionTokens, + "total_tokens": promptTokens + completionTokens, + "completion_latency_ms": completionLatency.Milliseconds(), + "cost_usd": 0.0, + "pricing": "not_configured", + }) + } + } } // Check if this request has a pending cache entry diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index 42a1fb6b..cd1b1553 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -102,6 +102,15 @@ var ( []string{"model"}, ) + // ModelCostUSD tracks the total USD cost attributed to each model + ModelCostUSD = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_model_cost_usd_total", + Help: "The total USD cost attributed to each LLM model", + }, + []string{"model"}, + ) + // ModelTokens tracks the number of tokens used by each model ModelTokens = promauto.NewCounterVec( prometheus.CounterOpts{ @@ -138,6 +147,15 @@ var ( []string{"source_model", "target_model"}, ) + // RoutingReasonCodes tracks routing decisions by reason_code and model + RoutingReasonCodes = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_routing_reason_codes_total", + Help: "The total number of routing decisions by reason code and model", + }, + []string{"reason_code", "model"}, + ) + // ModelCompletionLatency tracks the latency of completions by model ModelCompletionLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ @@ -238,6 +256,25 @@ func RecordModelTokens(model string, tokens float64) { ModelTokens.WithLabelValues(model).Add(tokens) } +// RecordModelCostUSD adds the dollar cost attributed to a specific model +func RecordModelCostUSD(model string, usd float64) { + if usd < 0 { + return + } + ModelCostUSD.WithLabelValues(model).Add(usd) +} + +// RecordRoutingReasonCode increments the counter for a routing decision reason code and model +func RecordRoutingReasonCode(reasonCode, model string) { + if reasonCode == "" { + reasonCode = "unknown" + } + 
if model == "" { + model = "unknown" + } + RoutingReasonCodes.WithLabelValues(reasonCode, model).Inc() +} + // RecordModelTokensDetailed records detailed token usage (prompt and completion) func RecordModelTokensDetailed(model string, promptTokens, completionTokens float64) { // Record in both the aggregated and detailed metrics diff --git a/src/semantic-router/pkg/observability/logging.go b/src/semantic-router/pkg/observability/logging.go new file mode 100644 index 00000000..ad44ae31 --- /dev/null +++ b/src/semantic-router/pkg/observability/logging.go @@ -0,0 +1,28 @@ +package observability + +import ( + "encoding/json" + "log" + "time" +) + +// LogEvent emits a structured JSON log line with a standard envelope +// Fields provided by callers take precedence and will not be overwritten. +func LogEvent(event string, fields map[string]interface{}) { + if fields == nil { + fields = map[string]interface{}{} + } + if _, ok := fields["event"]; !ok { + fields["event"] = event + } + if _, ok := fields["ts"]; !ok { + fields["ts"] = time.Now().UTC().Format(time.RFC3339Nano) + } + b, err := json.Marshal(fields) + if err != nil { + // Fallback to regular log on marshal error + log.Printf("event=%s marshal_error=%v fields_len=%d", event, err, len(fields)) + return + } + log.Println(string(b)) +} diff --git a/website/docs/api/router.md b/website/docs/api/router.md index 26a1e557..b76986c9 100644 --- a/website/docs/api/router.md +++ b/website/docs/api/router.md @@ -218,6 +218,55 @@ sum by (family, effort) ( ) ``` +### Cost and Routing Metrics + +The router exposes additional metrics for cost accounting and routing decisions. + +- `llm_model_cost_usd_total{model}` + - Description: Total accumulated USD cost attributed to each model (computed from token usage and per-1M pricing). + - Labels: + - model: model name used for the request + +- `llm_routing_reason_codes_total{reason_code, model}` + - Description: Count of routing decisions by reason code and selected model. + - Labels: + - reason_code: why a routing decision happened (e.g., auto_routing, model_specified, pii_policy_alternative_selected) + - model: final selected model + +Example PromQL: + +```prometheus +# Cost by model over the last hour +sum by (model) (increase(llm_model_cost_usd_total[1h])) + +# Routing decisions by reason code over the last 15 minutes +sum by (reason_code) (increase(llm_routing_reason_codes_total[15m])) +``` + +### Pricing Configuration + +Provide per-1M pricing for your models so the router can compute request cost and emit metrics/logs. + +```yaml +model_config: + phi4: + pricing: + prompt_usd_per_1m: 200.0 + completion_usd_per_1m: 600.0 + "mistral-small3.1": + pricing: + prompt_usd_per_1m: 300.0 + completion_usd_per_1m: 900.0 + gemma3:27b: + pricing: + prompt_usd_per_1m: 500.0 + completion_usd_per_1m: 1500.0 +``` + +Notes: +- Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted. +- Cost is computed as: (prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000. 
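+- Example: with phi4 priced at 200.0 / 600.0 USD per 1M tokens, a request using 1,200 prompt tokens and 350 completion tokens costs (1200 * 200.0 + 350 * 600.0) / 1_000_000 = 0.45 USD.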
+ ## gRPC ExtProc API For direct integration with the ExtProc protocol: diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md index dbba7606..074793a9 100644 --- a/website/docs/getting-started/configuration.md +++ b/website/docs/getting-started/configuration.md @@ -124,6 +124,29 @@ model_config: preferred_endpoints: ["my_endpoint"] ``` +### Pricing (Optional) + +If you want the router to compute USD cost per request and expose Prometheus cost metrics, add per-1M token pricing under each model in `model_config`. + +```yaml +model_config: + phi4: + pricing: + prompt_usd_per_1m: 0.07 + completion_usd_per_1m: 0.35 + "mistral-small3.1": + pricing: + prompt_usd_per_1m: 0.1 + completion_usd_per_1m: 0.3 + gemma3:27b: + pricing: + prompt_usd_per_1m: 0.067 + completion_usd_per_1m: 0.267 +``` + +- Cost formula: `(prompt_tokens * prompt_usd_per_1m + completion_tokens * completion_usd_per_1m) / 1_000_000`. +- When not configured, the router still reports token and latency metrics; cost is treated as 0. + ### Classification Models Configure the BERT classification models: @@ -658,4 +681,4 @@ This workflow ensures your configuration is: - **[Quick Start Guide](installation.md)** - Basic usage examples - **[API Documentation](../api/router.md)** - Complete API reference -The configuration system is designed to be simple yet powerful. Start with the basic configuration and gradually enable advanced features as needed. \ No newline at end of file +The configuration system is designed to be simple yet powerful. Start with the basic configuration and gradually enable advanced features as needed.
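Taken together, the pricing block in `config.yaml`, the new `GetModelPricing` accessor, and the cost computation in `handleResponseBody` reduce to one small formula. The sketch below is illustrative only: the `computeCostUSD` helper and the sample token counts are assumptions and not part of this patch; the formula and the phi4 pricing of 200.0 / 600.0 USD per 1M tokens come from the diff above.

```go
package main

import "fmt"

// computeCostUSD mirrors the cost formula used in handleResponseBody:
// (prompt_tokens*prompt_usd_per_1m + completion_tokens*completion_usd_per_1m) / 1_000_000
func computeCostUSD(promptTokens, completionTokens int, promptUSDPer1M, completionUSDPer1M float64) float64 {
	return (float64(promptTokens)*promptUSDPer1M + float64(completionTokens)*completionUSDPer1M) / 1_000_000.0
}

func main() {
	// phi4 pricing from config/config.yaml in this patch: 200.0 prompt / 600.0 completion USD per 1M tokens
	cost := computeCostUSD(1200, 350, 200.0, 600.0)
	fmt.Printf("cost_usd=%.2f\n", cost) // cost_usd=0.45
}
```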