From 4e286e567be9ab6611c83f8783a7c0adbd1a3bc9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 22 Sep 2025 22:21:20 +0000 Subject: [PATCH 1/2] Initial plan From b4c206f04bba01edddf4a5b0b828673c6605b0ac Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 22 Sep 2025 22:40:10 +0000 Subject: [PATCH 2/2] Implement comprehensive multi-cloud and hybrid cloud routing support Co-authored-by: wangchen615 <39573014+wangchen615@users.noreply.github.com> --- config/multi-cloud-config-example.yaml | 351 +++++++++++++++++ src/semantic-router/pkg/config/config.go | 366 ++++++++++++++++++ .../pkg/extproc/request_handler.go | 97 ++++- src/semantic-router/pkg/extproc/router.go | 11 + .../pkg/intercluster/router.go | 351 +++++++++++++++++ .../pkg/intercluster/router_test.go | 293 ++++++++++++++ .../getting-started/multi-cloud-routing.md | 205 ++++++++++ 7 files changed, 1661 insertions(+), 13 deletions(-) create mode 100644 config/multi-cloud-config-example.yaml create mode 100644 src/semantic-router/pkg/intercluster/router.go create mode 100644 src/semantic-router/pkg/intercluster/router_test.go create mode 100644 website/docs/getting-started/multi-cloud-routing.md diff --git a/config/multi-cloud-config-example.yaml b/config/multi-cloud-config-example.yaml new file mode 100644 index 00000000..078b2e8f --- /dev/null +++ b/config/multi-cloud-config-example.yaml @@ -0,0 +1,351 @@ +# Multi-Cloud Semantic Router Configuration Example +# This configuration demonstrates inter-cluster and hybrid cloud routing capabilities + +bert_model: + model_id: sentence-transformers/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + +semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 + eviction_policy: "fifo" + +tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Local vLLM Endpoints (legacy single-cluster support) +vllm_endpoints: + - name: "local-endpoint" + address: "127.0.0.1" + port: 8000 + models: + - "llama-2-7b" + weight: 1 + health_check_path: "/health" + +# Model Configuration +model_config: + "llama-2-70b": + reasoning_family: "llama" + pii_policy: + allow_by_default: true + "gpt-4": + reasoning_family: "gpt" + pii_policy: + allow_by_default: false + pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"] + "claude-3": + reasoning_family: "claude" + pii_policy: + allow_by_default: true + +# Inter-Cluster and Multi-Cloud Routing Configuration +inter_cluster_routing: + enabled: true + + # Cluster Discovery Configuration + cluster_discovery: + method: "static" # Options: "static", "kubernetes", "consul", "etcd" + refresh_interval: "30s" + health_check_interval: "10s" + + # Static cluster definitions + static_clusters: + - name: "on-prem-gpu-cluster" + location: "us-west-2" + type: "vllm" + endpoint: "https://on-prem.company.com:8000" + authentication: + type: "bearer" + token: "bearer-token-secret" + models: + - "llama-2-70b" + - "codellama-34b" + - "mistral-7b" + capabilities: + max_context_length: 4096 + max_tokens_per_second: 100 + performance: + avg_latency_ms: 150 + throughput_rps: 50 + 
availability: 99.5 + compliance: + - "hipaa" + - "sox" + cost_per_token: 0.001 + health_check: + path: "/health" + interval: "15s" + timeout: "5s" + unhealthy_threshold: 3 + healthy_threshold: 2 + + - name: "eu-west-cluster" + location: "eu-west-1" + type: "vllm" + endpoint: "https://eu-cluster.company.com:8000" + authentication: + type: "bearer" + token: "eu-bearer-token" + models: + - "llama-2-70b" + - "mistral-7b" + capabilities: + max_context_length: 4096 + max_tokens_per_second: 80 + performance: + avg_latency_ms: 200 + throughput_rps: 40 + availability: 99.9 + compliance: + - "gdpr" + - "iso27001" + cost_per_token: 0.0015 + health_check: + path: "/health" + interval: "15s" + timeout: "5s" + + - name: "code-specialized-cluster" + location: "us-east-1" + type: "vllm" + endpoint: "https://code-cluster.company.com:8000" + authentication: + type: "api_key" + key: "api-key-secret" + models: + - "codellama-34b" + - "gpt-4-code" + capabilities: + max_context_length: 8192 + max_tokens_per_second: 120 + performance: + avg_latency_ms: 100 + throughput_rps: 60 + availability: 99.8 + cost_per_token: 0.002 + + # Cloud Provider Configurations + providers: + - name: "openai-cloud" + type: "openai" + endpoint: "https://api.openai.com/v1" + authentication: + type: "api_key" + key: "sk-your-openai-api-key" + models: + - "gpt-4" + - "gpt-3.5-turbo" + - "gpt-4-turbo" + capabilities: + max_context_length: 8192 + max_tokens_per_second: 200 + performance: + avg_latency_ms: 300 + throughput_rps: 100 + availability: 99.9 + rate_limit: + requests_per_minute: 500 + tokens_per_minute: 90000 + burst_allowance: 50 + + - name: "anthropic-claude" + type: "claude" + endpoint: "https://api.anthropic.com/v1" + authentication: + type: "api_key" + key: "claude-api-key" + models: + - "claude-3" + - "claude-3-sonnet" + - "claude-3-haiku" + capabilities: + max_context_length: 200000 + max_tokens_per_second: 150 + performance: + avg_latency_ms: 400 + throughput_rps: 80 + availability: 99.8 + rate_limit: + requests_per_minute: 300 + tokens_per_minute: 50000 + + - name: "grok-provider" + type: "grok" + endpoint: "https://api.x.ai/v1" + authentication: + type: "api_key" + key: "grok-api-key" + models: + - "grok-1" + - "grok-1.5" + capabilities: + max_context_length: 128000 + max_tokens_per_second: 100 + performance: + avg_latency_ms: 500 + throughput_rps: 60 + availability: 99.5 + rate_limit: + requests_per_minute: 200 + tokens_per_minute: 40000 + + # Routing Strategies (applied in priority order - higher number = higher priority) + routing_strategies: + # Highest Priority: Compliance-based routing for GDPR requirements + - name: "gdpr-compliance-routing" + priority: 300 + conditions: + - type: "compliance_requirement" + required_compliance: ["gdpr"] + actions: + - type: "route_to_cluster" + target: "eu-west-cluster" + + # High Priority: Code generation routing + - name: "code-generation-routing" + priority: 250 + conditions: + - type: "model_requirement" + required_model: "codellama-34b" + actions: + - type: "route_to_cluster" + target: "code-specialized-cluster" + + # Medium Priority: Latency-optimized routing + - name: "latency-optimized-routing" + priority: 200 + conditions: + - type: "latency_requirement" + max_latency_ms: 200 + actions: + - type: "route_to_cluster" + target: "code-specialized-cluster" + - type: "failover" + failover_targets: ["on-prem-gpu-cluster", "eu-west-cluster"] + + # Medium Priority: Cost-sensitive routing + - name: "cost-optimized-routing" + priority: 150 + conditions: + - type: "cost_sensitivity" 
+ max_cost_per_1k_tokens: 0.0015 + actions: + - type: "route_to_cluster" + target: "on-prem-gpu-cluster" + - type: "failover" + failover_targets: ["eu-west-cluster"] + + # Low Priority: Load balancing for general queries + - name: "load-balanced-routing" + priority: 100 + conditions: [] # No specific conditions - applies to all requests + actions: + - type: "load_balance" + load_balance_strategy: "round_robin" + + # Fault Tolerance Configuration + fault_tolerance: + circuit_breaker: + failure_threshold: 5 + timeout: "30s" + max_requests: 10 + retry_policy: + max_retries: 3 + backoff_multiplier: 2.0 + max_backoff: "10s" + retry_on_errors: ["timeout", "connection_error", "server_error"] + fallback_strategy: "next_best_cluster" + default_fallback_cluster: "on-prem-gpu-cluster" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_model/pii_type_mapping.json" + +# Categories for routing queries +categories: + - name: "math" + description: "Mathematical calculations and problem solving" + model_scores: + - model: "llama-2-70b" + score: 0.9 + use_reasoning: true + - model: "gpt-4" + score: 0.85 + use_reasoning: true + + - name: "creative" + description: "Creative writing, storytelling, and artistic content" + model_scores: + - model: "claude-3" + score: 0.95 + use_reasoning: false + - model: "gpt-4" + score: 0.8 + use_reasoning: false + + - name: "code_generation" + description: "Programming, code generation, and software development" + reasoning_description: "Code generation with step-by-step reasoning" + reasoning_effort: "high" + model_scores: + - model: "codellama-34b" + score: 0.95 + use_reasoning: true + - model: "gpt-4-code" + score: 0.9 + use_reasoning: true + + - name: "general" + description: "General purpose queries and conversations" + model_scores: + - model: "llama-2-70b" + score: 0.8 + use_reasoning: false + - model: "gpt-3.5-turbo" + score: 0.75 + use_reasoning: false + +# Default model to use if no match is found +default_model: "llama-2-7b" + +# Default reasoning effort level +default_reasoning_effort: "medium" + +# Reasoning family configurations +reasoning_families: + llama: + type: "chat_template_kwargs" + parameter: "thinking" + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" + claude: + type: "chat_template_kwargs" + parameter: "enable_thinking" \ No newline at end of file diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index cf4aafa9..b2653d89 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -85,6 +85,9 @@ type RouterConfig struct { // vLLM endpoints configuration for multiple backend support VLLMEndpoints []VLLMEndpoint `yaml:"vllm_endpoints"` + // Inter-cluster routing configuration + InterClusterRouting InterClusterConfig `yaml:"inter_cluster_routing,omitempty"` + // API configuration for classification endpoints API APIConfig `yaml:"api"` } @@ -251,6 +254,272 @@ const ( PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes ) +// InterClusterConfig represents the inter-cluster and multi-cloud routing configuration +type InterClusterConfig struct { + // Enable inter-cluster 
routing + Enabled bool `yaml:"enabled"` + + // Cluster discovery configuration + ClusterDiscovery ClusterDiscoveryConfig `yaml:"cluster_discovery,omitempty"` + + // Provider configurations + Providers []ProviderConfig `yaml:"providers,omitempty"` + + // Routing strategies + RoutingStrategies []RoutingStrategy `yaml:"routing_strategies,omitempty"` + + // Fault tolerance configuration + FaultTolerance FaultToleranceConfig `yaml:"fault_tolerance,omitempty"` +} + +// ClusterDiscoveryConfig represents cluster discovery and management settings +type ClusterDiscoveryConfig struct { + // Discovery method: "static", "kubernetes", "consul", "etcd" + Method string `yaml:"method"` + + // Refresh interval for dynamic discovery + RefreshInterval string `yaml:"refresh_interval,omitempty"` + + // Health check interval + HealthCheckInterval string `yaml:"health_check_interval,omitempty"` + + // Static cluster definitions (used when method is "static") + StaticClusters []ClusterConfig `yaml:"static_clusters,omitempty"` +} + +// ClusterConfig represents a cluster configuration +type ClusterConfig struct { + // Cluster name identifier + Name string `yaml:"name"` + + // Cluster location/region + Location string `yaml:"location"` + + // Cluster type: "vllm", "openai", "claude", "grok", "custom" + Type string `yaml:"type"` + + // Endpoint address + Endpoint string `yaml:"endpoint"` + + // Authentication configuration + Authentication AuthConfig `yaml:"authentication,omitempty"` + + // Available models in this cluster + Models []string `yaml:"models"` + + // Cluster capabilities + Capabilities ClusterCapabilities `yaml:"capabilities,omitempty"` + + // Performance characteristics + Performance PerformanceMetrics `yaml:"performance,omitempty"` + + // Compliance and data residency + Compliance []string `yaml:"compliance,omitempty"` + + // Cost per token (for cost-based routing) + CostPerToken float64 `yaml:"cost_per_token,omitempty"` + + // Health check configuration + HealthCheck HealthCheckConfig `yaml:"health_check,omitempty"` +} + +// ProviderConfig represents a cloud provider configuration +type ProviderConfig struct { + // Provider name + Name string `yaml:"name"` + + // Provider type: "vllm", "openai", "claude", "grok", "custom" + Type string `yaml:"type"` + + // Endpoint URL + Endpoint string `yaml:"endpoint"` + + // Authentication configuration + Authentication AuthConfig `yaml:"authentication"` + + // Available models + Models []string `yaml:"models"` + + // Provider-specific capabilities + Capabilities ClusterCapabilities `yaml:"capabilities,omitempty"` + + // Performance characteristics + Performance PerformanceMetrics `yaml:"performance,omitempty"` + + // Rate limiting configuration + RateLimit RateLimitConfig `yaml:"rate_limit,omitempty"` +} + +// AuthConfig represents authentication configuration for providers/clusters +type AuthConfig struct { + // Authentication type: "bearer", "api_key", "oauth", "none" + Type string `yaml:"type"` + + // Bearer token + Token string `yaml:"token,omitempty"` + + // API key + Key string `yaml:"key,omitempty"` + + // OAuth configuration + OAuth OAuthConfig `yaml:"oauth,omitempty"` +} + +// OAuthConfig represents OAuth authentication configuration +type OAuthConfig struct { + ClientID string `yaml:"client_id"` + ClientSecret string `yaml:"client_secret"` + TokenURL string `yaml:"token_url"` + Scopes []string `yaml:"scopes,omitempty"` +} + +// ClusterCapabilities represents cluster/provider capabilities +type ClusterCapabilities struct { + // Maximum context length + 
MaxContextLength int `yaml:"max_context_length,omitempty"` + + // Maximum tokens per second + MaxTokensPerSecond int `yaml:"max_tokens_per_second,omitempty"` + + // Supported features + Features []string `yaml:"features,omitempty"` +} + +// PerformanceMetrics represents performance characteristics +type PerformanceMetrics struct { + // Average latency in milliseconds + AvgLatencyMs int `yaml:"avg_latency_ms,omitempty"` + + // Throughput in requests per second + ThroughputRPS int `yaml:"throughput_rps,omitempty"` + + // Availability percentage + Availability float64 `yaml:"availability,omitempty"` +} + +// RateLimitConfig represents rate limiting configuration +type RateLimitConfig struct { + // Requests per minute + RequestsPerMinute int `yaml:"requests_per_minute,omitempty"` + + // Tokens per minute + TokensPerMinute int `yaml:"tokens_per_minute,omitempty"` + + // Burst allowance + BurstAllowance int `yaml:"burst_allowance,omitempty"` +} + +// RoutingStrategy represents a routing strategy configuration +type RoutingStrategy struct { + // Strategy name + Name string `yaml:"name"` + + // Priority (higher number = higher priority) + Priority int `yaml:"priority"` + + // Conditions for applying this strategy + Conditions []RoutingCondition `yaml:"conditions,omitempty"` + + // Actions to take when conditions are met + Actions []RoutingAction `yaml:"actions"` +} + +// RoutingCondition represents a condition for routing strategy +type RoutingCondition struct { + // Condition type: "latency_requirement", "cost_sensitivity", "data_residency", "model_requirement", etc. + Type string `yaml:"type"` + + // Maximum latency in milliseconds (for latency_requirement) + MaxLatencyMs int `yaml:"max_latency_ms,omitempty"` + + // Maximum cost per 1k tokens (for cost_sensitivity) + MaxCostPer1kTokens float64 `yaml:"max_cost_per_1k_tokens,omitempty"` + + // Required region (for data_residency) + RequiredRegion string `yaml:"required_region,omitempty"` + + // Required model (for model_requirement) + RequiredModel string `yaml:"required_model,omitempty"` + + // Required compliance (for compliance_requirement) + RequiredCompliance []string `yaml:"required_compliance,omitempty"` +} + +// RoutingAction represents an action for routing strategy +type RoutingAction struct { + // Action type: "route_to_cluster", "route_to_provider", "load_balance", "failover" + Type string `yaml:"type"` + + // Target cluster/provider name + Target string `yaml:"target,omitempty"` + + // Load balancing strategy: "round_robin", "least_connections", "weighted" + LoadBalanceStrategy string `yaml:"load_balance_strategy,omitempty"` + + // Failover targets (ordered list) + FailoverTargets []string `yaml:"failover_targets,omitempty"` +} + +// FaultToleranceConfig represents fault tolerance configuration +type FaultToleranceConfig struct { + // Circuit breaker configuration + CircuitBreaker CircuitBreakerConfig `yaml:"circuit_breaker,omitempty"` + + // Retry policy + RetryPolicy RetryPolicyConfig `yaml:"retry_policy,omitempty"` + + // Fallback strategy: "next_best_cluster", "default_cluster", "error" + FallbackStrategy string `yaml:"fallback_strategy,omitempty"` + + // Default fallback cluster + DefaultFallbackCluster string `yaml:"default_fallback_cluster,omitempty"` +} + +// CircuitBreakerConfig represents circuit breaker configuration +type CircuitBreakerConfig struct { + // Failure threshold to open circuit + FailureThreshold int `yaml:"failure_threshold,omitempty"` + + // Timeout before attempting to close circuit + Timeout string 
`yaml:"timeout,omitempty"` + + // Maximum number of requests in half-open state + MaxRequests int `yaml:"max_requests,omitempty"` +} + +// RetryPolicyConfig represents retry policy configuration +type RetryPolicyConfig struct { + // Maximum number of retries + MaxRetries int `yaml:"max_retries,omitempty"` + + // Backoff multiplier + BackoffMultiplier float64 `yaml:"backoff_multiplier,omitempty"` + + // Maximum backoff duration + MaxBackoff string `yaml:"max_backoff,omitempty"` + + // Retry on specific error codes + RetryOnErrors []string `yaml:"retry_on_errors,omitempty"` +} + +// HealthCheckConfig represents health check configuration +type HealthCheckConfig struct { + // Health check path/endpoint + Path string `yaml:"path,omitempty"` + + // Check interval + Interval string `yaml:"interval,omitempty"` + + // Timeout for health check + Timeout string `yaml:"timeout,omitempty"` + + // Unhealthy threshold + UnhealthyThreshold int `yaml:"unhealthy_threshold,omitempty"` + + // Healthy threshold + HealthyThreshold int `yaml:"healthy_threshold,omitempty"` +} + // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache func (c *RouterConfig) GetCacheSimilarityThreshold() float32 { if c.SemanticCache.SimilarityThreshold != nil { @@ -662,3 +931,100 @@ func (c *RouterConfig) ValidateEndpoints() error { return nil } + +// IsInterClusterRoutingEnabled returns true if inter-cluster routing is enabled +func (c *RouterConfig) IsInterClusterRoutingEnabled() bool { + return c.InterClusterRouting.Enabled +} + +// GetClusterByName returns a cluster configuration by name +func (c *RouterConfig) GetClusterByName(name string) (*ClusterConfig, bool) { + if !c.IsInterClusterRoutingEnabled() { + return nil, false + } + + for _, cluster := range c.InterClusterRouting.ClusterDiscovery.StaticClusters { + if cluster.Name == name { + return &cluster, true + } + } + return nil, false +} + +// GetProviderByName returns a provider configuration by name +func (c *RouterConfig) GetProviderByName(name string) (*ProviderConfig, bool) { + if !c.IsInterClusterRoutingEnabled() { + return nil, false + } + + for _, provider := range c.InterClusterRouting.Providers { + if provider.Name == name { + return &provider, true + } + } + return nil, false +} + +// GetRoutingStrategiesByPriority returns routing strategies sorted by priority (descending) +func (c *RouterConfig) GetRoutingStrategiesByPriority() []RoutingStrategy { + if !c.IsInterClusterRoutingEnabled() { + return nil + } + + strategies := make([]RoutingStrategy, len(c.InterClusterRouting.RoutingStrategies)) + copy(strategies, c.InterClusterRouting.RoutingStrategies) + + // Sort by priority (descending - higher priority first) + slices.SortFunc(strategies, func(a, b RoutingStrategy) int { + return b.Priority - a.Priority + }) + + return strategies +} + +// GetAllClustersAndProviders returns all available clusters and providers for routing +func (c *RouterConfig) GetAllClustersAndProviders() ([]ClusterConfig, []ProviderConfig) { + if !c.IsInterClusterRoutingEnabled() { + return nil, nil + } + + clusters := make([]ClusterConfig, len(c.InterClusterRouting.ClusterDiscovery.StaticClusters)) + copy(clusters, c.InterClusterRouting.ClusterDiscovery.StaticClusters) + + providers := make([]ProviderConfig, len(c.InterClusterRouting.Providers)) + copy(providers, c.InterClusterRouting.Providers) + + return clusters, providers +} + +// FindClustersForModel returns all clusters/providers that can serve the specified model +func (c *RouterConfig) 
FindClustersForModel(modelName string) ([]ClusterConfig, []ProviderConfig) { + if !c.IsInterClusterRoutingEnabled() { + return nil, nil + } + + var matchingClusters []ClusterConfig + var matchingProviders []ProviderConfig + + // Check clusters + for _, cluster := range c.InterClusterRouting.ClusterDiscovery.StaticClusters { + for _, model := range cluster.Models { + if model == modelName { + matchingClusters = append(matchingClusters, cluster) + break + } + } + } + + // Check providers + for _, provider := range c.InterClusterRouting.Providers { + for _, model := range provider.Models { + if model == modelName { + matchingProviders = append(matchingProviders, provider) + break + } + } + } + + return matchingClusters, matchingProviders +} diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 867333de..eccbafc1 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -12,6 +12,7 @@ import ( "google.golang.org/grpc/status" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/intercluster" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/utils/http" @@ -389,13 +390,48 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe // Update the actual model that will be used actualModel = matchedModel - // Select the best endpoint for this model - endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(matchedModel) - if endpointFound { - selectedEndpoint = endpointAddress - observability.Infof("Selected endpoint address: %s for model: %s", selectedEndpoint, matchedModel) - } else { - observability.Warnf("No endpoint found for model %s, using fallback", matchedModel) + // Try inter-cluster routing first if enabled + if r.Config.IsInterClusterRoutingEnabled() { + routingCtx := &intercluster.RoutingContext{ + ModelName: matchedModel, + Category: categoryName, + UserContent: userContent, + } + + // Try to route using inter-cluster router + if routingResult, err := r.InterClusterRouter.RouteRequest(routingCtx); err == nil { + selectedEndpoint = routingResult.TargetEndpoint + observability.Infof("Inter-cluster routing selected %s (%s) for model: %s (reason: %s)", + routingResult.TargetName, routingResult.TargetType, matchedModel, routingResult.ReasonCode) + + // Log additional metrics for inter-cluster routing + observability.LogEvent("inter_cluster_routing", map[string]interface{}{ + "target_type": routingResult.TargetType, + "target_name": routingResult.TargetName, + "target_endpoint": routingResult.TargetEndpoint, + "reason_code": routingResult.ReasonCode, + "confidence": routingResult.Confidence, + "estimated_cost": routingResult.EstimatedCost, + "estimated_latency": routingResult.EstimatedLatency, + "model": matchedModel, + "category": categoryName, + }) + metrics.RecordRoutingReasonCode(routingResult.ReasonCode, matchedModel) + } else { + observability.Warnf("Inter-cluster routing failed for model %s: %v, falling back to local routing", matchedModel, err) + } + } + + // Fall back to local endpoint selection if inter-cluster routing didn't work + if selectedEndpoint == "" { + // Select the best endpoint for this model + endpointAddress, endpointFound := 
r.Config.SelectBestEndpointAddressForModel(matchedModel) + if endpointFound { + selectedEndpoint = endpointAddress + observability.Infof("Selected local endpoint address: %s for model: %s", selectedEndpoint, matchedModel) + } else { + observability.Warnf("No local endpoint found for model %s, using fallback", matchedModel) + } } // Modify the model in the request @@ -505,12 +541,47 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe } // Select the best endpoint for the specified model - endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(originalModel) - if endpointFound { - selectedEndpoint = endpointAddress - observability.Infof("Selected endpoint address: %s for model: %s", selectedEndpoint, originalModel) - } else { - observability.Warnf("No endpoint found for model %s, using fallback", originalModel) + // Try inter-cluster routing first if enabled + if r.Config.IsInterClusterRoutingEnabled() { + routingCtx := &intercluster.RoutingContext{ + ModelName: originalModel, + Category: "", // No category for non-auto routing + UserContent: userContent, + } + + // Try to route using inter-cluster router + if routingResult, err := r.InterClusterRouter.RouteRequest(routingCtx); err == nil { + selectedEndpoint = routingResult.TargetEndpoint + observability.Infof("Inter-cluster routing selected %s (%s) for specified model: %s (reason: %s)", + routingResult.TargetName, routingResult.TargetType, originalModel, routingResult.ReasonCode) + + // Log additional metrics for inter-cluster routing + observability.LogEvent("inter_cluster_routing", map[string]interface{}{ + "target_type": routingResult.TargetType, + "target_name": routingResult.TargetName, + "target_endpoint": routingResult.TargetEndpoint, + "reason_code": routingResult.ReasonCode, + "confidence": routingResult.Confidence, + "estimated_cost": routingResult.EstimatedCost, + "estimated_latency": routingResult.EstimatedLatency, + "model": originalModel, + "category": "", + }) + metrics.RecordRoutingReasonCode(routingResult.ReasonCode, originalModel) + } else { + observability.Warnf("Inter-cluster routing failed for model %s: %v, falling back to local routing", originalModel, err) + } + } + + // Fall back to local endpoint selection if inter-cluster routing didn't work + if selectedEndpoint == "" { + endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(originalModel) + if endpointFound { + selectedEndpoint = endpointAddress + observability.Infof("Selected local endpoint address: %s for model: %s", selectedEndpoint, originalModel) + } else { + observability.Warnf("No local endpoint found for model %s, using fallback", originalModel) + } } setHeaders := []*core.HeaderValueOption{} if selectedEndpoint != "" { diff --git a/src/semantic-router/pkg/extproc/router.go b/src/semantic-router/pkg/extproc/router.go index 90eed7c5..3a5fb879 100644 --- a/src/semantic-router/pkg/extproc/router.go +++ b/src/semantic-router/pkg/extproc/router.go @@ -9,6 +9,7 @@ import ( "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/config" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/intercluster" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/services" "github.com/vllm-project/semantic-router/src/semantic-router/pkg/tools" @@ -24,6 +25,7 @@ type OpenAIRouter struct { PIIChecker 
*pii.PolicyChecker Cache cache.CacheBackend ToolsDatabase *tools.ToolsDatabase + InterClusterRouter *intercluster.InterClusterRouter } // Ensure OpenAIRouter implements the ext_proc calls @@ -138,6 +140,14 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) { return nil, fmt.Errorf("failed to create classifier: %w", err) } + // Create inter-cluster router + interClusterRouter := intercluster.NewInterClusterRouter(cfg) + if cfg.IsInterClusterRoutingEnabled() { + observability.Infof("Inter-cluster routing enabled with %d clusters and %d providers", + len(cfg.InterClusterRouting.ClusterDiscovery.StaticClusters), + len(cfg.InterClusterRouting.Providers)) + } + // Create global classification service for API access with auto-discovery // This will prioritize LoRA models over legacy ModernBERT autoSvc, err := services.NewClassificationServiceWithAutoDiscovery(cfg) @@ -157,6 +167,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) { PIIChecker: piiChecker, Cache: semanticCache, ToolsDatabase: toolsDatabase, + InterClusterRouter: interClusterRouter, } // Log reasoning configuration after router is created diff --git a/src/semantic-router/pkg/intercluster/router.go b/src/semantic-router/pkg/intercluster/router.go new file mode 100644 index 00000000..ab41b25c --- /dev/null +++ b/src/semantic-router/pkg/intercluster/router.go @@ -0,0 +1,351 @@ +package intercluster + +import ( + "fmt" + "time" + + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/config" + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability" +) + +// InterClusterRouter handles routing decisions across multiple clusters and providers +type InterClusterRouter struct { + config *config.RouterConfig +} + +// NewInterClusterRouter creates a new inter-cluster router +func NewInterClusterRouter(cfg *config.RouterConfig) *InterClusterRouter { + return &InterClusterRouter{ + config: cfg, + } +} + +// RoutingContext contains information for routing decisions +type RoutingContext struct { + ModelName string + Category string + UserContent string + LatencyRequirement *int // Max latency in milliseconds + CostSensitivity *float64 // Max cost per 1k tokens + ComplianceRequirements []string // Required compliance standards + DataResidency string // Required region +} + +// RoutingResult contains the result of a routing decision +type RoutingResult struct { + TargetType string // "cluster" or "provider" + TargetName string + TargetEndpoint string + ReasonCode string + Confidence float64 + EstimatedCost float64 + EstimatedLatency int +} + +// RouteRequest performs inter-cluster routing based on context and strategies +func (r *InterClusterRouter) RouteRequest(ctx *RoutingContext) (*RoutingResult, error) { + if !r.config.IsInterClusterRoutingEnabled() { + observability.Infof("Inter-cluster routing disabled, falling back to local routing") + return nil, fmt.Errorf("inter-cluster routing is not enabled") + } + + // Get routing strategies by priority + strategies := r.config.GetRoutingStrategiesByPriority() + if len(strategies) == 0 { + observability.Warnf("No routing strategies configured, using default strategy") + return r.defaultRouting(ctx) + } + + // Apply strategies in priority order + for _, strategy := range strategies { + if r.evaluateConditions(strategy.Conditions, ctx) { + result, err := r.executeActions(strategy.Actions, ctx) + if err == nil && result != nil { + result.ReasonCode = fmt.Sprintf("strategy_%s", strategy.Name) + observability.Infof("Applied routing strategy 
'%s' for model '%s'", strategy.Name, ctx.ModelName) + return result, nil + } + observability.Warnf("Strategy '%s' failed: %v", strategy.Name, err) + } + } + + // Fall back to default routing + observability.Infof("No strategies matched, using default routing for model '%s'", ctx.ModelName) + return r.defaultRouting(ctx) +} + +// evaluateConditions checks if all conditions in a strategy are met +func (r *InterClusterRouter) evaluateConditions(conditions []config.RoutingCondition, ctx *RoutingContext) bool { + if len(conditions) == 0 { + return true // No conditions means always apply + } + + for _, condition := range conditions { + if !r.evaluateCondition(condition, ctx) { + return false + } + } + return true +} + +// evaluateCondition checks if a single condition is met +func (r *InterClusterRouter) evaluateCondition(condition config.RoutingCondition, ctx *RoutingContext) bool { + switch condition.Type { + case "latency_requirement": + if ctx.LatencyRequirement != nil && condition.MaxLatencyMs > 0 { + return *ctx.LatencyRequirement <= condition.MaxLatencyMs + } + return false // If no latency requirement in context, this condition should not match + + case "cost_sensitivity": + if ctx.CostSensitivity != nil && condition.MaxCostPer1kTokens > 0 { + return *ctx.CostSensitivity <= condition.MaxCostPer1kTokens + } + return false // If no cost sensitivity in context, this condition should not match + + case "data_residency": + if condition.RequiredRegion != "" && ctx.DataResidency != "" { + return ctx.DataResidency == condition.RequiredRegion + } + return false // If no data residency requirement, this condition should not match + + case "model_requirement": + if condition.RequiredModel != "" { + return ctx.ModelName == condition.RequiredModel + } + return true + + case "compliance_requirement": + if len(condition.RequiredCompliance) > 0 && len(ctx.ComplianceRequirements) > 0 { + return r.hasRequiredCompliance(condition.RequiredCompliance, ctx.ComplianceRequirements) + } + return false // If no compliance requirements in context, this condition should not match + + default: + observability.Warnf("Unknown condition type: %s", condition.Type) + return false + } +} + +// hasRequiredCompliance checks if all required compliance standards are met +func (r *InterClusterRouter) hasRequiredCompliance(required []string, available []string) bool { + for _, req := range required { + found := false + for _, avail := range available { + if req == avail { + found = true + break + } + } + if !found { + return false + } + } + return true +} + +// executeActions executes the actions for a matched strategy +func (r *InterClusterRouter) executeActions(actions []config.RoutingAction, ctx *RoutingContext) (*RoutingResult, error) { + for _, action := range actions { + switch action.Type { + case "route_to_cluster": + return r.routeToCluster(action.Target, ctx) + case "route_to_provider": + return r.routeToProvider(action.Target, ctx) + case "load_balance": + return r.loadBalanceRoute(action, ctx) + case "failover": + return r.failoverRoute(action, ctx) + default: + observability.Warnf("Unknown action type: %s", action.Type) + } + } + return nil, fmt.Errorf("no executable actions found") +} + +// routeToCluster routes to a specific cluster +func (r *InterClusterRouter) routeToCluster(clusterName string, ctx *RoutingContext) (*RoutingResult, error) { + cluster, found := r.config.GetClusterByName(clusterName) + if !found { + return nil, fmt.Errorf("cluster '%s' not found", clusterName) + } + + // Check if cluster 
supports the model + if !r.clusterSupportsModel(cluster, ctx.ModelName) { + return nil, fmt.Errorf("cluster '%s' does not support model '%s'", clusterName, ctx.ModelName) + } + + result := &RoutingResult{ + TargetType: "cluster", + TargetName: clusterName, + TargetEndpoint: cluster.Endpoint, + Confidence: 1.0, + EstimatedCost: cluster.CostPerToken * 1000, // Convert to per 1k tokens + EstimatedLatency: cluster.Performance.AvgLatencyMs, + } + + observability.Infof("Routed to cluster '%s' for model '%s'", clusterName, ctx.ModelName) + return result, nil +} + +// routeToProvider routes to a specific provider +func (r *InterClusterRouter) routeToProvider(providerName string, ctx *RoutingContext) (*RoutingResult, error) { + provider, found := r.config.GetProviderByName(providerName) + if !found { + return nil, fmt.Errorf("provider '%s' not found", providerName) + } + + // Check if provider supports the model + if !r.providerSupportsModel(provider, ctx.ModelName) { + return nil, fmt.Errorf("provider '%s' does not support model '%s'", providerName, ctx.ModelName) + } + + result := &RoutingResult{ + TargetType: "provider", + TargetName: providerName, + TargetEndpoint: provider.Endpoint, + Confidence: 1.0, + EstimatedLatency: provider.Performance.AvgLatencyMs, + } + + observability.Infof("Routed to provider '%s' for model '%s'", providerName, ctx.ModelName) + return result, nil +} + +// loadBalanceRoute implements load balancing across multiple targets +func (r *InterClusterRouter) loadBalanceRoute(action config.RoutingAction, ctx *RoutingContext) (*RoutingResult, error) { + clusters, providers := r.config.FindClustersForModel(ctx.ModelName) + + if len(clusters) == 0 && len(providers) == 0 { + return nil, fmt.Errorf("no clusters or providers found for model '%s'", ctx.ModelName) + } + + // Simple round-robin load balancing based on timestamp + timestamp := time.Now().UnixNano() + totalTargets := len(clusters) + len(providers) + selectedIndex := int(timestamp) % totalTargets + + if selectedIndex < len(clusters) { + // Route to cluster + cluster := clusters[selectedIndex] + return r.routeToCluster(cluster.Name, ctx) + } else { + // Route to provider + provider := providers[selectedIndex-len(clusters)] + return r.routeToProvider(provider.Name, ctx) + } +} + +// failoverRoute implements failover routing +func (r *InterClusterRouter) failoverRoute(action config.RoutingAction, ctx *RoutingContext) (*RoutingResult, error) { + for _, target := range action.FailoverTargets { + // Try cluster first + if result, err := r.routeToCluster(target, ctx); err == nil { + result.ReasonCode = "failover_cluster" + return result, nil + } + + // Try provider + if result, err := r.routeToProvider(target, ctx); err == nil { + result.ReasonCode = "failover_provider" + return result, nil + } + + observability.Warnf("Failover target '%s' failed for model '%s'", target, ctx.ModelName) + } + + return nil, fmt.Errorf("all failover targets failed for model '%s'", ctx.ModelName) +} + +// defaultRouting implements default routing strategy (latency-optimized) +func (r *InterClusterRouter) defaultRouting(ctx *RoutingContext) (*RoutingResult, error) { + clusters, providers := r.config.FindClustersForModel(ctx.ModelName) + + if len(clusters) == 0 && len(providers) == 0 { + return nil, fmt.Errorf("no clusters or providers found for model '%s'", ctx.ModelName) + } + + var bestResult *RoutingResult + bestLatency := int(^uint(0) >> 1) // Max int + + // Check clusters + for _, cluster := range clusters { + if 
cluster.Performance.AvgLatencyMs > 0 && cluster.Performance.AvgLatencyMs < bestLatency { + bestLatency = cluster.Performance.AvgLatencyMs + bestResult = &RoutingResult{ + TargetType: "cluster", + TargetName: cluster.Name, + TargetEndpoint: cluster.Endpoint, + ReasonCode: "default_latency_optimized", + Confidence: 0.8, + EstimatedCost: cluster.CostPerToken * 1000, + EstimatedLatency: cluster.Performance.AvgLatencyMs, + } + } + } + + // Check providers + for _, provider := range providers { + if provider.Performance.AvgLatencyMs > 0 && provider.Performance.AvgLatencyMs < bestLatency { + bestLatency = provider.Performance.AvgLatencyMs + bestResult = &RoutingResult{ + TargetType: "provider", + TargetName: provider.Name, + TargetEndpoint: provider.Endpoint, + ReasonCode: "default_latency_optimized", + Confidence: 0.8, + EstimatedLatency: provider.Performance.AvgLatencyMs, + } + } + } + + if bestResult == nil { + // Fallback: choose first available + if len(clusters) > 0 { + cluster := clusters[0] + bestResult = &RoutingResult{ + TargetType: "cluster", + TargetName: cluster.Name, + TargetEndpoint: cluster.Endpoint, + ReasonCode: "default_first_available", + Confidence: 0.6, + EstimatedCost: cluster.CostPerToken * 1000, + EstimatedLatency: cluster.Performance.AvgLatencyMs, + } + } else { + provider := providers[0] + bestResult = &RoutingResult{ + TargetType: "provider", + TargetName: provider.Name, + TargetEndpoint: provider.Endpoint, + ReasonCode: "default_first_available", + Confidence: 0.6, + EstimatedLatency: provider.Performance.AvgLatencyMs, + } + } + } + + observability.Infof("Default routing selected '%s' for model '%s' with latency %dms", + bestResult.TargetName, ctx.ModelName, bestResult.EstimatedLatency) + return bestResult, nil +} + +// clusterSupportsModel checks if a cluster supports a specific model +func (r *InterClusterRouter) clusterSupportsModel(cluster *config.ClusterConfig, modelName string) bool { + for _, model := range cluster.Models { + if model == modelName { + return true + } + } + return false +} + +// providerSupportsModel checks if a provider supports a specific model +func (r *InterClusterRouter) providerSupportsModel(provider *config.ProviderConfig, modelName string) bool { + for _, model := range provider.Models { + if model == modelName { + return true + } + } + return false +} \ No newline at end of file diff --git a/src/semantic-router/pkg/intercluster/router_test.go b/src/semantic-router/pkg/intercluster/router_test.go new file mode 100644 index 00000000..3b4fdbea --- /dev/null +++ b/src/semantic-router/pkg/intercluster/router_test.go @@ -0,0 +1,293 @@ +package intercluster + +import ( + "testing" + + "github.com/vllm-project/semantic-router/src/semantic-router/pkg/config" +) + +func TestInterClusterRouting(t *testing.T) { + // Create test configuration with inter-cluster routing enabled + cfg := &config.RouterConfig{ + InterClusterRouting: config.InterClusterConfig{ + Enabled: true, + ClusterDiscovery: config.ClusterDiscoveryConfig{ + Method: "static", + StaticClusters: []config.ClusterConfig{ + { + Name: "test-cluster-1", + Location: "us-west-2", + Type: "vllm", + Endpoint: "https://cluster1.example.com:8000", + Models: []string{"llama-2-70b", "mistral-7b"}, + Performance: config.PerformanceMetrics{ + AvgLatencyMs: 150, + }, + CostPerToken: 0.001, + }, + { + Name: "test-cluster-2", + Location: "eu-west-1", + Type: "vllm", + Endpoint: "https://cluster2.example.com:8000", + Models: []string{"llama-2-70b", "gpt-4"}, + Performance: config.PerformanceMetrics{ + 
AvgLatencyMs: 200, + }, + CostPerToken: 0.002, + Compliance: []string{"gdpr"}, + }, + }, + }, + Providers: []config.ProviderConfig{ + { + Name: "openai-provider", + Type: "openai", + Endpoint: "https://api.openai.com/v1", + Models: []string{"gpt-4", "gpt-3.5-turbo"}, + Performance: config.PerformanceMetrics{ + AvgLatencyMs: 300, + }, + }, + }, + RoutingStrategies: []config.RoutingStrategy{ + { + Name: "latency-optimized", + Priority: 100, + Conditions: []config.RoutingCondition{ + { + Type: "latency_requirement", + MaxLatencyMs: 200, + }, + }, + Actions: []config.RoutingAction{ + { + Type: "route_to_cluster", + Target: "test-cluster-1", + }, + }, + }, + { + Name: "compliance-routing", + Priority: 200, + Conditions: []config.RoutingCondition{ + { + Type: "compliance_requirement", + RequiredCompliance: []string{"gdpr"}, + }, + }, + Actions: []config.RoutingAction{ + { + Type: "route_to_cluster", + Target: "test-cluster-2", + }, + }, + }, + }, + }, + } + + router := NewInterClusterRouter(cfg) + + t.Run("TestLatencyBasedRouting", func(t *testing.T) { + ctx := &RoutingContext{ + ModelName: "llama-2-70b", + Category: "general", + UserContent: "Test query", + LatencyRequirement: intPtr(200), + } + + result, err := router.RouteRequest(ctx) + if err != nil { + t.Fatalf("Expected successful routing, got error: %v", err) + } + + if result.TargetName != "test-cluster-1" { + t.Errorf("Expected cluster 'test-cluster-1', got '%s'", result.TargetName) + } + + if result.TargetType != "cluster" { + t.Errorf("Expected target type 'cluster', got '%s'", result.TargetType) + } + + if result.ReasonCode != "strategy_latency-optimized" { + t.Errorf("Expected reason code 'strategy_latency-optimized', got '%s'", result.ReasonCode) + } + }) + + t.Run("TestComplianceBasedRouting", func(t *testing.T) { + ctx := &RoutingContext{ + ModelName: "llama-2-70b", + Category: "general", + UserContent: "Test query", + ComplianceRequirements: []string{"gdpr"}, + } + + result, err := router.RouteRequest(ctx) + if err != nil { + t.Fatalf("Expected successful routing, got error: %v", err) + } + + if result.TargetName != "test-cluster-2" { + t.Errorf("Expected cluster 'test-cluster-2', got '%s'", result.TargetName) + } + + if result.ReasonCode != "strategy_compliance-routing" { + t.Errorf("Expected reason code 'strategy_compliance-routing', got '%s'", result.ReasonCode) + } + }) + + t.Run("TestDefaultRouting", func(t *testing.T) { + ctx := &RoutingContext{ + ModelName: "mistral-7b", + Category: "general", + UserContent: "Test query", + } + + result, err := router.RouteRequest(ctx) + if err != nil { + t.Fatalf("Expected successful routing, got error: %v", err) + } + + // Should route to cluster with better latency (test-cluster-1) + if result.TargetName != "test-cluster-1" { + t.Errorf("Expected cluster 'test-cluster-1' for default routing, got '%s'", result.TargetName) + } + + if result.ReasonCode != "default_latency_optimized" { + t.Errorf("Expected reason code 'default_latency_optimized', got '%s'", result.ReasonCode) + } + }) + + t.Run("TestProviderRouting", func(t *testing.T) { + ctx := &RoutingContext{ + ModelName: "gpt-3.5-turbo", + Category: "general", + UserContent: "Test query", + } + + result, err := router.RouteRequest(ctx) + if err != nil { + t.Fatalf("Expected successful routing, got error: %v", err) + } + + // Should route to either cluster or provider that supports gpt-3.5-turbo + // Since only openai-provider supports gpt-3.5-turbo, it should route there + if result.TargetName != "openai-provider" && result.TargetName 
!= "test-cluster-2" { + t.Errorf("Expected 'openai-provider' or 'test-cluster-2', got '%s'", result.TargetName) + } + }) + + t.Run("TestNoSupportingCluster", func(t *testing.T) { + ctx := &RoutingContext{ + ModelName: "unsupported-model", + Category: "general", + UserContent: "Test query", + } + + _, err := router.RouteRequest(ctx) + if err == nil { + t.Fatalf("Expected error for unsupported model, got success") + } + }) + + t.Run("TestDisabledInterClusterRouting", func(t *testing.T) { + disabledCfg := &config.RouterConfig{ + InterClusterRouting: config.InterClusterConfig{ + Enabled: false, + }, + } + + disabledRouter := NewInterClusterRouter(disabledCfg) + + ctx := &RoutingContext{ + ModelName: "llama-2-70b", + Category: "general", + UserContent: "Test query", + } + + _, err := disabledRouter.RouteRequest(ctx) + if err == nil { + t.Fatalf("Expected error when inter-cluster routing is disabled, got success") + } + }) +} + +func TestConditionEvaluation(t *testing.T) { + cfg := &config.RouterConfig{ + InterClusterRouting: config.InterClusterConfig{ + Enabled: true, + }, + } + + router := NewInterClusterRouter(cfg) + + t.Run("TestLatencyCondition", func(t *testing.T) { + condition := config.RoutingCondition{ + Type: "latency_requirement", + MaxLatencyMs: 200, + } + + ctx := &RoutingContext{ + LatencyRequirement: intPtr(150), + } + + if !router.evaluateCondition(condition, ctx) { + t.Errorf("Expected latency condition to pass") + } + + ctx.LatencyRequirement = intPtr(250) + if router.evaluateCondition(condition, ctx) { + t.Errorf("Expected latency condition to fail") + } + }) + + t.Run("TestCostCondition", func(t *testing.T) { + condition := config.RoutingCondition{ + Type: "cost_sensitivity", + MaxCostPer1kTokens: 0.0015, + } + + ctx := &RoutingContext{ + CostSensitivity: float64Ptr(0.001), + } + + if !router.evaluateCondition(condition, ctx) { + t.Errorf("Expected cost condition to pass") + } + + ctx.CostSensitivity = float64Ptr(0.002) + if router.evaluateCondition(condition, ctx) { + t.Errorf("Expected cost condition to fail") + } + }) + + t.Run("TestComplianceCondition", func(t *testing.T) { + condition := config.RoutingCondition{ + Type: "compliance_requirement", + RequiredCompliance: []string{"gdpr", "sox"}, + } + + ctx := &RoutingContext{ + ComplianceRequirements: []string{"gdpr", "sox", "hipaa"}, + } + + if !router.evaluateCondition(condition, ctx) { + t.Errorf("Expected compliance condition to pass") + } + + ctx.ComplianceRequirements = []string{"gdpr"} + if router.evaluateCondition(condition, ctx) { + t.Errorf("Expected compliance condition to fail (missing sox)") + } + }) +} + +// Helper functions for test pointers +func intPtr(i int) *int { + return &i +} + +func float64Ptr(f float64) *float64 { + return &f +} \ No newline at end of file diff --git a/website/docs/getting-started/multi-cloud-routing.md b/website/docs/getting-started/multi-cloud-routing.md new file mode 100644 index 00000000..0c87ea2d --- /dev/null +++ b/website/docs/getting-started/multi-cloud-routing.md @@ -0,0 +1,205 @@ +# Multi-Cloud and Hybrid Cloud Routing + +The Semantic Router now supports **inter-cluster and multi-cloud routing**, enabling sophisticated routing across multiple clusters, cloud providers, and deployment environments. This feature extends the existing semantic classification and routing capabilities to work seamlessly across distributed infrastructure. 
+ +## Overview + +Multi-cloud routing allows you to: + +- **Route across multiple clusters**: Distribute load across vLLM clusters in different regions or environments +- **Integrate cloud providers**: Route to OpenAI, Anthropic Claude, Grok, and other API providers +- **Optimize for performance**: Route based on latency, throughput, and availability +- **Control costs**: Route to the most cost-effective clusters and providers +- **Ensure compliance**: Route based on data residency and regulatory requirements +- **Provide fault tolerance**: Automatic failover and circuit breaker patterns + +## Configuration + +### Enabling Inter-Cluster Routing + +Add the `inter_cluster_routing` section to your configuration: + +```yaml +inter_cluster_routing: + enabled: true + + cluster_discovery: + method: "static" # Options: "static", "kubernetes", "consul", "etcd" + refresh_interval: "30s" + health_check_interval: "10s" +``` + +### Cluster Configuration + +Define your clusters in the `static_clusters` section: + +```yaml +inter_cluster_routing: + cluster_discovery: + static_clusters: + - name: "on-prem-gpu-cluster" + location: "us-west-2" + type: "vllm" + endpoint: "https://on-prem.company.com:8000" + authentication: + type: "bearer" + token: "bearer-token-secret" + models: + - "llama-2-70b" + - "codellama-34b" + capabilities: + max_context_length: 4096 + max_tokens_per_second: 100 + performance: + avg_latency_ms: 150 + throughput_rps: 50 + availability: 99.5 + compliance: + - "hipaa" + - "sox" + cost_per_token: 0.001 +``` + +For more detailed examples and configuration options, see the [complete multi-cloud configuration example](https://github.com/vllm-project/semantic-router/blob/main/config/multi-cloud-config-example.yaml). + +## Routing Strategies + +Configure sophisticated routing logic with prioritized strategies. Strategies are evaluated in priority order (higher number = higher priority). + +### Example Strategy Configuration + +```yaml +routing_strategies: + # Highest Priority: Compliance-based routing + - name: "gdpr-compliance-routing" + priority: 300 + conditions: + - type: "compliance_requirement" + required_compliance: ["gdpr"] + actions: + - type: "route_to_cluster" + target: "eu-west-cluster" + + # Medium Priority: Latency-optimized routing + - name: "latency-optimized-routing" + priority: 200 + conditions: + - type: "latency_requirement" + max_latency_ms: 200 + actions: + - type: "route_to_cluster" + target: "edge-cluster" + - type: "failover" + failover_targets: ["backup-cluster"] +``` + +## Supported Providers + +The system supports routing to multiple types of providers: + +- **vLLM Clusters**: Self-hosted vLLM deployments +- **OpenAI API**: GPT-4, GPT-3.5-turbo, and other OpenAI models +- **Anthropic Claude**: Claude-3 and related models +- **Grok API**: xAI's Grok models +- **Custom Providers**: Extensible plugin architecture + +## Use Cases + +### 1. On-Premises + Cloud Hybrid + +Route sensitive data to on-premises clusters while using cloud providers for general queries: + +```yaml +routing_strategies: + - name: "sensitive-data-routing" + priority: 300 + conditions: + - type: "compliance_requirement" + required_compliance: ["hipaa"] + actions: + - type: "route_to_cluster" + target: "on-prem-secure-cluster" +``` + +### 2. 
Multi-Region GDPR Compliance + +Ensure EU data stays in EU clusters: + +```yaml +routing_strategies: + - name: "eu-data-residency" + priority: 300 + conditions: + - type: "data_residency" + required_region: "eu-west-1" + actions: + - type: "route_to_cluster" + target: "eu-west-cluster" +``` + +### 3. Cost Optimization + +Route to the most cost-effective clusters: + +```yaml +routing_strategies: + - name: "cost-optimization" + priority: 150 + conditions: + - type: "cost_sensitivity" + max_cost_per_1k_tokens: 0.001 + actions: + - type: "route_to_cluster" + target: "cost-effective-cluster" +``` + +## Key Features + +### ✅ **Condition-Based Routing** +- Latency requirements +- Cost sensitivity +- Compliance needs +- Data residency +- Model-specific routing + +### ✅ **Multiple Action Types** +- Direct cluster routing +- Provider routing +- Load balancing +- Failover strategies + +### ✅ **Fault Tolerance** +- Circuit breaker patterns +- Retry policies with backoff +- Automatic failover +- Health monitoring + +### ✅ **Authentication Support** +- Bearer tokens +- API keys +- OAuth (future) +- Per-cluster/provider auth + +### ✅ **Monitoring & Observability** +- Detailed routing logs +- Performance metrics +- Cost tracking +- Health status + +## Migration from Single-Cluster + +Existing configurations remain fully compatible! To add multi-cloud routing: + +1. **Enable the feature**: Add `inter_cluster_routing.enabled: true` +2. **Define clusters**: Move existing endpoints to `static_clusters` +3. **Add strategies**: Configure routing logic based on your needs +4. **Test gradually**: Start with simple strategies and expand + +## Getting Started + +1. **Review the [complete configuration example](https://github.com/vllm-project/semantic-router/blob/main/config/multi-cloud-config-example.yaml)** +2. **Read the [detailed configuration guide](../getting-started/configuration.md)** +3. **Check the [API reference](../api/router/) for programmatic access** +4. **See [troubleshooting tips](../categories/technical-details.md) for common issues** + +This powerful feature enables enterprise-grade routing across complex, distributed LLM infrastructure while maintaining the simplicity and intelligence of semantic classification. \ No newline at end of file
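+
+### Example: Migrating a Local Endpoint
+
+As a minimal sketch of the migration steps above (the endpoint address and cluster name here are illustrative, not prescriptive), an existing `vllm_endpoints` entry can be left in place and mirrored as a static cluster so both routing paths remain available during the transition:
+
+```yaml
+# Before: legacy single-cluster endpoint (continues to work unchanged)
+vllm_endpoints:
+  - name: "local-endpoint"
+    address: "127.0.0.1"
+    port: 8000
+    models:
+      - "llama-2-7b"
+
+# After: the same backend also registered for inter-cluster routing
+inter_cluster_routing:
+  enabled: true
+  cluster_discovery:
+    method: "static"
+    static_clusters:
+      - name: "local-cluster"          # illustrative cluster name
+        location: "on-prem"            # free-form location label
+        type: "vllm"
+        endpoint: "http://127.0.0.1:8000"
+        models:
+          - "llama-2-7b"
+```
+
+With no `routing_strategies` defined, the router applies its default selection over the clusters and providers that serve the requested model, so behaviour stays equivalent to the single-cluster setup until strategies are added.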