Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/api/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ async fn main() -> anyhow::Result<()> {
model_pricing_cache: api::model_pricing::ModelPricingCache::new(
config.openai.base_url.clone().unwrap_or_default(),
),
system_configs_cache: Arc::new(tokio::sync::RwLock::new(None)),
rate_limit_state,
bi_metrics_service,
};
Expand Down
14 changes: 13 additions & 1 deletion crates/api/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::consts::SYSTEM_PROMPT_MAX_LEN;
use crate::ApiError;
use serde::{Deserialize, Serialize};
use services::file::ports::FileData;
use services::system_configs::ports::SubscriptionPlanConfig;
use services::system_configs::ports::{AutoRouteConfig, SubscriptionPlanConfig};
use services::UserId;
use std::collections::HashMap;
use utoipa::ToSchema;
Expand Down Expand Up @@ -599,12 +599,16 @@ pub struct PublicSystemConfigsResponse {
/// Default model identifier to use when not specified
#[serde(skip_serializing_if = "Option::is_none")]
pub default_model: Option<String>,
/// Auto-routing configuration for `model: "auto"` requests
#[serde(skip_serializing_if = "Option::is_none")]
pub auto_route: Option<AutoRouteConfig>,
}

impl From<services::system_configs::ports::SystemConfigs> for PublicSystemConfigsResponse {
    /// Builds the public view of the system configs.
    ///
    /// Only `default_model` and `auto_route` are carried over; every other
    /// field of the source configs is dropped.
    fn from(config: services::system_configs::ports::SystemConfigs) -> Self {
        // Move the two public fields out and discard the rest.
        let services::system_configs::ports::SystemConfigs {
            default_model,
            auto_route,
            ..
        } = config;

        Self {
            default_model,
            auto_route,
        }
    }
}
Expand All @@ -623,6 +627,9 @@ pub struct SystemConfigsResponse {
/// Maximum number of agent instances per manager (round-robin skips full managers)
#[serde(skip_serializing_if = "Option::is_none")]
pub max_instances_per_manager: Option<u64>,
/// Auto-routing configuration for `model: "auto"` requests
#[serde(skip_serializing_if = "Option::is_none")]
pub auto_route: Option<AutoRouteConfig>,
}

impl From<services::system_configs::ports::SystemConfigs> for SystemConfigsResponse {
Expand All @@ -632,6 +639,7 @@ impl From<services::system_configs::ports::SystemConfigs> for SystemConfigsRespo
rate_limit: config.rate_limit.into(),
subscription_plans: config.subscription_plans,
max_instances_per_manager: config.max_instances_per_manager,
auto_route: config.auto_route,
}
}
}
Expand All @@ -651,6 +659,9 @@ pub struct UpsertSystemConfigsRequest {
/// Maximum number of agent instances per manager (round-robin skips full managers)
#[serde(skip_serializing_if = "Option::is_none")]
pub max_instances_per_manager: Option<u64>,
/// Auto-routing configuration for `model: "auto"` requests
#[serde(skip_serializing_if = "Option::is_none")]
pub auto_route: Option<AutoRouteConfig>,
}

impl TryFrom<UpsertSystemConfigsRequest> for services::system_configs::ports::PartialSystemConfigs {
Expand All @@ -668,6 +679,7 @@ impl TryFrom<UpsertSystemConfigsRequest> for services::system_configs::ports::Pa
rate_limit,
subscription_plans: req.subscription_plans,
max_instances_per_manager: req.max_instances_per_manager,
auto_route: req.auto_route,
})
}
}
Expand Down
40 changes: 39 additions & 1 deletion crates/api/src/routes/admin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -939,14 +939,46 @@ pub async fn upsert_system_configs(

#[cfg(not(feature = "test"))]
if let Some(ref model_id) = request.default_model {
ensure_proxy_model_exists(app_state.proxy_service, model_id).await?;
ensure_proxy_model_exists(app_state.proxy_service.clone(), model_id).await?;
}

// Validate rate limit config if provided
if let Some(ref rate_limit) = request.rate_limit {
rate_limit.validate()?;
}

// Validate auto_route config if provided
if let Some(ref auto_route) = request.auto_route {
if auto_route.model.trim().is_empty() {
return Err(ApiError::bad_request(
"auto_route.model must not be empty".to_string(),
));
}
#[cfg(not(feature = "test"))]
ensure_proxy_model_exists(app_state.proxy_service.clone(), &auto_route.model).await?;
if let Some(t) = auto_route.temperature {
if t < 0.0 {
return Err(ApiError::bad_request(
"auto_route.temperature must be >= 0".to_string(),
));
}
}
if let Some(p) = auto_route.top_p {
if !(0.0..=1.0).contains(&p) {
return Err(ApiError::bad_request(
"auto_route.top_p must be between 0 and 1".to_string(),
));
}
}
if let Some(m) = auto_route.max_tokens {
if m == 0 {
return Err(ApiError::bad_request(
"auto_route.max_tokens must be > 0".to_string(),
));
}
}
}

let partial: services::system_configs::ports::PartialSystemConfigs =
request.try_into().map_err(|e: String| {
tracing::error!(error = %e, "Failed to convert rate limit config");
Expand Down Expand Up @@ -995,6 +1027,12 @@ pub async fn upsert_system_configs(
.update_config(updated.rate_limit.clone())
.await;

// Invalidate system configs cache so auto-route picks up changes immediately
{
let mut cache = app_state.system_configs_cache.write().await;
*cache = None;
}

Ok(Json(updated.into()))
}

Expand Down
95 changes: 85 additions & 10 deletions crates/api/src/routes/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ const NEAR_BALANCE_CACHE_TTL_SECS: i64 = 5 * 60;
/// Duration to cache model settings needed by /v1/responses in memory (in seconds)
const MODEL_SETTINGS_CACHE_TTL_SECS: i64 = 60;

/// Auto-routing: target model and default parameters for `model: "auto"` requests
/// Duration to cache system configs in memory (in seconds)
const SYSTEM_CONFIGS_CACHE_TTL_SECS: i64 = 60;

/// Fallback defaults for `model: "auto"` routing when no `auto_route` system config is set
pub const AUTO_ROUTE_MODEL: &str = "zai-org/GLM-5-FP8";
pub const AUTO_ROUTE_TEMPERATURE: f64 = 1.0;
pub const AUTO_ROUTE_TOP_P: f64 = 0.95;
Expand Down Expand Up @@ -3007,6 +3010,45 @@ async fn proxy_signature(
.await
}

/// Get system configs with in-memory TTL caching.
///
/// A cache entry younger than `SYSTEM_CONFIGS_CACHE_TTL_SECS` seconds is
/// returned without touching the database. Otherwise the configs are fetched
/// through `state.system_configs_service` and the cache entry is replaced.
///
/// Returns `None` when no configs exist. If the fetch fails, the error is
/// logged and the previously cached configs (if any) are returned and kept in
/// the cache for another TTL window, so a transient DB failure does not
/// replace a known-good configuration with "no config". With nothing cached,
/// a failed fetch yields `None` (graceful degradation).
async fn get_system_configs_cached(
    state: &crate::state::AppState,
) -> Option<services::system_configs::ports::SystemConfigs> {
    // 1) Try cache first. The read guard is scoped so it is released before
    //    the DB call below.
    {
        let cache = state.system_configs_cache.read().await;
        if let Some(entry) = cache.as_ref() {
            let age_secs = Utc::now()
                .signed_duration_since(entry.last_checked_at)
                .num_seconds();
            // A negative age (timestamp in the future) is treated as expired.
            if (0..SYSTEM_CONFIGS_CACHE_TTL_SECS).contains(&age_secs) {
                return entry.configs.clone();
            }
        }
    }

    // 2) Cache miss or expired: fetch from DB without holding any lock.
    let fetched = state.system_configs_service.get_configs().await;

    // 3) Store the outcome. The timestamp is refreshed on failure too, so the
    //    DB is retried at most once per TTL window.
    let mut cache = state.system_configs_cache.write().await;
    let configs = match fetched {
        Ok(configs) => configs,
        Err(e) => {
            tracing::warn!(
                "Failed to load system configs, using last cached value or defaults: {e}"
            );
            // Keep the last known value instead of overwriting it with `None`.
            cache.as_ref().and_then(|entry| entry.configs.clone())
        }
    };
    *cache = Some(crate::state::SystemConfigsCacheEntry {
        last_checked_at: Utc::now(),
        configs: configs.clone(),
    });

    configs
}

/// Helper function to get model settings with caching support.
/// Returns (system_prompt, cache_hit) if model is found and public.
/// Validates model visibility and populates cache if needed.
Expand Down Expand Up @@ -3136,15 +3178,48 @@ async fn prepare_chat_completions_body(
if let Some(body) = body_json.as_mut() {
if body.get("model").and_then(|v| v.as_str()) == Some("auto") {
tracing::info!("Auto-routing model: user_id={}", user.user_id);
body["model"] = json!(AUTO_ROUTE_MODEL);
if body.get("temperature").is_none_or(|v| v.is_null()) {
body["temperature"] = json!(AUTO_ROUTE_TEMPERATURE);
}
if body.get("top_p").is_none_or(|v| v.is_null()) {
body["top_p"] = json!(AUTO_ROUTE_TOP_P);
}
if body.get("max_tokens").is_none_or(|v| v.is_null()) {
body["max_tokens"] = json!(AUTO_ROUTE_MAX_TOKENS);

// Load auto-route config from system configs (cached with TTL)
let auto_config = get_system_configs_cached(state)
.await
.and_then(|c| c.auto_route);

let route_model = auto_config
.as_ref()
.map(|c| c.model.as_str())
.unwrap_or(AUTO_ROUTE_MODEL);

body["model"] = json!(route_model);

// When config exists, only inject params explicitly set in it.
// When no config exists, use hardcoded fallback defaults.
if let Some(ref cfg) = auto_config {
if let Some(t) = cfg.temperature {
if body.get("temperature").is_none_or(|v| v.is_null()) {
body["temperature"] = json!(t);
}
}
if let Some(p) = cfg.top_p {
if body.get("top_p").is_none_or(|v| v.is_null()) {
body["top_p"] = json!(p);
}
}
if let Some(m) = cfg.max_tokens {
if body.get("max_tokens").is_none_or(|v| v.is_null()) {
body["max_tokens"] = json!(m);
}
}
} else {
// No auto_route config set — use hardcoded fallback defaults
if body.get("temperature").is_none_or(|v| v.is_null()) {
body["temperature"] = json!(AUTO_ROUTE_TEMPERATURE);
}
if body.get("top_p").is_none_or(|v| v.is_null()) {
body["top_p"] = json!(AUTO_ROUTE_TOP_P);
}
if body.get("max_tokens").is_none_or(|v| v.is_null()) {
body["max_tokens"] = json!(AUTO_ROUTE_MAX_TOKENS);
}
}
auto_routed = true;
}
Expand Down
12 changes: 12 additions & 0 deletions crates/api/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ pub struct ModelSettingsCacheEntry {
/// Type alias for model settings cache (per-model)
pub type ModelSettingsCache = Arc<RwLock<HashMap<String, ModelSettingsCacheEntry>>>;

/// Cached system configs entry (single global value).
///
/// Stored inside `SystemConfigsCache`; the outer `Option` of that alias being
/// `None` means nothing has been cached (or the cache was invalidated).
#[derive(Debug, Clone)]
pub struct SystemConfigsCacheEntry {
    /// When this entry was written; compared against the cache TTL on reads.
    pub last_checked_at: DateTime<Utc>,
    /// The cached lookup result. `None` records that the lookup produced no configs.
    pub configs: Option<services::system_configs::ports::SystemConfigs>,
}

/// Type alias for system configs cache
pub type SystemConfigsCache = Arc<RwLock<Option<SystemConfigsCacheEntry>>>;

/// Application state shared across all handlers
#[derive(Clone)]
pub struct AppState {
Expand Down Expand Up @@ -67,6 +77,8 @@ pub struct AppState {
pub model_settings_cache: ModelSettingsCache,
/// In-memory cache for model pricing (input/output nano per token) from cloud-api for cost calculation
pub model_pricing_cache: crate::model_pricing::ModelPricingCache,
/// In-memory cache for system configs (single entry with TTL)
pub system_configs_cache: SystemConfigsCache,
/// Rate limit state for hot-reloadable rate limit configuration
pub rate_limit_state: crate::middleware::RateLimitState,
/// BI metrics service for deployment and usage analytics
Expand Down
1 change: 1 addition & 0 deletions crates/api/tests/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ pub async fn create_test_server_and_db(
model_pricing_cache: api::model_pricing::ModelPricingCache::new(
test_config.cloud_api_base_url.clone(),
),
system_configs_cache: Arc::new(tokio::sync::RwLock::new(None)),
rate_limit_state,
bi_metrics_service,
};
Expand Down
23 changes: 23 additions & 0 deletions crates/services/src/system_configs/ports.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,23 @@ impl Default for RateLimitConfig {
}
}

/// Auto-routing configuration for `model: "auto"` chat completion requests.
///
/// `model` is required. Each optional parameter is a default only: it is
/// injected when the client's request leaves that parameter unset or null,
/// and nothing is injected when the field is `None`.
#[cfg_attr(feature = "utoipa", derive(utoipa::ToSchema))]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AutoRouteConfig {
    /// Target model to substitute for "auto"
    pub model: String,
    /// Default temperature (injected when client doesn't provide one; omitted if None)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f64>,
    /// Default top_p (injected when client doesn't provide one; omitted if None)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub top_p: Option<f64>,
    /// Default max_tokens (injected when client doesn't provide one; omitted if None)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<u64>,
}

/// Application-wide configuration stored in `system_configs` table
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemConfigs {
Expand All @@ -135,6 +152,9 @@ pub struct SystemConfigs {
/// round-robin skips it. If all managers are full, instance creation is rejected.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub max_instances_per_manager: Option<u64>,
/// Auto-routing configuration for `model: "auto"` requests
#[serde(default, skip_serializing_if = "Option::is_none")]
pub auto_route: Option<AutoRouteConfig>,
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
Expand All @@ -143,6 +163,7 @@ pub struct PartialSystemConfigs {
pub rate_limit: Option<RateLimitConfig>,
pub subscription_plans: Option<HashMap<String, SubscriptionPlanConfig>>,
pub max_instances_per_manager: Option<u64>,
pub auto_route: Option<AutoRouteConfig>,
}

#[allow(clippy::derivable_impls)]
Expand All @@ -153,6 +174,7 @@ impl Default for SystemConfigs {
rate_limit: RateLimitConfig::default(),
subscription_plans: None,
max_instances_per_manager: Some(200),
auto_route: None,
}
}
}
Expand All @@ -166,6 +188,7 @@ impl SystemConfigs {
max_instances_per_manager: partial
.max_instances_per_manager
.or(self.max_instances_per_manager),
auto_route: partial.auto_route.or(self.auto_route),
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion crates/services/src/system_configs/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ impl SystemConfigsServiceImpl {
#[async_trait]
impl SystemConfigsService for SystemConfigsServiceImpl {
/// Fetches the stored system configs.
///
/// Emits a tracing event, then returns the repository's `get_configs`
/// result unchanged.
async fn get_configs(&self) -> anyhow::Result<Option<SystemConfigs>> {
tracing::info!("Getting system configs");
tracing::debug!("Getting system configs");

self.repository.get_configs().await
}
Expand Down