Skip to content

Commit abbf01a

Browse files
committed
feat: enhance model alias handling and error reporting in orchestrator
1 parent f9d30c2 commit abbf01a

File tree

3 files changed

+43
-21
lines changed

3 files changed

+43
-21
lines changed

assets/bootstrap.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,22 @@ def clean_str(s):
100100
# AEGIS_MODEL_ALIAS is injected by the orchestrator from spec.runtime.model.
101101
# It routes this execution to the correct provider alias (e.g. "judge",
102102
# "smart", "default"). The key must match InnerLoopRequest.model_alias.
103+
# The orchestrator MUST always inject this variable; a missing value indicates
104+
# a misconfiguration that must be fixed at the source, not silently masked.
105+
model_alias = os.environ.get("AEGIS_MODEL_ALIAS")
106+
if model_alias is None:
107+
print(
108+
"Error: AEGIS_MODEL_ALIAS environment variable is not set. "
109+
"The orchestrator must inject this for every execution via spec.runtime.model.",
110+
file=sys.stderr,
111+
)
112+
sys.exit(1)
113+
debug_print(f"Model alias: {model_alias}")
103114
payload = {
104115
"prompt": final_prompt,
105116
"execution_id": execution_id,
106117
"iteration_number": iteration_number,
107-
"model_alias": os.environ.get("AEGIS_MODEL_ALIAS", "default")
118+
"model_alias": model_alias,
108119
}
109120

110121
debug_print(f"Preparing LLM request - prompt length: {len(final_prompt)} chars")

cli/src/daemon/server.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1828,11 +1828,8 @@ async fn lookup_agent_handler(
18281828
struct LlmGenerateRequest {
18291829
execution_id: Option<Uuid>,
18301830
iteration_number: Option<u8>,
1831-
_provider: Option<String>,
1832-
model: Option<String>,
1831+
model_alias: Option<String>,
18331832
prompt: String,
1834-
_temperature: Option<f32>,
1835-
_max_tokens: Option<u32>,
18361833
}
18371834

18381835
async fn llm_generate_handler(
@@ -1869,7 +1866,10 @@ async fn llm_generate_handler(
18691866
.map(|id| id.to_string())
18701867
.unwrap_or_else(|| Uuid::nil().to_string());
18711868

1872-
let model_alias = req.model.clone().unwrap_or_else(|| "default".to_string());
1869+
let model_alias = req
1870+
.model_alias
1871+
.clone()
1872+
.unwrap_or_else(|| "default".to_string());
18731873

18741874
// Build the inner loop request, seeding the conversation with the rendered prompt
18751875
let inner_req = InnerLoopRequest {

orchestrator/core/src/infrastructure/llm/registry.rs

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ use super::openai::OpenAIAdapter;
3131
/// `providers` holds one health-check adapter per provider name (using `models.first()`).
3232
pub struct ProviderRegistry {
3333
/// alias → pre-configured adapter for that exact model.
34-
alias_map: HashMap<String, Arc<dyn LLMProvider>>,
34+
alias_map: HashMap<String, (String, Arc<dyn LLMProvider>)>,
3535
/// provider_name → adapter used for health checks (built from models.first()).
3636
providers: HashMap<String, Arc<dyn LLMProvider>>,
3737
/// Fallback adapter resolved at construction time; used when primary exhausts retries.
38-
fallback_provider: Option<Arc<dyn LLMProvider>>,
38+
fallback_provider: Option<(String, Arc<dyn LLMProvider>)>,
3939
max_retries: u32,
4040
retry_delay_ms: u64,
4141
}
@@ -135,7 +135,7 @@ impl ProviderRegistry {
135135
}
136136

137137
// ── Phase 2: build one per-model adapter per winning alias ─────────────────────
138-
let mut alias_map: HashMap<String, Arc<dyn LLMProvider>> = HashMap::new();
138+
let mut alias_map: HashMap<String, (String, Arc<dyn LLMProvider>)> = HashMap::new();
139139

140140
for provider_config in &config.spec.llm_providers {
141141
if !provider_config.enabled {
@@ -149,7 +149,7 @@ impl ProviderRegistry {
149149
{
150150
match Self::create_adapter(provider_config, winner_model) {
151151
Ok(adapter) => {
152-
alias_map.insert(alias.clone(), adapter);
152+
alias_map.insert(alias.clone(), (winner_model.clone(), adapter));
153153
}
154154
Err(e) => {
155155
warn!(
@@ -164,12 +164,18 @@ impl ProviderRegistry {
164164
}
165165

166166
// Resolve fallback to a concrete adapter at construction time.
167+
// The fallback reuses the provider's health-check adapter, which is built from `models.first()`, so we pair it with that first model's name for logging.
167168
let fallback_provider = config
168169
.spec
169170
.llm_selection
170171
.fallback_provider
171172
.as_deref()
172-
.and_then(|name| providers.get(name).cloned());
173+
.and_then(|name| {
174+
let adapter = providers.get(name)?.clone();
175+
let provider_config = config.spec.llm_providers.iter().find(|p| p.name == name)?;
176+
let first_model = provider_config.models.first()?.model.clone();
177+
Some((first_model, adapter))
178+
});
173179

174180
Ok(Self {
175181
alias_map,
@@ -235,31 +241,34 @@ impl ProviderRegistry {
235241
tools: &[ToolSchema],
236242
options: &GenerationOptions,
237243
) -> Result<ChatResponse, LLMError> {
238-
let provider = self
244+
let (model_name, provider) = self
239245
.alias_map
240246
.get(alias)
241247
.ok_or_else(|| LLMError::ModelNotFound(format!("Model alias '{}' not found", alias)))?;
242248

249+
info!("LLM inference: alias='{}', model='{}'", alias, model_name);
250+
243251
let mut last_error = None;
244252

245253
for attempt in 0..self.max_retries {
246254
match provider.generate_chat(messages, tools, options).await {
247255
Ok(response) => {
248-
info!("generate_chat successful on attempt {}", attempt + 1);
256+
info!("generate_chat successful: alias='{}', model='{}', attempt={}", alias, model_name, attempt + 1);
249257
return Ok(response);
250258
}
251259
Err(e) => {
252260
warn!(
253-
"generate_chat failed (attempt {}/{}): {:?}",
261+
"generate_chat failed: alias='{}', attempt={}/{}: {:?}",
262+
alias,
254263
attempt + 1,
255264
self.max_retries,
256265
e
257266
);
258267
last_error = Some(e);
259268

260269
if attempt == self.max_retries - 1 {
261-
if let Some(fallback) = &self.fallback_provider {
262-
info!("Trying fallback provider");
270+
if let Some((fallback_model, fallback)) = &self.fallback_provider {
271+
info!("Trying fallback provider (model='{}')", fallback_model);
263272
return fallback.generate_chat(messages, tools, options).await;
264273
}
265274
}
@@ -286,17 +295,19 @@ impl ProviderRegistry {
286295
prompt: &str,
287296
options: &GenerationOptions,
288297
) -> Result<GenerationResponse, LLMError> {
289-
let provider = self
298+
let (model_name, provider) = self
290299
.alias_map
291300
.get(alias)
292301
.ok_or_else(|| LLMError::ModelNotFound(format!("Model alias '{}' not found", alias)))?;
293302

303+
info!("LLM text generation: alias='{}', model='{}'", alias, model_name);
304+
294305
let mut last_error = None;
295306

296307
for attempt in 0..self.max_retries {
297308
match provider.generate(prompt, options).await {
298309
Ok(response) => {
299-
info!("Generation successful on attempt {}", attempt + 1);
310+
info!("Generation successful on attempt {} (model='{}')", attempt + 1, model_name);
300311
return Ok(response);
301312
}
302313
Err(e) => {
@@ -309,8 +320,8 @@ impl ProviderRegistry {
309320
last_error = Some(e);
310321

311322
if attempt == self.max_retries - 1 {
312-
if let Some(fallback) = &self.fallback_provider {
313-
info!("Trying fallback provider");
323+
if let Some((fallback_model, fallback)) = &self.fallback_provider {
324+
info!("Trying fallback provider (model='{}')", fallback_model);
314325
return fallback.generate(prompt, options).await;
315326
}
316327
}
@@ -434,7 +445,7 @@ impl LLMProvider for ProviderRegistry {
434445
}
435446

436447
async fn health_check(&self) -> Result<(), LLMError> {
437-
let provider = self
448+
let (_, provider) = self
438449
.alias_map
439450
.get("default")
440451
.ok_or_else(|| LLMError::Provider("Default model alias not configured".into()))?;

0 commit comments

Comments
 (0)