feat(routing): add BaRP cost_weight dial and MAR memory_hit_confidence signal (#2466)

bug-ops · web-flow · commit 00b8644c034d · 2026-03-30T20:27:22.000Z
Implements two complementary enhancements to the LinUCB bandit router. BaRP (#2415): apply the `cost_weight: f32` dial of `BanditConfig` (0.0 = pure quality, 1.0 = pure cost) to UCB arm selection, which now penalizes expensive providers proportionally: `adjusted_ucb = raw_ucb - cost_weight * cost_est`. `provider_cost_estimate()` maps model names to relative cost tiers using pattern matching on both the model_id and provider name fields. MAR (#2443): add `memory_hit_confidence: Option<f32>` propagation from the SemanticMemory top-1 recall score through `MemoryState.last_recall_confidence` to the router. When confidence >= `memory_confidence_threshold` (default 0.9), cheap providers receive a boost: `(1 - cost_est) * confidence * cost_weight`. When `cost_weight = 0.0`, the boost is zero, preserving pure-quality mode. Both config fields have serde defaults (cost_weight = 0.0, threshold = 0.9) and are clamped to [0.0, 1.0] at bootstrap.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 - feat(core): `/new` slash command — resets conversation context (messages, compaction state, tool caches, focus/sidequest, pending plans) while preserving memory, MCP connections, providers, and skills; creates a new `ConversationId` in SQLite for audit trail; generates a session digest for the outgoing conversation fire-and-forget unless `--no-digest` is passed; active sub-agents and background compression tasks are cancelled; `--keep-plan` preserves a pending plan graph; available in all channels (CLI, TUI, Telegram) via the unified `handle_builtin_command` path (closes #2451)
 - feat(memory): Kumiho AGM-inspired belief revision for graph edges — new `BeliefRevisionConfig` with `similarity_threshold`; `find_superseded_edges()` uses contradiction heuristic (same relation domain + high cosine similarity = supersession); `superseded_by` column added to `graph_edges` for audit trail; `invalidate_edge_with_supersession()` in `GraphStore`; `resolve_edge_typed` accepts optional `BeliefRevisionConfig`; controlled by `[memory.graph.belief_revision] enabled = false` (migration 056, closes #2441)
 - feat(memory): D-MEM RPE-based tiered graph extraction routing — `RpeRouter` computes heuristic surprise score from context similarity and entity novelty; low-RPE turns skip the MAGMA LLM extraction pipeline; `consecutive_skips` safety valve forces extraction after `max_skip_turns` consecutive skips; `extract_candidate_entities()` helper for cheap regex+keyword entity detection; controlled by `[memory.graph.rpe] enabled = false, threshold = 0.3, max_skip_turns = 5` (closes #2442)
+- feat(llm): BaRP cost-weight dial in bandit router — `cost_weight` now penalises UCB arm scores during provider selection in addition to the existing reward-signal penalty; higher values bias the bandit toward cheaper providers at inference time; static cost tier heuristics based on provider name and model identifier; `cost_weight` is clamped to [0.0, 1.0] at bootstrap (#2415)
+- feat(llm): MAR (Memory-Augmented Routing) — new `[llm.routing.bandit] memory_confidence_threshold` (default 0.9); when the top-1 semantic recall score for the current query meets or exceeds the threshold, the bandit biases toward fast/cheap providers; signal propagated from `SemanticMemory::recall` through `ContextSlot::SemanticRecall` to `RouterProvider`; no routing change when `cost_weight = 0.0` (operator intent respected) (#2443)
 - feat(acp): expose current model in `session/list` and emit `SessionInfoUpdate` on model change — each in-memory `SessionInfo` now carries `meta.currentModel`; after `session/set_config_option` with `configId=model` a `SessionInfoUpdate` notification with `meta.currentModel` is sent in addition to the existing `ConfigOptionUpdate`; same notification is sent after `session/set_session_model` (closes #2435)
 - feat(tools): adversarial policy agent — LLM-based pre-execution tool call validation against plain-language policies; configurable fail-closed/fail-open behavior (`fail_open = false` default); prompt injection hardening via code-fence param quoting; strict allow/deny response parsing; full `ToolExecutor` trait delegation; audit log `adversarial_policy_decision` field; executor chain order `PolicyGateExecutor → AdversarialPolicyGateExecutor → TrustGateExecutor`; gated on `policy-enforcer` feature; config `[tools.adversarial_policy]` (closes #2447)
 - feat(memory): Memex tool output archive — before compaction, `ToolOutput` bodies in the compaction range are saved to `tool_overflow` with `archive_type = 'archive'`; archived UUIDs are appended as a postfix after LLM summarization so references survive compaction; controlled by `[memory.compression] archive_tool_outputs = false`; archives are excluded from the short-lived cleanup job via `archive_type` column (migration 054, closes #2432)
diff --git a/crates/zeph-config/src/providers.rs b/crates/zeph-config/src/providers.rs
@@ -565,6 +565,18 @@ pub struct BanditConfig {
     /// Do not place it in world-writable directories.
     #[serde(default)]
     pub state_path: Option<String>,
+
+    /// MAR (Memory-Augmented Routing) confidence threshold.
+    ///
+    /// When the top-1 semantic recall score for the current query is >= this value,
+    /// the bandit biases toward cheaper providers (the answer is likely in memory).
+    /// At 1.0 MAR is effectively disabled: only a recall score >= 1.0 still triggers it. Default: 0.9.
+    #[serde(default = "default_bandit_memory_confidence_threshold")]
+    pub memory_confidence_threshold: f32,
+}
+
+fn default_bandit_memory_confidence_threshold() -> f32 {
+    0.9
 }
 
 impl Default for BanditConfig {
@@ -578,6 +590,7 @@ impl Default for BanditConfig {
             embedding_timeout_ms: default_bandit_embedding_timeout_ms(),
             cache_size: default_bandit_cache_size(),
             state_path: None,
+            memory_confidence_threshold: default_bandit_memory_confidence_threshold(),
         }
     }
 }
diff --git a/crates/zeph-core/src/agent/context/assembly.rs b/crates/zeph-core/src/agent/context/assembly.rs
@@ -237,14 +237,15 @@ impl<C: Channel> Agent<C> {
     ) -> Result<(), super::super::error::AgentError> {
         self.remove_recall_messages();
 
-        if let Some(msg) = Self::fetch_semantic_recall(
+        let (msg, _score) = Self::fetch_semantic_recall(
             &self.memory_state,
             query,
             token_budget,
             &self.metrics.token_counter,
             None,
         )
-        .await?
+        .await?;
+        if let Some(msg) = msg
             && self.msg.messages.len() > 1
         {
             self.msg.messages.insert(1, msg);
@@ -259,12 +260,12 @@ impl<C: Channel> Agent<C> {
         token_budget: usize,
         tc: &TokenCounter,
         router: Option<&dyn zeph_memory::MemoryRouter>,
-    ) -> Result<Option<Message>, super::super::error::AgentError> {
+    ) -> Result<(Option<Message>, Option<f32>), super::super::error::AgentError> {
         let Some(memory) = &memory_state.memory else {
-            return Ok(None);
+            return Ok((None, None));
         };
         if memory_state.recall_limit == 0 || token_budget == 0 {
-            return Ok(None);
+            return Ok((None, None));
         }
 
         let recalled = if let Some(r) = router {
@@ -277,9 +278,11 @@ impl<C: Channel> Agent<C> {
                 .await?
         };
         if recalled.is_empty() {
-            return Ok(None);
+            return Ok((None, None));
         }
 
+        let top_score = recalled.first().map(|r| r.score);
+
         let mut recall_text = String::with_capacity(token_budget * 3);
         recall_text.push_str(RECALL_PREFIX);
         let mut tokens_used = tc.count_tokens(&recall_text);
@@ -300,12 +303,15 @@ impl<C: Channel> Agent<C> {
         }
 
         if tokens_used > tc.count_tokens(RECALL_PREFIX) {
-            Ok(Some(Message::from_parts(
-                Role::System,
-                vec![MessagePart::Recall { text: recall_text }],
-            )))
+            Ok((
+                Some(Message::from_parts(
+                    Role::System,
+                    vec![MessagePart::Recall { text: recall_text }],
+                )),
+                top_score,
+            ))
         } else {
-            Ok(None)
+            Ok((None, None))
         }
     }
 
@@ -852,6 +858,7 @@ impl<C: Channel> Agent<C> {
         let mut summaries_msg: Option<Message> = None;
         let mut cross_session_msg: Option<Message> = None;
         let mut recall_msg: Option<Message> = None;
+        let mut recall_confidence: Option<f32> = None;
         let mut doc_rag_msg: Option<Message> = None;
         let mut corrections_msg: Option<Message> = None;
         let mut code_rag_text: Option<String> = None;
@@ -894,7 +901,7 @@ impl<C: Channel> Agent<C> {
                     Some(&router),
                 )
                 .await
-                .map(ContextSlot::SemanticRecall)
+                .map(|(msg, score)| ContextSlot::SemanticRecall(msg, score))
             }));
             fetchers.push(Box::pin(async {
                 Self::fetch_document_rag(memory_state, &query, alloc.semantic_recall, &tc)
@@ -922,7 +929,10 @@ impl<C: Channel> Agent<C> {
                     Ok(slot) => match slot {
                         ContextSlot::Summaries(msg) => summaries_msg = msg,
                         ContextSlot::CrossSession(msg) => cross_session_msg = msg,
-                        ContextSlot::SemanticRecall(msg) => recall_msg = msg,
+                        ContextSlot::SemanticRecall(msg, score) => {
+                            recall_msg = msg;
+                            recall_confidence = score;
+                        }
                         ContextSlot::DocumentRag(msg) => doc_rag_msg = msg,
                         ContextSlot::Corrections(msg) => corrections_msg = msg,
                         ContextSlot::CodeContext(text) => code_rag_text = text,
@@ -938,6 +948,9 @@ impl<C: Channel> Agent<C> {
             }
         }
 
+        // Store top-1 recall score on agent state for MAR routing signal.
+        self.memory_state.last_recall_confidence = recall_confidence;
+
         // MemoryFirst: drain conversation history BEFORE inserting memory messages so that the
         // memory inserts land into the shorter array and are not accidentally removed.
         if memory_first {
diff --git a/crates/zeph-core/src/agent/context/mod.rs b/crates/zeph-core/src/agent/context/mod.rs
@@ -93,7 +93,8 @@ pub(super) enum CompactionOutcome {
 pub(super) enum ContextSlot {
     Summaries(Option<Message>),
     CrossSession(Option<Message>),
-    SemanticRecall(Option<Message>),
+    /// Semantic recall result. Carries the formatted message and the top-1 similarity score.
+    SemanticRecall(Option<Message>, Option<f32>),
     DocumentRag(Option<Message>),
     Corrections(Option<Message>),
     CodeContext(Option<String>),
diff --git a/crates/zeph-core/src/agent/context/tests.rs b/crates/zeph-core/src/agent/context/tests.rs
@@ -3179,6 +3179,7 @@ fn make_mem_state(
         shutdown_summary_max_messages: 20,
         shutdown_summary_timeout_secs: 10,
         structured_summaries: false,
+        last_recall_confidence: None,
         digest_config: crate::config::DigestConfig::default(),
         cached_session_digest: None,
         context_strategy: crate::config::ContextStrategy::default(),
diff --git a/crates/zeph-core/src/agent/mod.rs b/crates/zeph-core/src/agent/mod.rs
@@ -320,6 +320,7 @@ impl<C: Channel> Agent<C> {
                 shutdown_summary_max_messages: 20,
                 shutdown_summary_timeout_secs: 10,
                 structured_summaries: false,
+                last_recall_confidence: None,
                 digest_config: crate::config::DigestConfig::default(),
                 cached_session_digest: None,
                 context_strategy: crate::config::ContextStrategy::default(),
@@ -3771,6 +3772,10 @@ impl<C: Channel> Agent<C> {
             tracing::warn!("context preparation failed: {e:#}");
         }
 
+        // MAR: propagate top-1 recall confidence to the router for cost-aware routing.
+        self.provider
+            .set_memory_confidence(self.memory_state.last_recall_confidence);
+
         self.learning_engine.reset_reflection();
 
         let mut all_image_parts = std::mem::take(&mut self.msg.pending_image_parts);
diff --git a/crates/zeph-core/src/agent/state/mod.rs b/crates/zeph-core/src/agent/state/mod.rs
@@ -55,6 +55,10 @@ pub(crate) struct MemoryState {
     /// When `true`, hard compaction uses `AnchoredSummary` (structured JSON) instead of
     /// free-form prose. Falls back to prose on any LLM or validation failure.
     pub(crate) structured_summaries: bool,
+    /// Top-1 semantic recall score from the most recent `prepare_context` cycle.
+    /// Used by MAR (Memory-Augmented Routing) to bias the bandit toward cheap providers
+    /// when memory confidence is high. Overwritten whenever context is assembled; `None` if no recall.
+    pub(crate) last_recall_confidence: Option<f32>,
     /// Session digest configuration (#2289).
     pub(crate) digest_config: crate::config::DigestConfig,
     /// Cached session digest text and its token count, loaded at session start.
diff --git a/crates/zeph-core/src/bootstrap/provider.rs b/crates/zeph-core/src/bootstrap/provider.rs
@@ -535,11 +535,12 @@ fn create_provider_from_pool(config: &Config) -> Result<AnyProvider, BootstrapEr
             let router_bandit_cfg = BanditRouterConfig {
                 alpha: bandit_cfg.alpha,
                 dim: bandit_cfg.dim,
-                cost_weight: bandit_cfg.cost_weight,
+                cost_weight: bandit_cfg.cost_weight.clamp(0.0, 1.0),
                 decay_factor: bandit_cfg.decay_factor,
                 warmup_queries: 0, // computed by with_bandit() from provider count
                 embedding_timeout_ms: bandit_cfg.embedding_timeout_ms,
                 cache_size: bandit_cfg.cache_size,
+                memory_confidence_threshold: bandit_cfg.memory_confidence_threshold.clamp(0.0, 1.0),
             };
             // Resolve embedding provider for feature vectors.
             let embed_provider = if bandit_cfg.embedding_provider.is_empty() {
diff --git a/crates/zeph-llm/src/any.rs b/crates/zeph-llm/src/any.rs
@@ -56,6 +56,16 @@ pub enum AnyProvider {
 }
 
 impl AnyProvider {
+    /// Set the MAR memory recall confidence for the current turn.
+    ///
+    /// Delegates to [`RouterProvider::set_memory_confidence`] when `self` is the
+    /// `AnyProvider::Router` variant. No-op for every other variant.
+    pub fn set_memory_confidence(&self, confidence: Option<f32>) {
+        if let AnyProvider::Router(r) = self {
+            r.set_memory_confidence(confidence);
+        }
+    }
+
     /// Return a cloneable closure that calls `embed()` on this provider.
     pub fn embed_fn(&self) -> impl Fn(&str) -> crate::provider::EmbedFuture + Send + Sync + use<> {
         let provider = std::sync::Arc::new(self.clone());
@@ -296,6 +306,10 @@ impl LlmProvider for AnyProvider {
         delegate_provider!(self, |p| p.name())
     }
 
+    fn model_identifier(&self) -> &str {
+        delegate_provider!(self, |p| p.model_identifier())
+    }
+
     fn supports_structured_output(&self) -> bool {
         delegate_provider!(self, |p| p.supports_structured_output())
     }
diff --git a/crates/zeph-llm/src/claude/mod.rs b/crates/zeph-llm/src/claude/mod.rs
@@ -814,6 +814,10 @@ impl LlmProvider for ClaudeProvider {
         "claude"
     }
 
+    fn model_identifier(&self) -> &str {
+        &self.model
+    }
+
     fn supports_structured_output(&self) -> bool {
         true
     }
diff --git a/crates/zeph-llm/src/compatible.rs b/crates/zeph-llm/src/compatible.rs
@@ -105,6 +105,10 @@ impl LlmProvider for CompatibleProvider {
         &self.provider_name
     }
 
+    fn model_identifier(&self) -> &str {
+        self.inner.model_identifier()
+    }
+
     fn list_models(&self) -> Vec<String> {
         self.inner.list_models()
     }
diff --git a/crates/zeph-llm/src/ollama.rs b/crates/zeph-llm/src/ollama.rs
@@ -404,6 +404,10 @@ impl LlmProvider for OllamaProvider {
         "ollama"
     }
 
+    fn model_identifier(&self) -> &str {
+        &self.model
+    }
+
     fn last_usage(&self) -> Option<(u64, u64)> {
         self.usage.last_usage()
     }
diff --git a/crates/zeph-llm/src/openai/mod.rs b/crates/zeph-llm/src/openai/mod.rs
@@ -452,6 +452,10 @@ impl LlmProvider for OpenAiProvider {
         "openai"
     }
 
+    fn model_identifier(&self) -> &str {
+        &self.model
+    }
+
     fn list_models(&self) -> Vec<String> {
         vec![self.model.clone()]
     }
diff --git a/crates/zeph-llm/src/provider.rs b/crates/zeph-llm/src/provider.rs
@@ -503,6 +503,13 @@ pub trait LlmProvider: Send + Sync {
     /// Provider name for logging and identification.
     fn name(&self) -> &str;
 
+    /// Model identifier string (e.g. `gpt-4o-mini`, `claude-sonnet-4-6`).
+    /// Used by cost-estimation heuristics. Returns `""` when not applicable.
+    #[allow(clippy::unnecessary_literal_bound)]
+    fn model_identifier(&self) -> &str {
+        ""
+    }
+
     /// Whether this provider supports image input (vision).
     fn supports_vision(&self) -> bool {
         false
diff --git a/crates/zeph-llm/src/router/bandit.rs b/crates/zeph-llm/src/router/bandit.rs
diff --git a/crates/zeph-llm/src/router/mod.rs b/crates/zeph-llm/src/router/mod.rs

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -814,6 +814,10 @@ impl LlmProvider for ClaudeProvider {` |
| `814` | `814` | `"claude"` |
| `815` | `815` | `}` |
| `816` | `816` | |
| | `817` | `+ fn model_identifier(&self) -> &str {` |
| | `818` | `+ &self.model` |
| | `819` | `+ }` |
| | `820` | `+` |
| `817` | `821` | `fn supports_structured_output(&self) -> bool {` |
| `818` | `822` | `true` |