Skip to content

Commit 2fd6f13

Browse files
committed
feat: add skill trust levels and quarantine system (M25)
Implement 4-tier trust model (Trusted/Verified/Quarantined/Blocked) for skills with runtime enforcement via TrustGateExecutor, anomaly detection, blake3 integrity verification, and quarantine prompt wrapping. Quarantined skills are denied bash, file_write, and web_scrape tools. Self-learning is disabled for skills below Verified trust level. Hot-reload hash mismatch auto-downgrades to Quarantined. Closes #419, closes #420, closes #421, closes #422
1 parent 156960a commit 2fd6f13

File tree

19 files changed

+1354
-20
lines changed

19 files changed

+1354
-20
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
77
## [Unreleased]
88

99
### Added
10+
- Skill trust levels: 4-tier model (Trusted, Verified, Quarantined, Blocked) with per-turn enforcement
11+
- `TrustGateExecutor` wrapping tool execution with trust-level permission checks
12+
- `AnomalyDetector` with sliding-window threshold counters for quarantined skill monitoring
13+
- blake3 content hashing for skill integrity verification on load and hot-reload
14+
- Quarantine prompt wrapping for structural isolation of untrusted skill bodies
15+
- Self-learning gate: skills with trust < Verified skip auto-improvement
16+
- `skill_trust` SQLite table with migration 009
17+
- CLI commands: `/skill trust`, `/skill block`, `/skill unblock`
18+
- `[skills.trust]` config section (default_level, local_level, hash_mismatch_level)
1019
- `ProviderKind` enum for type-safe provider selection in config
1120
- `RuntimeConfig` struct grouping agent runtime fields
1221
- `AnyProvider::embed_fn()` shared embedding closure helper

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/default.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,14 @@ max_versions = 10
100100
# Cooldown between improvements for same skill (minutes)
101101
cooldown_minutes = 60
102102

103+
[skills.trust]
104+
# Default trust level for newly discovered skills: trusted, verified, quarantined, blocked
105+
default_level = "quarantined"
106+
# Trust level assigned to local (built-in) skills
107+
local_level = "trusted"
108+
# Trust level after blake3 hash mismatch on hot-reload
109+
hash_mismatch_level = "quarantined"
110+
103111
[memory]
104112
# SQLite database path for conversation history
105113
sqlite_path = "./data/zeph.db"

crates/zeph-core/src/agent/context.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -696,7 +696,8 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
696696
.cloned()
697697
.collect();
698698

699-
let skills_prompt = format_skills_prompt(&active_skills, std::env::consts::OS);
699+
let trust_map = self.build_skill_trust_map().await;
700+
let skills_prompt = format_skills_prompt(&active_skills, std::env::consts::OS, &trust_map);
700701
let catalog_prompt = format_skills_catalog(&remaining_skills);
701702
self.skill_state
702703
.last_skills_prompt

crates/zeph-core/src/agent/learning.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,17 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
1010
self.learning_config.as_ref().is_some_and(|c| c.enabled)
1111
}
1212

13+
/// Decide whether `skill_name` is allowed to take part in self-learning.
///
/// Returns `true` when:
/// * no memory backend is configured (there is no trust store to consult), or
/// * the trust store has no record for the skill (no trust record = local
///   skill = trusted), or
/// * the stored trust level is `"trusted"` or `"verified"`.
///
/// Returns `false` for any other stored level, and also when the trust
/// lookup itself fails: a gate that cannot read the trust store must not
/// silently treat the skill as trusted.
#[cfg(feature = "self-learning")]
async fn is_skill_trusted_for_learning(&self, skill_name: &str) -> bool {
    let Some(memory) = &self.memory_state.memory else {
        return true;
    };
    match memory.sqlite().load_skill_trust(skill_name).await {
        Ok(Some(row)) => matches!(row.trust_level.as_str(), "trusted" | "verified"),
        // no trust record = local skill = trusted
        Ok(None) => true,
        Err(err) => {
            // Fail closed: a storage error is not evidence of trust.
            tracing::warn!(
                skill = skill_name,
                error = %err,
                "failed to load skill trust; skipping self-learning for this skill"
            );
            false
        }
    }
}
23+
1324
#[cfg(not(feature = "self-learning"))]
1425
#[allow(dead_code, clippy::unused_self)]
1526
pub(super) fn is_learning_enabled(&self) -> bool {
@@ -66,6 +77,10 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
6677
return Ok(false);
6778
};
6879

80+
if !self.is_skill_trusted_for_learning(&name).await {
81+
return Ok(false);
82+
}
83+
6984
let Ok(skill) = self.skill_state.registry.get_skill(&name) else {
7085
return Ok(false);
7186
};
@@ -117,6 +132,9 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
117132
if !self.is_learning_enabled() {
118133
return Ok(());
119134
}
135+
if !self.is_skill_trusted_for_learning(skill_name).await {
136+
return Ok(());
137+
}
120138

121139
let Some(memory) = &self.memory_state.memory else {
122140
return Ok(());
@@ -378,9 +396,12 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
378396
}
379397
Some("approve") => self.handle_skill_approve(parts.get(1).copied()).await,
380398
Some("reset") => self.handle_skill_reset(parts.get(1).copied()).await,
399+
Some("trust") => self.handle_skill_trust_command(&parts[1..]).await,
400+
Some("block") => self.handle_skill_block(parts.get(1).copied()).await,
401+
Some("unblock") => self.handle_skill_unblock(parts.get(1).copied()).await,
381402
_ => {
382403
self.channel
383-
.send("Unknown /skill subcommand. Available: stats, versions, activate, approve, reset")
404+
.send("Unknown /skill subcommand. Available: stats, versions, activate, approve, reset, trust, block, unblock")
384405
.await?;
385406
Ok(())
386407
}
@@ -390,12 +411,20 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
390411
#[cfg(not(feature = "self-learning"))]
391412
pub(super) async fn handle_skill_command(
392413
&mut self,
393-
_args: &str,
414+
args: &str,
394415
) -> Result<(), super::error::AgentError> {
395-
self.channel
396-
.send("Self-learning feature is not enabled.")
397-
.await?;
398-
Ok(())
416+
let parts: Vec<&str> = args.split_whitespace().collect();
417+
match parts.first().copied() {
418+
Some("trust") => self.handle_skill_trust_command(&parts[1..]).await,
419+
Some("block") => self.handle_skill_block(parts.get(1).copied()).await,
420+
Some("unblock") => self.handle_skill_unblock(parts.get(1).copied()).await,
421+
_ => {
422+
self.channel
423+
.send("Available /skill subcommands: trust, block, unblock")
424+
.await?;
425+
Ok(())
426+
}
427+
}
399428
}
400429

401430
#[cfg(feature = "self-learning")]

crates/zeph-core/src/agent/mod.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ mod learning;
77
mod mcp;
88
mod persistence;
99
mod streaming;
10+
mod trust_commands;
1011

1112
use std::collections::VecDeque;
1213
use std::path::PathBuf;
@@ -16,6 +17,7 @@ use tokio::sync::{mpsc, watch};
1617
use zeph_llm::provider::{LlmProvider, Message, Role};
1718

1819
use crate::metrics::MetricsSnapshot;
20+
use std::collections::HashMap;
1921
use zeph_memory::semantic::SemanticMemory;
2022
use zeph_skills::loader::Skill;
2123
use zeph_skills::matcher::{SkillMatcher, SkillMatcherBackend};
@@ -149,7 +151,8 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
149151
.iter()
150152
.filter_map(|m| registry.get_skill(&m.name).ok())
151153
.collect();
152-
let skills_prompt = format_skills_prompt(&all_skills, std::env::consts::OS);
154+
let empty_trust = HashMap::new();
155+
let skills_prompt = format_skills_prompt(&all_skills, std::env::consts::OS, &empty_trust);
153156
let system_prompt = build_system_prompt(&skills_prompt, None, None, false);
154157
tracing::debug!(len = system_prompt.len(), "initial system prompt built");
155158
tracing::trace!(prompt = %system_prompt, "full system prompt");
@@ -657,7 +660,18 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
657660
let mut output = String::from("Available skills:\n\n");
658661

659662
for meta in self.skill_state.registry.all_meta() {
660-
let _ = writeln!(output, "- {} — {}", meta.name, meta.description);
663+
let trust_info = if let Some(memory) = &self.memory_state.memory {
664+
memory
665+
.sqlite()
666+
.load_skill_trust(&meta.name)
667+
.await
668+
.ok()
669+
.flatten()
670+
.map_or_else(String::new, |r| format!(" [{}]", r.trust_level))
671+
} else {
672+
String::new()
673+
};
674+
let _ = writeln!(output, "- {} — {}{trust_info}", meta.name, meta.description);
661675
}
662676

663677
if let Some(memory) = &self.memory_state.memory {
@@ -777,7 +791,8 @@ impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C,
777791
.iter()
778792
.filter_map(|m| self.skill_state.registry.get_skill(&m.name).ok())
779793
.collect();
780-
let skills_prompt = format_skills_prompt(&all_skills, std::env::consts::OS);
794+
let trust_map = self.build_skill_trust_map().await;
795+
let skills_prompt = format_skills_prompt(&all_skills, std::env::consts::OS, &trust_map);
781796
self.skill_state
782797
.last_skills_prompt
783798
.clone_from(&skills_prompt);
crates/zeph-core/src/agent/trust_commands.rs

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
use std::collections::HashMap;
2+
use std::fmt::Write;
3+
4+
use zeph_skills::TrustLevel;
5+
6+
use super::{Agent, Channel, LlmProvider, ToolExecutor};
7+
8+
impl<P: LlmProvider + Clone + 'static, C: Channel, T: ToolExecutor> Agent<P, C, T> {
9+
/// Handle `/skill trust [name [level]]`.
10+
pub(super) async fn handle_skill_trust_command(
11+
&mut self,
12+
args: &[&str],
13+
) -> Result<(), super::error::AgentError> {
14+
let Some(memory) = &self.memory_state.memory else {
15+
self.channel.send("Memory not available.").await?;
16+
return Ok(());
17+
};
18+
19+
match args.first().copied() {
20+
None => {
21+
// List all trust levels
22+
let rows = memory.sqlite().load_all_skill_trust().await?;
23+
if rows.is_empty() {
24+
self.channel.send("No skill trust data recorded.").await?;
25+
return Ok(());
26+
}
27+
let mut output = String::from("Skill trust levels:\n\n");
28+
for row in &rows {
29+
let _ = writeln!(
30+
output,
31+
"- {} [{}] (source: {}, hash: {}..)",
32+
row.skill_name,
33+
row.trust_level,
34+
row.source_kind,
35+
&row.blake3_hash[..row.blake3_hash.len().min(8)]
36+
);
37+
}
38+
self.channel.send(&output).await?;
39+
}
40+
Some(name) => {
41+
if let Some(level_str) = args.get(1).copied() {
42+
// Set trust level
43+
let level = match level_str {
44+
"trusted" => TrustLevel::Trusted,
45+
"verified" => TrustLevel::Verified,
46+
"quarantined" => TrustLevel::Quarantined,
47+
"blocked" => TrustLevel::Blocked,
48+
_ => {
49+
self.channel
50+
.send("Invalid trust level. Use: trusted, verified, quarantined, blocked")
51+
.await?;
52+
return Ok(());
53+
}
54+
};
55+
let updated = memory
56+
.sqlite()
57+
.set_skill_trust_level(name, &level.to_string())
58+
.await?;
59+
if updated {
60+
self.channel
61+
.send(&format!("Trust level for \"{name}\" set to {level}."))
62+
.await?;
63+
} else {
64+
self.channel
65+
.send(&format!("Skill \"{name}\" not found in trust database."))
66+
.await?;
67+
}
68+
} else {
69+
// Show single skill trust
70+
let row = memory.sqlite().load_skill_trust(name).await?;
71+
match row {
72+
Some(r) => {
73+
self.channel
74+
.send(&format!(
75+
"{}: level={}, source={}, hash={}",
76+
r.skill_name, r.trust_level, r.source_kind, r.blake3_hash
77+
))
78+
.await?;
79+
}
80+
None => {
81+
self.channel
82+
.send(&format!("No trust data for \"{name}\"."))
83+
.await?;
84+
}
85+
}
86+
}
87+
}
88+
}
89+
Ok(())
90+
}
91+
92+
/// Handle `/skill block <name>`.
93+
pub(super) async fn handle_skill_block(
94+
&mut self,
95+
name: Option<&str>,
96+
) -> Result<(), super::error::AgentError> {
97+
let Some(name) = name else {
98+
self.channel.send("Usage: /skill block <name>").await?;
99+
return Ok(());
100+
};
101+
let Some(memory) = &self.memory_state.memory else {
102+
self.channel.send("Memory not available.").await?;
103+
return Ok(());
104+
};
105+
let updated = memory
106+
.sqlite()
107+
.set_skill_trust_level(name, "blocked")
108+
.await?;
109+
if updated {
110+
self.channel
111+
.send(&format!("Skill \"{name}\" blocked."))
112+
.await?;
113+
} else {
114+
self.channel
115+
.send(&format!("Skill \"{name}\" not found in trust database."))
116+
.await?;
117+
}
118+
Ok(())
119+
}
120+
121+
/// Handle `/skill unblock <name>`.
122+
pub(super) async fn handle_skill_unblock(
123+
&mut self,
124+
name: Option<&str>,
125+
) -> Result<(), super::error::AgentError> {
126+
let Some(name) = name else {
127+
self.channel.send("Usage: /skill unblock <name>").await?;
128+
return Ok(());
129+
};
130+
let Some(memory) = &self.memory_state.memory else {
131+
self.channel.send("Memory not available.").await?;
132+
return Ok(());
133+
};
134+
let updated = memory
135+
.sqlite()
136+
.set_skill_trust_level(name, "quarantined")
137+
.await?;
138+
if updated {
139+
self.channel
140+
.send(&format!("Skill \"{name}\" unblocked (set to quarantined)."))
141+
.await?;
142+
} else {
143+
self.channel
144+
.send(&format!("Skill \"{name}\" not found in trust database."))
145+
.await?;
146+
}
147+
Ok(())
148+
}
149+
150+
pub(super) async fn build_skill_trust_map(&self) -> HashMap<String, TrustLevel> {
151+
let Some(memory) = &self.memory_state.memory else {
152+
return HashMap::new();
153+
};
154+
let Ok(rows) = memory.sqlite().load_all_skill_trust().await else {
155+
return HashMap::new();
156+
};
157+
rows.into_iter()
158+
.filter_map(|r| {
159+
let level = match r.trust_level.as_str() {
160+
"trusted" => TrustLevel::Trusted,
161+
"verified" => TrustLevel::Verified,
162+
"quarantined" => TrustLevel::Quarantined,
163+
"blocked" => TrustLevel::Blocked,
164+
_ => return None,
165+
};
166+
Some((r.skill_name, level))
167+
})
168+
.collect()
169+
}
170+
}

0 commit comments

Comments
 (0)