Skip to content

Commit 23a323a

Browse files
feat: implement behavioral anomaly detection in RingBreachDetector
1 parent 53adea3 commit 23a323a

File tree

2 files changed

+296
-33
lines changed

2 files changed

+296
-33
lines changed

packages/agent-hypervisor/src/hypervisor/rings/breach_detector.py

Lines changed: 139 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,26 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT License.
3-
# Community Edition — basic implementation
43
"""
5-
Ring Breach Detector — stub implementation.
4+
Ring Breach Detector — behavioral anomaly detection for rogue agents.
65
7-
Community edition: breach detection is not available.
8-
All methods return safe defaults.
6+
Detects two classes of anomaly:
7+
8+
1. **Tool-call frequency spikes** — an agent's call rate inside a sliding
9+
window exceeds a configurable baseline by a severity-dependent multiplier.
10+
2. **Privilege-escalation attempts** — a low-privilege agent (Ring 3)
11+
repeatedly calls into higher-privilege rings (Ring 0/1). The *ring
12+
distance* amplifies the anomaly score so that sandbox→root jumps are
13+
scored more aggressively than standard→privileged ones.
14+
15+
When a HIGH or CRITICAL breach is detected the internal circuit-breaker
16+
trips and ``is_breaker_tripped()`` returns ``True`` until explicitly reset
17+
via ``reset_breaker()``.
918
"""
1019

1120
from __future__ import annotations
1221

22+
import time
23+
from collections import deque
1324
from dataclasses import dataclass, field
1425
from datetime import UTC, datetime
1526
from enum import Enum
@@ -25,6 +36,15 @@ class BreachSeverity(str, Enum):
2536
CRITICAL = "critical"
2637

2738

39+
# Anomaly-score cut-offs paired with the severity they map to. The score is
# derived from actual_rate / baseline_rate. Entries are listed from the
# highest cut-off down because the lookup stops at the first cut-off the
# score reaches; keep the descending order when editing.
_SEVERITY_THRESHOLDS: list[tuple[float, BreachSeverity]] = [
    (20.0, BreachSeverity.CRITICAL),
    (10.0, BreachSeverity.HIGH),
    (5.0, BreachSeverity.MEDIUM),
    (2.0, BreachSeverity.LOW),
]
46+
47+
2848
@dataclass
2949
class BreachEvent:
3050
"""A detected ring breach anomaly."""
@@ -40,12 +60,45 @@ class BreachEvent:
4060
details: str = ""
4161

4262

43-
class RingBreachDetector:
44-
"""Breach detector stub (community edition: no detection)."""
63+
def _agent_key(agent_did: str, session_id: str) -> str:
64+
"""Internal composite key for per-agent tracking."""
65+
return f"{agent_did}::{session_id}"
4566

46-
def __init__(self, window_seconds: int = 60) -> None:
47-
self._breach_history: list[BreachEvent] = []
67+
68+
class RingBreachDetector:
69+
"""Behavioural anomaly detector for rogue-agent ring abuse.
70+
71+
Parameters
72+
----------
73+
window_seconds:
74+
Sliding window (in seconds) over which call rates are measured.
75+
baseline_rate:
76+
Expected calls-per-second within the window. Rates above multiples
77+
of this value trigger breach events of increasing severity.
78+
max_events_per_agent:
79+
Maximum call timestamps retained per agent (bounded ``deque``).
80+
"""
81+
82+
def __init__(
83+
self,
84+
window_seconds: int = 60,
85+
baseline_rate: float = 10.0,
86+
max_events_per_agent: int = 1_000,
87+
) -> None:
4888
self.window_seconds = window_seconds
89+
self.baseline_rate = baseline_rate
90+
self.max_events_per_agent = max_events_per_agent
91+
92+
# Per-agent sliding-window timestamps
93+
self._call_windows: dict[str, deque[float]] = {}
94+
# Per-agent circuit-breaker flag
95+
self._tripped: dict[str, bool] = {}
96+
# Global breach history
97+
self._breach_history: list[BreachEvent] = []
98+
99+
# ------------------------------------------------------------------
100+
# Public API
101+
# ------------------------------------------------------------------
49102

50103
def record_call(
    self,
    agent_did: str,
    session_id: str,
    agent_ring: ExecutionRing,
    called_ring: ExecutionRing,
) -> BreachEvent | None:
    """Record a ring call and return a ``BreachEvent`` if anomalous.

    Parameters
    ----------
    agent_did:
        Identifier of the calling agent.
    session_id:
        Session the call belongs to; tracking is per (agent, session).
    agent_ring:
        Ring the agent runs in (converted with ``int()``).
    called_ring:
        Ring being called into (converted with ``int()``).

    Returns
    -------
    BreachEvent | None
        ``None`` when the call is within normal parameters, otherwise the
        event that was also appended to the breach history. A HIGH or
        CRITICAL event additionally trips the circuit-breaker for this
        agent/session.
    """
    key = _agent_key(agent_did, session_id)
    now = time.monotonic()

    # --- 1. Track timestamp in bounded deque ---
    window = self._call_windows.get(key)
    if window is None:
        window = deque(maxlen=self.max_events_per_agent)
        self._call_windows[key] = window
    window.append(now)

    # --- 2. Prune timestamps outside the sliding window ---
    cutoff = now - self.window_seconds
    while window and window[0] < cutoff:
        window.popleft()

    # --- 3. Compute actual rate (calls / second) ---
    call_count = len(window)
    if self.window_seconds > 0:
        actual_rate = call_count / self.window_seconds
        # A full deque silently drops its oldest entries, so ``call_count``
        # cannot exceed ``max_events_per_agent`` and the window-based rate
        # is capped at maxlen / window_seconds. With the default settings
        # that cap sits below every severity threshold, so a flood would
        # never be reported. When the deque is full, measure the rate over
        # the time span the retained timestamps actually cover instead.
        if call_count > 1 and call_count == window.maxlen:
            span = now - window[0]
            # span can be 0 on a coarse clock; keep the window-based rate
            # then rather than dividing by zero.
            if span > 0:
                actual_rate = max(actual_rate, (call_count - 1) / span)
    else:
        actual_rate = 0.0

    # --- 4. Ring-distance amplifier ---
    # Upward calls (low value = higher privilege) are escalations.
    # ExecutionRing values: 0=root, 1=priv, 2=std, 3=sandbox.
    # ring_distance > 0 means privilege escalation.
    ring_distance = int(agent_ring) - int(called_ring)
    amplifier = max(ring_distance, 1)  # at least 1× (no reduction)

    # --- 5. Score = (actual / baseline) × amplifier ---
    if self.baseline_rate <= 0:
        # A non-positive baseline disables rate scoring.
        ratio = 0.0
    else:
        ratio = actual_rate / self.baseline_rate
    anomaly_score = ratio * amplifier

    # --- 6. Map score → severity ---
    # Thresholds are ordered highest first, so the first match wins.
    severity = BreachSeverity.NONE
    for threshold, sev in _SEVERITY_THRESHOLDS:
        if anomaly_score >= threshold:
            severity = sev
            break

    if severity == BreachSeverity.NONE:
        return None

    # --- 7. Build event ---
    # ``call_count_window`` is the number of retained timestamps, which is
    # bounded by ``max_events_per_agent``.
    event = BreachEvent(
        agent_did=agent_did,
        session_id=session_id,
        severity=severity,
        anomaly_score=round(anomaly_score, 4),
        call_count_window=call_count,
        expected_rate=self.baseline_rate,
        actual_rate=round(actual_rate, 4),
        details=(
            f"rate={actual_rate:.2f}/s (baseline={self.baseline_rate:.2f}/s), "
            f"ring_distance={ring_distance}, amplifier={amplifier}×, "
            f"score={anomaly_score:.2f}"
        ),
    )
    self._breach_history.append(event)

    # --- 8. Trip circuit-breaker on HIGH / CRITICAL ---
    if severity in (BreachSeverity.HIGH, BreachSeverity.CRITICAL):
        self._tripped[key] = True

    return event
59178

60179
def is_breaker_tripped(self, agent_did: str, session_id: str) -> bool:
    """Report whether the circuit-breaker is currently tripped for this agent/session."""
    breaker_key = _agent_key(agent_did, session_id)
    # A pair with no stored flag counts as not tripped.
    return self._tripped.get(breaker_key, False)
62182

63183
def reset_breaker(self, agent_did: str, session_id: str) -> None:
    """Clear the breaker flag and the recorded call timestamps for this agent/session.

    Calling this for a pair that has no stored state is a no-op.
    """
    tracked_key = _agent_key(agent_did, session_id)
    # Drop the breaker flag first, then the sliding-window timestamps.
    for per_agent_state in (self._tripped, self._call_windows):
        per_agent_state.pop(tracked_key, None)
188+
189+
# ------------------------------------------------------------------
190+
# Read-only accessors
191+
# ------------------------------------------------------------------
65192

66193
@property
67194
def breach_history(self) -> list[BreachEvent]:

0 commit comments

Comments
 (0)