feat: proactive security hardening (P06-P11)

imran-siddique · Copilot · imran-siddique · commit 48e43a3ddf09 · 2026-03-17T20:55:59.000-07:00
- P06: HMAC peer integrity tracking in TrustBridge
- P07: AgentBehaviorMonitor for rogue agent detection
- P10: Circuit breaker on MCP tool invocations
- P11: PII sanitization in MCP error responses
- P09: SBOM generation script (CycloneDX + pip-audit)
- Added 16 negative security tests covering all new patterns
- Added sbom/ to .gitignore

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# Generated artifacts
+sbom/
+
 # Python
 __pycache__/
 *.py[cod]
diff --git a/packages/agent-mesh/src/agentmesh/integrations/mcp/__init__.py b/packages/agent-mesh/src/agentmesh/integrations/mcp/__init__.py
@@ -118,6 +118,11 @@ def __init__(
         self._verification_ttl = timedelta(minutes=10)
         self._max_verified_clients = 10_000
 
+        # P10: Circuit breaker — track consecutive failures per tool
+        self._tool_failures: Dict[str, int] = {}
+        self._circuit_breaker_threshold = 5  # open circuit after 5 consecutive failures
+        self._circuit_breaker_reset = timedelta(minutes=1)
+
     # P05: Maximum tool description length to prevent prompt injection via descriptions
     _MAX_DESCRIPTION_LENGTH = 1000
     # P12: Maximum total size of tool arguments (bytes when serialized)
@@ -312,6 +317,15 @@ async def invoke_tool(
 
         # Execute tool
         call.trust_verified = True
+
+        # P10: Circuit breaker — reject if tool has too many consecutive failures
+        fail_count = self._tool_failures.get(tool_name, 0)
+        if fail_count >= self._circuit_breaker_threshold:
+            call.error = f"Circuit breaker open: {tool_name} has {fail_count} consecutive failures"
+            call.completed_at = datetime.utcnow()
+            self._record_call(call)
+            return call
+
         try:
             # V12: Validate arguments against input_schema before dispatch
             allowed_keys = set(tool.input_schema.get("properties", {}).keys())
@@ -330,11 +344,15 @@ async def invoke_tool(
             call.result = result
             tool.total_calls += 1
             tool.last_called = datetime.utcnow()
+            self._tool_failures.pop(tool_name, None)  # reset on success
             logger.info(f"Tool {tool_name} invoked successfully by {caller_did}")
         except Exception as e:
-            call.error = str(e)
+            # P11: Sanitize exception — don't log full message (may contain PII)
+            error_type = type(e).__name__
+            call.error = f"{error_type}: {str(e)[:200]}"
             tool.failed_calls += 1
-            logger.error(f"Tool {tool_name} failed: {e}")
+            self._tool_failures[tool_name] = self._tool_failures.get(tool_name, 0) + 1
+            logger.error("Tool %s failed with %s (caller: %s)", tool_name, error_type, caller_did)
 
         call.completed_at = datetime.utcnow()
         self._record_call(call)
diff --git a/packages/agent-mesh/src/agentmesh/services/behavior_monitor.py b/packages/agent-mesh/src/agentmesh/services/behavior_monitor.py
@@ -0,0 +1,181 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+Agent Behavior Monitor
+======================
+
+Runtime anomaly detection and quarantine for rogue agent behavior.
+Tracks per-agent metrics and triggers alerts or quarantine when
+thresholds are breached.
+
+Monitored signals:
+  - Tool call frequency (burst detection)
+  - Consecutive tool call failures
+  - Capability escalation attempts (denied capability checks)
+  - Trust score manipulation attempts (planned; this module does not record this signal yet)
+
+Usage::
+
+    monitor = AgentBehaviorMonitor()
+    monitor.record_tool_call("did:mesh:abc", "sql_query", success=True)
+    # ... later ...
+    if monitor.is_quarantined("did:mesh:abc"):
+        raise PermissionError("Agent is quarantined")
+"""
+
+from __future__ import annotations
+
+import logging
+import threading
+from collections import defaultdict
+from dataclasses import dataclass, field
+from datetime import datetime, timedelta
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AgentMetrics:
+    """Rolling metrics for a single agent.
+
+    ``AgentBehaviorMonitor`` keeps one instance per agent DID and mutates
+    the counters in place as events are recorded.
+    """
+
+    # DID of the agent these metrics describe.
+    agent_did: str
+    # Number of tool calls recorded for the agent (successes and failures).
+    total_calls: int = 0
+    # Number of recorded tool calls that were reported as unsuccessful.
+    failed_calls: int = 0
+    # Failures in a row; reset to 0 by a successful call or a quarantine release.
+    consecutive_failures: int = 0
+    # Denied capability checks; reset to 0 when the agent is released.
+    capability_denials: int = 0
+    # Naive UTC timestamp of the latest recorded tool call; None until the first one.
+    last_activity: Optional[datetime] = None
+    # Quarantine state: flag, human-readable reason, and naive UTC start time.
+    quarantined: bool = False
+    quarantine_reason: Optional[str] = None
+    quarantined_at: Optional[datetime] = None
+    # Rolling window for burst detection
+    # (timestamps of recent calls; entries older than the burst window are pruned).
+    call_timestamps: list[datetime] = field(default_factory=list)
+
+
+class AgentBehaviorMonitor:
+    """Monitors agent behavior and quarantines anomalous agents.
+
+    Every public method holds one re-entrant lock for its whole duration, so
+    counter updates and quarantine transitions are atomic with respect to
+    each other when the monitor is shared between threads.
+
+    Timestamps are naive UTC values produced by ``datetime.utcnow()``.
+
+    Args:
+        burst_window_seconds: Time window for burst detection.
+        burst_threshold: Max calls in the burst window before quarantine.
+        consecutive_failure_threshold: Failures in a row before quarantine.
+        capability_denial_threshold: Denied capability checks before quarantine.
+        quarantine_duration: How long an auto-quarantine lasts.
+        max_tracked_agents: Evict the least recently active agent beyond
+            this limit (quarantined agents are evicted only as a last resort).
+    """
+
+    def __init__(
+        self,
+        burst_window_seconds: int = 60,
+        burst_threshold: int = 100,
+        consecutive_failure_threshold: int = 20,
+        capability_denial_threshold: int = 10,
+        quarantine_duration: timedelta = timedelta(minutes=15),
+        max_tracked_agents: int = 50_000,
+    ) -> None:
+        self._agents: dict[str, AgentMetrics] = {}
+        # Re-entrant because public methods keep the lock while calling
+        # helpers (_get_metrics, _quarantine, release_quarantine) that
+        # acquire it again.
+        self._lock = threading.RLock()
+        self._burst_window = timedelta(seconds=burst_window_seconds)
+        self._burst_threshold = burst_threshold
+        self._consecutive_failure_threshold = consecutive_failure_threshold
+        self._capability_denial_threshold = capability_denial_threshold
+        self._quarantine_duration = quarantine_duration
+        self._max_tracked = max_tracked_agents
+
+    def _evict_one(self) -> None:
+        """Drop the least recently active agent; caller must hold the lock.
+
+        Non-quarantined agents are evicted first so that filling the table
+        with fresh DIDs cannot silently lift an existing quarantine.
+        """
+        candidates = [did for did, m in self._agents.items() if not m.quarantined]
+        if not candidates:
+            candidates = list(self._agents)
+        oldest = min(
+            candidates,
+            key=lambda d: self._agents[d].last_activity or datetime.min,
+        )
+        del self._agents[oldest]
+
+    def _get_metrics(self, agent_did: str) -> AgentMetrics:
+        """Return the metrics record for ``agent_did``, creating it if needed."""
+        with self._lock:
+            metrics = self._agents.get(agent_did)
+            if metrics is None:
+                # Guard on a non-empty table: with max_tracked_agents <= 0
+                # there is nothing to evict and min() would raise.
+                if self._agents and len(self._agents) >= self._max_tracked:
+                    self._evict_one()
+                metrics = AgentMetrics(agent_did=agent_did)
+                self._agents[agent_did] = metrics
+            return metrics
+
+    def record_tool_call(
+        self,
+        agent_did: str,
+        tool_name: str,
+        *,
+        success: bool,
+    ) -> None:
+        """Record a tool invocation and check for anomalies.
+
+        Args:
+            agent_did: DID of the calling agent.
+            tool_name: Name of the invoked tool (currently not used in the checks).
+            success: Whether the invocation succeeded.
+        """
+        with self._lock:
+            m = self._get_metrics(agent_did)
+            now = datetime.utcnow()
+            m.total_calls += 1
+            m.last_activity = now
+
+            if success:
+                m.consecutive_failures = 0
+            else:
+                m.failed_calls += 1
+                m.consecutive_failures += 1
+                if m.consecutive_failures >= self._consecutive_failure_threshold:
+                    self._quarantine(
+                        agent_did,
+                        f"Consecutive failure threshold breached "
+                        f"({m.consecutive_failures} failures)",
+                    )
+
+            # Burst detection: keep only timestamps inside the window, then
+            # count the current call.
+            cutoff = now - self._burst_window
+            m.call_timestamps = [t for t in m.call_timestamps if t > cutoff]
+            m.call_timestamps.append(now)
+            if len(m.call_timestamps) > self._burst_threshold:
+                self._quarantine(
+                    agent_did,
+                    f"Burst threshold breached ({len(m.call_timestamps)} calls "
+                    f"in {self._burst_window.total_seconds()}s)",
+                )
+
+    def record_capability_denial(self, agent_did: str, capability: str) -> None:
+        """Record a denied capability check (possible privilege escalation).
+
+        Args:
+            agent_did: DID of the agent whose capability check was denied.
+            capability: Name of the denied capability; included in the
+                quarantine reason when the threshold is reached.
+        """
+        with self._lock:
+            m = self._get_metrics(agent_did)
+            # Count as activity so an agent that only probes capabilities is
+            # not the first eviction candidate (which would drop its count).
+            m.last_activity = datetime.utcnow()
+            m.capability_denials += 1
+            if m.capability_denials >= self._capability_denial_threshold:
+                self._quarantine(
+                    agent_did,
+                    f"Capability denial threshold breached "
+                    f"({m.capability_denials} denials, last: {capability})",
+                )
+
+    def _quarantine(self, agent_did: str, reason: str) -> None:
+        """Mark ``agent_did`` as quarantined; no-op if it already is."""
+        with self._lock:
+            m = self._get_metrics(agent_did)
+            if m.quarantined:
+                return  # already quarantined; keep the original reason and start time
+            m.quarantined = True
+            m.quarantine_reason = reason
+            m.quarantined_at = datetime.utcnow()
+            logger.warning("QUARANTINE agent %s: %s", agent_did, reason)
+
+    def is_quarantined(self, agent_did: str) -> bool:
+        """Check if an agent is currently quarantined.
+
+        An agent whose quarantine is older than ``quarantine_duration`` is
+        released as a side effect and reported as not quarantined.
+        """
+        with self._lock:
+            m = self._agents.get(agent_did)
+            if m is None or not m.quarantined:
+                return False
+            # Auto-release after quarantine duration
+            if m.quarantined_at and datetime.utcnow() - m.quarantined_at > self._quarantine_duration:
+                self.release_quarantine(agent_did)
+                return False
+            return True
+
+    def release_quarantine(self, agent_did: str) -> None:
+        """Manually release an agent from quarantine.
+
+        Also resets the consecutive-failure and capability-denial counters so
+        the agent does not re-trigger immediately. Unknown agents are ignored.
+        """
+        with self._lock:
+            m = self._agents.get(agent_did)
+            if m:
+                m.quarantined = False
+                m.quarantine_reason = None
+                m.quarantined_at = None
+                m.consecutive_failures = 0
+                m.capability_denials = 0
+                logger.info("Released agent %s from quarantine", agent_did)
+
+    def get_metrics(self, agent_did: str) -> Optional[AgentMetrics]:
+        """Get current metrics for an agent, or None if it is not tracked.
+
+        The returned object is the monitor's live record, not a copy; callers
+        should treat it as read-only.
+        """
+        with self._lock:
+            return self._agents.get(agent_did)
+
+    def get_quarantined_agents(self) -> list[AgentMetrics]:
+        """List all currently quarantined agents.
+
+        Agents whose quarantine has expired are auto-released (via
+        ``is_quarantined``) and left out of the result.
+        """
+        with self._lock:
+            return [
+                m
+                for did, m in list(self._agents.items())
+                if self.is_quarantined(did)
+            ]
diff --git a/packages/agent-mesh/src/agentmesh/trust/bridge.py b/packages/agent-mesh/src/agentmesh/trust/bridge.py
@@ -10,9 +10,15 @@
 from datetime import datetime
 from typing import Optional, Any
 from pydantic import BaseModel, Field
+import hashlib
+import hmac
+import logging
+import os
 
 from .handshake import TrustHandshake, HandshakeResult
 
+logger = logging.getLogger(__name__)
+
 # Import IATP from agent-os (the source of truth for trust protocol)
 try:
     from modules.iatp import IATPClient, IATPMessage, TrustLevel  # noqa: F401
@@ -94,6 +100,25 @@ def __init__(self, **data):
             identity=identity,
             registry=registry,
         )
+        # P06: HMAC key for peer record integrity
+        self._peer_hmac_key = os.urandom(32)
+        self._peer_signatures: dict[str, str] = {}
+
+    def _sign_peer(self, peer: PeerInfo) -> str:
+        """Compute an HMAC-SHA256 over a peer's critical fields.
+
+        Covers ``peer_did``, ``trust_score``, ``trust_verified`` and every
+        entry of ``capabilities`` (order-sensitive), keyed with this
+        instance's ``_peer_hmac_key``.
+
+        Returns:
+            The hex digest of the HMAC.
+        """
+        fields = [
+            str(peer.peer_did),
+            str(peer.trust_score),
+            str(peer.trust_verified),
+            str(len(peer.capabilities)),
+            *(str(cap) for cap in peer.capabilities),
+        ]
+        # Length-prefix each field so the encoding is unambiguous: with plain
+        # ":" / "," joins, ["read,write"] and ["read", "write"] would produce
+        # the same payload and therefore the same signature.
+        payload = "".join(f"{len(item)}:{item}" for item in fields)
+        return hmac.new(self._peer_hmac_key, payload.encode(), hashlib.sha256).hexdigest()
+
+    def _verify_peer_integrity(self, peer_did: str) -> bool:
+        """Return True only if the cached peer record still matches its stored HMAC.
+
+        A peer with no cached record or no stored signature is treated as
+        failing the check.
+        """
+        record = self.peers.get(peer_did)
+        stored_sig = self._peer_signatures.get(peer_did) if record else None
+        if not record or not stored_sig:
+            return False
+        # Constant-time comparison of the recomputed and stored digests.
+        return hmac.compare_digest(self._sign_peer(record), stored_sig)
 
     async def verify_peer(
         self,
@@ -115,7 +140,7 @@ async def verify_peer(
         )
 
         if result.verified:
-            self.peers[peer_did] = PeerInfo(
+            peer = PeerInfo(
                 peer_did=peer_did,
                 peer_name=result.peer_name,
                 protocol=protocol,
@@ -124,6 +149,8 @@ async def verify_peer(
                 last_verified=datetime.utcnow(),
                 capabilities=result.capabilities,
             )
+            self.peers[peer_did] = peer
+            self._peer_signatures[peer_did] = self._sign_peer(peer)
 
         return result
 
@@ -137,6 +164,13 @@ async def is_peer_trusted(
         if not peer or not peer.trust_verified:
             return False
 
+        # P06: Verify record integrity before trusting cached score
+        if not self._verify_peer_integrity(peer_did):
+            logger.warning("Peer %s record integrity check failed — rejecting", peer_did)
+            del self.peers[peer_did]
+            self._peer_signatures.pop(peer_did, None)
+            return False
+
         threshold = required_score or self.default_trust_threshold
         return peer.trust_score >= threshold
 
diff --git a/packages/agent-mesh/tests/test_negative_security.py b/packages/agent-mesh/tests/test_negative_security.py
diff --git a/scripts/generate_sbom.py b/scripts/generate_sbom.py