diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md new file mode 100644 index 00000000..9bfb4925 --- /dev/null +++ b/docs/FEDERATION_PROPOSAL.md @@ -0,0 +1,1991 @@ +# Agent Relay Federation: Cross-Server Communication Proposal + +**Status:** Draft v2 (revised after critical review) +**Last Updated:** 2025-12-21 + +## Executive Summary + +This proposal extends agent-relay to support **federated multi-server deployments** while preserving the core differentiator: **automatic message injection via tmux**. Unlike polling-based systems (mcp_agent_mail OSS), federated agent-relay maintains real-time, interrupt-driven communication across server boundaries. + +### Key Design Decisions (v2) + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Transport | Pluggable (WebSocket default, NATS optional) | Start simple, scale up | +| Delivery | End-to-end confirmation | Sender knows agent received | +| Naming | Fleet-wide unique names | Avoid split-brain complexity | +| Auth | Asymmetric keys (Ed25519) | Scales better than N² tokens | +| Backpressure | Credit-based flow control | Prevent OOM on slow peers | + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ FEDERATED AGENT-RELAY │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ Server A │ │ Server B │ │ Server C │ │ +│ │ │ │ │ │ │ │ +│ │ ┌───┐ ┌───┐ │ wss:// │ ┌───┐ ┌───┐ │ wss:// │ ┌───┐ │ │ +│ │ │Ali│ │Bob│ │◄───────►│ │Car│ │Dav│ │◄───────►│ │Eve│ │ │ +│ │ └─┬─┘ └─┬─┘ │ │ └─┬─┘ └─┬─┘ │ │ └─┬─┘ │ │ +│ │ │ │ │ │ │ │ │ │ │ │ │ +│ │ ┌─┴─────┴─┐ │ │ ┌─┴─────┴─┐ │ │ ┌─┴───┐ │ │ +│ │ │ Daemon │ │ │ │ Daemon │ │ │ │Daemon│ │ │ +│ │ └─────────┘ │ │ └─────────┘ │ │ └─────┘ │ │ +│ └─────────────┘ └─────────────┘ └─────────────┘ │ +│ │ +│ • Agents run in tmux (unchanged) │ +│ • Local injection via send-keys (unchanged) │ +│ • Cross-server routing via WebSocket (NEW) │ +│ • Fleet-wide agent discovery (NEW) │ +│ │ 
+└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Table of Contents + +1. [Design Principles](#1-design-principles) +2. [Architecture Overview](#2-architecture-overview) +3. [Network Topology](#3-network-topology) +4. [Protocol Specification](#4-protocol-specification) +5. [Agent Discovery & Registry](#5-agent-discovery--registry) +6. [Message Routing](#6-message-routing) +7. [Delivery Confirmation](#7-delivery-confirmation) *(NEW)* +8. [Security Model](#8-security-model) +9. [Flow Control & Backpressure](#9-flow-control--backpressure) *(NEW)* +10. [Failure Handling & Resilience](#10-failure-handling--resilience) +11. [Transport Abstraction (NATS Option)](#11-transport-abstraction-nats-option) *(NEW)* +12. [Configuration](#12-configuration) +13. [CLI Interface](#13-cli-interface) +14. [Implementation Plan](#14-implementation-plan) +15. [Migration Path](#15-migration-path) +16. [Open Questions](#16-open-questions) *(NEW)* +17. [Storage Architecture](#17-storage-architecture) *(NEW)* + +--- + +## 1. Design Principles + +### 1.1 Preserve the Core Magic + +The #1 requirement is preserving **automatic message injection**: + +``` +Message arrives → tmux send-keys → Agent receives as user input +``` + +This is what differentiates agent-relay from polling-based systems. Federation must not compromise this. 
+ +### 1.2 Separation of Concerns + +``` +┌─────────────────────────────────────────────────────────┐ +│ ROUTING LAYER (NEW) │ +│ Cross-server message delivery via WebSocket │ +├─────────────────────────────────────────────────────────┤ +│ INJECTION LAYER (UNCHANGED) │ +│ Local tmux send-keys for each server │ +└─────────────────────────────────────────────────────────┘ +``` + +- **Routing** is a network problem → WebSocket between daemons +- **Injection** is a local problem → tmux send-keys (unchanged) + +### 1.3 Progressive Enhancement + +- Single-server deployments work exactly as before +- Federation is opt-in via configuration +- No breaking changes to existing setups + +### 1.4 Operational Simplicity + +- No external dependencies (Redis, NATS) required +- Optional hub for convenience, not required +- Static peer configuration works fine +- Simple CLI for fleet management + +--- + +## 2. Architecture Overview + +### 2.1 Component Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SERVER NODE │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ AGENT LAYER │ │ +│ │ │ │ +│ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ │ +│ │ │ tmux: Alice │ │ tmux: Bob │ │ tmux: Carol │ │ │ +│ │ │ (claude) │ │ (codex) │ │ (gemini) │ │ │ +│ │ └──────┬───────┘ └──────┬───────┘ └──────┬───────┘ │ │ +│ │ │ capture-pane │ │ │ │ +│ │ │ send-keys │ │ │ │ +│ │ ▼ ▼ ▼ │ │ +│ │ ┌──────────────────────────────────────────────────────────┐ │ │ +│ │ │ TmuxWrapper (per agent) │ │ │ +│ │ │ • Parse @relay: patterns │ │ │ +│ │ │ • Inject incoming messages │ │ │ +│ │ │ • Connect to local daemon │ │ │ +│ │ └──────────────────────────┬───────────────────────────────┘ │ │ +│ │ │ Unix Socket │ │ +│ └──────────────────────────────┼─────────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────────────▼─────────────────────────────────────┐ │ +│ │ DAEMON LAYER │ │ +│ │ │ │ +│ │ ┌─────────────────┐ 
┌─────────────────┐ ┌────────────────┐ │ │ +│ │ │ LocalServer │ │ PeerManager │ │ Registry │ │ │ +│ │ │ (Unix socket) │ │ (WebSocket) │ │ (agents map) │ │ │ +│ │ │ │ │ │ │ │ │ │ +│ │ │ • Accept local │ │ • Connect peers │ │ • Local agents │ │ │ +│ │ │ connections │ │ • Route cross- │ │ • Remote agents│ │ │ +│ │ │ • Handle HELLO │ │ server msgs │ │ • Server map │ │ │ +│ │ └────────┬────────┘ └────────┬────────┘ └────────┬───────┘ │ │ +│ │ │ │ │ │ │ +│ │ └──────────────────────┼──────────────────────┘ │ │ +│ │ │ │ │ +│ │ ┌─────────▼─────────┐ │ │ +│ │ │ Router │ │ │ +│ │ │ │ │ │ +│ │ │ • Decide local vs │ │ │ +│ │ │ remote routing │ │ │ +│ │ │ • Handle broadcast│ │ │ +│ │ │ • Queue on disco- │ │ │ +│ │ │ nnect │ │ │ +│ │ └───────────────────┘ │ │ +│ │ │ │ +│ └──────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ │ WebSocket (wss://) │ +│ ▼ │ +│ ┌─────────────────────────────┐ │ +│ │ PEER SERVERS │ │ +│ │ (other fleet members) │ │ +│ └─────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 2.2 Data Flow: Cross-Server Message + +``` +Alice@ServerA sends to Bob@ServerB: + +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SERVER A │ +│ │ +│ 1. Alice outputs: @relay:Bob Can you review auth.ts? │ +│ │ │ +│ ▼ │ +│ 2. TmuxWrapper captures via tmux capture-pane │ +│ │ │ +│ ▼ │ +│ 3. Parser extracts: { to: "Bob", body: "Can you review auth.ts?" } │ +│ │ │ +│ ▼ │ +│ 4. RelayClient sends SEND envelope to local daemon (Unix socket) │ +│ │ │ +│ ▼ │ +│ 5. Router checks registry: Bob not local, Bob is on ServerB │ +│ │ │ +│ ▼ │ +│ 6. PeerManager sends PEER_ROUTE to ServerB (WebSocket) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ wss:// (TLS encrypted) + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ SERVER B │ +│ │ +│ 7. 
PeerManager receives PEER_ROUTE │ +│ │ │ +│ ▼ │ +│ 8. Router looks up Bob in local connections │ +│ │ │ +│ ▼ │ +│ 9. Router calls TmuxWrapper.deliverToLocal(Bob, envelope) │ +│ │ │ +│ ▼ │ +│ 10. TmuxWrapper executes: │ +│ tmux send-keys -t relay-Bob-12345 -l "Relay message from Alice..." │ +│ tmux send-keys -t relay-Bob-12345 Enter │ +│ │ │ +│ ▼ │ +│ 11. Bob's agent receives message AS USER INPUT │ +│ (automatic, no polling, no checking!) │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 3. Network Topology + +### 3.1 Topology Options + +#### Option A: Full Mesh (Recommended for <10 servers) + +``` + ServerA ◄────────► ServerB + ▲ ▲ + │ │ + │ │ + ▼ ▼ + ServerC ◄────────► ServerD +``` + +- Every daemon connects to every other daemon +- O(n²) connections, but fine for small fleets +- No single point of failure +- Lowest latency (direct paths) + +#### Option B: Hub-and-Spoke (Recommended for 10+ servers) + +``` + ┌─────────┐ + ┌─────────►│ Hub │◄─────────┐ + │ └────┬────┘ │ + │ │ │ + ▼ ▼ ▼ + ServerA ServerB ServerC +``` + +- All daemons connect to central hub +- Hub routes messages between servers +- Single point of failure (mitigate with hub HA) +- Simpler operations + +#### Option C: Hybrid (Recommended for production) + +``` + ┌─────────┐ + ┌─────────►│ Hub │◄─────────┐ + │ │(discover)│ │ + │ └─────────┘ │ + │ │ + ▼ ▼ + ServerA ◄─────────────────────► ServerB + ▲ ▲ + │ │ + └───────────► ServerC ◄─────────┘ +``` + +- Hub provides discovery and registry sync +- Daemons establish direct peer connections +- Messages route directly (low latency) +- Hub failure doesn't break existing connections + +### 3.2 Recommended Approach + +**Hybrid topology with optional hub:** + +1. Daemons can be configured with static peer list (no hub needed) +2. Optionally connect to hub for dynamic discovery +3. Once peers are known, establish direct connections +4. Hub going down doesn't break messaging (just discovery) + +--- + +## 4. 
Protocol Specification + +### 4.1 Peer Protocol Messages + +Extend the existing envelope format with peer-specific message types: + +```typescript +// New message types for federation +type PeerMessageType = + | 'PEER_HELLO' // Initial handshake + | 'PEER_WELCOME' // Handshake response + | 'PEER_SYNC' // Registry synchronization + | 'PEER_ROUTE' // Route message to local agent + | 'PEER_BROADCAST' // Broadcast to local agents + | 'PEER_PING' // Heartbeat + | 'PEER_PONG' // Heartbeat response + | 'PEER_BYE'; // Graceful disconnect + +// Peer envelope (over WebSocket) +interface PeerEnvelope { + v: 1; // Protocol version + type: PeerMessageType; + id: string; // Message UUID + ts: number; // Timestamp + from_server: string; // Originating server ID + payload: T; +} +``` + +### 4.2 PEER_HELLO / PEER_WELCOME + +Initial handshake between daemons: + +```typescript +// Client → Server +interface PeerHelloPayload { + server_id: string; // e.g., "nyc-prod-01" + server_name?: string; // Human-readable name + version: string; // agent-relay version + capabilities: { + max_message_size: number; + supports_broadcast: boolean; + supports_topics: boolean; + }; + agents: AgentInfo[]; // Local agents to register + auth_token: string; // Pre-shared token +} + +// Server → Client +interface PeerWelcomePayload { + server_id: string; + session_id: string; // For reconnection + agents: AgentInfo[]; // Server's local agents + peers: PeerInfo[]; // Other known peers (for mesh) + config: { + heartbeat_ms: number; // Ping interval + sync_interval_ms: number; // Registry sync interval + }; +} + +interface AgentInfo { + name: string; + server_id: string; + cli?: string; // claude, codex, gemini + connected_at: string; // ISO timestamp + status: 'online' | 'idle' | 'busy'; +} + +interface PeerInfo { + server_id: string; + url: string; // WebSocket URL + agents: string[]; // Agent names on this peer +} +``` + +### 4.3 PEER_SYNC + +Registry updates when agents join/leave: + +```typescript 
+interface PeerSyncPayload { + type: 'agent_joined' | 'agent_left' | 'full_sync'; + agents?: AgentInfo[]; // For full_sync + agent?: AgentInfo; // For join/leave +} +``` + +### 4.4 PEER_ROUTE + +Forward a message to a specific agent: + +```typescript +interface PeerRoutePayload { + original_envelope: Envelope; // The actual message + target_agent: string; // Local agent name + hops: string[]; // Servers traversed (loop prevention) +} +``` + +### 4.5 PEER_BROADCAST + +Forward a broadcast to all local agents: + +```typescript +interface PeerBroadcastPayload { + original_envelope: Envelope; + exclude_agents?: string[]; // Don't deliver to these + scope?: 'fleet' | 'server'; // Broadcast scope +} +``` + +### 4.6 Connection State Machine + +``` + ┌─────────────────┐ + │ DISCONNECTED │◄─────────────────────┐ + └────────┬────────┘ │ + │ connect() │ + ▼ │ + ┌─────────────────┐ │ + ┌──────────│ CONNECTING │──────────┐ │ + │ └────────┬────────┘ │ │ + │ │ socket open │ │ + │ error ▼ │ error │ + │ ┌─────────────────┐ │ │ + │ │ HANDSHAKING │──────────┤ │ + │ └────────┬────────┘ │ │ + │ │ WELCOME received │ │ + │ ▼ │ │ + │ ┌─────────────────┐ │ │ + │ │ ACTIVE │──────────┤ │ + │ └────────┬────────┘ │ │ + │ │ BYE or error │ │ + │ ▼ │ │ + │ ┌─────────────────┐ │ │ + └─────────►│ RECONNECTING │◄─────────┘ │ + └────────┬────────┘ │ + │ max retries │ + └───────────────────────────────┘ + + Reconnection: exponential backoff 1s → 2s → 4s → 8s → 16s → 30s (max) + Max attempts: unlimited (peers are persistent) +``` + +--- + +## 5. 
Agent Discovery & Registry + +### 5.1 Registry Structure + +```typescript +interface FleetRegistry { + // All known agents across the fleet + agents: Map; + + // Server information + servers: Map; + + // Index for fast lookup + agentToServer: Map; // agentName → serverId +} + +interface AgentRecord { + name: string; + server_id: string; + qualified_name: string; // "Alice@nyc-prod-01" + cli?: string; + status: 'online' | 'idle' | 'offline'; + connected_at: string; + last_seen: string; + metadata?: Record; +} + +interface ServerRecord { + id: string; + name?: string; + url: string; + status: 'connected' | 'disconnected' | 'unknown'; + agents: Set; + connected_at?: string; + last_seen: string; + latency_ms?: number; +} +``` + +### 5.2 Name Resolution + +Agents can be addressed in multiple ways: + +| Pattern | Resolution | +|---------|------------| +| `@relay:Bob` | Local first, then fleet-wide lookup | +| `@relay:Bob@nyc` | Explicitly route to server "nyc" | +| `@relay:Bob@*` | Send to ALL agents named Bob (rare) | +| `@relay:*` | Broadcast to entire fleet | +| `@relay:*@local` | Broadcast to local server only | +| `@relay:*@nyc` | Broadcast to all agents on "nyc" | + +### 5.3 Name Collision Handling (v2: Fleet-Wide Uniqueness) + +**Design Decision:** Agent names must be unique across the entire fleet. + +This is simpler than "first-registered wins" which has race conditions with async gossip. 
With fleet-wide uniqueness: +- No split-brain scenarios +- No ambiguous routing +- Clear error on collision + +```typescript +// Registration flow +async function registerAgent(name: string, serverId: string): Promise { + // Check fleet-wide registry + if (registry.exists(name)) { + const existing = registry.get(name); + return { + success: false, + error: `Name "${name}" already registered on ${existing.server_id}`, + suggestion: `${name}-${serverId.slice(0, 4)}` // e.g., "Bob-nyc1" + }; + } + + // Broadcast reservation with Lamport timestamp + await broadcastReservation(name, serverId, lamportClock.tick()); + + // Wait for quorum acknowledgment (majority of peers) + const acks = await waitForAcks(name, QUORUM_TIMEOUT_MS); + if (acks < quorumSize()) { + return { success: false, error: 'Failed to achieve quorum' }; + } + + registry.add(name, serverId); + return { success: true }; +} + +// Resolution is now simple +function resolveAgent(name: string): AgentRecord | null { + // Check for explicit qualification (still supported) + if (name.includes('@')) { + const [agentName, serverSpec] = name.split('@'); + return registry.findOnServer(agentName, serverSpec); + } + + // Fleet-wide lookup (guaranteed unique) + return registry.get(name); +} +``` + +### 5.4 Registry Synchronization + +Registries sync via gossip-like protocol: + +``` +Server A joins fleet: +1. A → B: PEER_HELLO (includes A's agents) +2. B → A: PEER_WELCOME (includes B's agents + known peers) +3. A → C: PEER_HELLO (A learned about C from B) +4. A → B: PEER_SYNC (A now knows about C's agents) +...eventually consistent... +``` + +**Sync triggers:** +- New peer connection +- Agent joins/leaves locally +- Periodic full sync (every 60s) +- On reconnection after disconnect + +--- + +## 6. Message Routing + +### 6.1 Routing Algorithm + +```typescript +class FederatedRouter { + route(from: string, envelope: Envelope): void { + const target = envelope.to; + + // 1. 
Broadcast handling + if (target === '*' || target?.startsWith('*@')) { + return this.handleBroadcast(from, envelope, target); + } + + // 2. Resolve target agent + const resolved = this.registry.resolve(target, this.serverId); + if (!resolved) { + this.sendNack(from, envelope.id, 'UNKNOWN_AGENT'); + return; + } + + // 3. Local delivery + if (resolved.server_id === this.serverId) { + this.deliverLocal(from, resolved.name, envelope); + return; + } + + // 4. Remote delivery + this.deliverRemote(from, resolved, envelope); + } + + private handleBroadcast(from: string, envelope: Envelope, scope: string): void { + const [, serverSpec] = scope.split('@'); + + // Deliver to local agents (except sender) + if (!serverSpec || serverSpec === 'local' || serverSpec === this.serverId) { + for (const agent of this.localAgents) { + if (agent.name !== from) { + this.deliverLocal(from, agent.name, envelope); + } + } + } + + // Forward to peers (unless local-only) + if (serverSpec !== 'local') { + for (const peer of this.peers.values()) { + if (!serverSpec || peer.serverId === serverSpec) { + peer.send({ + type: 'PEER_BROADCAST', + payload: { + original_envelope: envelope, + exclude_agents: [from], + } + }); + } + } + } + } + + private deliverRemote( + from: string, + target: AgentRecord, + envelope: Envelope + ): void { + const peer = this.peers.get(target.server_id); + + if (!peer || peer.state !== 'ACTIVE') { + // Queue for later delivery + this.queueMessage(target.server_id, envelope); + return; + } + + peer.send({ + type: 'PEER_ROUTE', + payload: { + original_envelope: envelope, + target_agent: target.name, + hops: [this.serverId], + } + }); + } +} +``` + +### 6.2 Message Queuing + +When a peer is disconnected, messages are queued: + +```typescript +interface QueuedMessage { + envelope: Envelope; + target_server: string; + queued_at: number; + attempts: number; + expires_at: number; // TTL +} + +class MessageQueue { + private queues: Map; + + enqueue(serverId: string, envelope: 
Envelope): void { + const queue = this.queues.get(serverId) ?? []; + queue.push({ + envelope, + target_server: serverId, + queued_at: Date.now(), + attempts: 0, + expires_at: Date.now() + 3600000, // 1 hour TTL + }); + this.queues.set(serverId, queue); + } + + // Called when peer reconnects + flush(serverId: string, peer: PeerConnection): void { + const queue = this.queues.get(serverId) ?? []; + for (const msg of queue) { + if (Date.now() < msg.expires_at) { + peer.send({ type: 'PEER_ROUTE', payload: msg.envelope }); + } + } + this.queues.delete(serverId); + } +} +``` + +### 6.3 Loop Prevention + +Prevent messages from bouncing between servers: + +```typescript +interface PeerRoutePayload { + // ... other fields + hops: string[]; // Servers this message has traversed +} + +// On receiving PEER_ROUTE +function handlePeerRoute(msg: PeerEnvelope): void { + // Check for loop + if (msg.payload.hops.includes(this.serverId)) { + console.warn('Loop detected, dropping message'); + return; + } + + // Add ourselves to hops if forwarding + if (needsForwarding) { + msg.payload.hops.push(this.serverId); + } +} +``` + +--- + +## 7. Delivery Confirmation *(NEW)* + +### 7.1 The Problem + +Without end-to-end confirmation, senders don't know if messages were actually received: + +``` +Alice sends → Daemon A → Daemon B → tmux send-keys → ??? + ↑ + ACK (peer received) + +But: Did Bob's agent actually see it? 
+ - tmux session might have crashed + - Agent might be blocked + - Injection might have failed silently +``` + +### 7.2 Solution: DELIVERY_CONFIRMED Message + +Add a new message type that confirms the message was injected into the agent's terminal: + +```typescript +interface DeliveryConfirmedPayload { + original_message_id: string; // ID of the message being confirmed + injected_at: number; // Timestamp when send-keys completed + agent_status: 'active' | 'idle' | 'unknown'; +} +``` + +### 7.3 Confirmation Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ END-TO-END DELIVERY CONFIRMATION │ +│ │ +│ Alice@A Daemon A Daemon B Bob@B │ +│ │ │ │ │ │ +│ │ ── @relay:Bob msg ────► │ │ │ │ +│ │ │ ─ PEER_ROUTE ─►│ │ │ +│ │ │ │ ── send-keys ────► │ │ +│ │ │ │ (inject msg) │ │ +│ │ │ │ │ │ +│ │ │ │ ◄── capture-pane ──│ │ +│ │ │ │ (detect receipt)│ │ +│ │ │ │ │ │ +│ │ │ ◄─ DELIVERY_ ──│ │ │ +│ │ │ CONFIRMED │ │ │ +│ │ │ │ │ │ +│ │ ◄─ inject confirmation ──│ │ │ │ +│ │ "[✓] Bob received" │ │ │ │ +│ │ │ │ │ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 7.4 Detection Mechanism + +After injecting a message, the TmuxWrapper watches for evidence the agent received it: + +```typescript +async function confirmDelivery(messageId: string): Promise { + // Wait for agent to echo/process the message + const startTime = Date.now(); + const timeout = 5000; // 5 seconds + + while (Date.now() - startTime < timeout) { + const output = await capturePane(); + + // Look for our injected message in output + if (output.includes(`Relay message from`) && output.includes(messageId.slice(0, 8))) { + return true; + } + + await sleep(200); + } + + return false; // Timeout - uncertain delivery +} +``` + +### 7.5 Sender Notification + +Senders receive confirmation or timeout notification: + +``` +# Success case +[relay:Alice] → Bob: Can you review auth.ts? 
+[relay:Alice] ✓ Bob received (145ms) + +# Timeout case +[relay:Alice] → Bob: Can you review auth.ts? +[relay:Alice] ⚠ Delivery to Bob unconfirmed (timeout) +``` + +### 7.6 Configuration + +Delivery confirmation is optional (adds latency): + +```yaml +federation: + delivery_confirmation: + enabled: true # Enable end-to-end confirmation + timeout_ms: 5000 # How long to wait for confirmation + notify_sender: true # Inject confirmation into sender's terminal +``` + +--- + +## 8. Security Model + +### 8.1 Authentication (v2: Asymmetric Keys) + +**Design Decision:** Use Ed25519 keypairs instead of pre-shared tokens. + +Pre-shared tokens don't scale: N servers = N² tokens. With asymmetric keys: +- Each server has one keypair +- Servers exchange public keys once +- Challenge-response authentication +- Easy key rotation + +```typescript +// Each server generates a keypair on first run +interface ServerIdentity { + server_id: string; + public_key: string; // Ed25519 public key (base64) + private_key: string; // Ed25519 private key (stored securely) +} + +// Handshake uses challenge-response +interface PeerHelloPayload { + server_id: string; + public_key: string; + challenge: string; // Random nonce + challenge_signature: string; // Sign our challenge with our private key +} + +interface PeerWelcomePayload { + server_id: string; + challenge_response: string; // Sign their challenge with our private key + // ... rest of welcome +} +``` + +### 8.2 Key Distribution + +Options for distributing public keys: + +**Option A: Static Configuration (Simple)** +```yaml +auth: + private_key_path: /etc/agent-relay/server.key + known_peers: + london-prod-01: "ed25519:abc123..." # Public key + tokyo-prod-01: "ed25519:def456..." 
+```
+
+**Option B: Trust-on-First-Use (TOFU)**
+```yaml
+auth:
+  tofu_enabled: true            # Accept new peers, remember their keys
+  tofu_require_approval: true   # Require human approval for new peers
+```
+
+**Option C: Certificate Authority (Enterprise)**
+```yaml
+auth:
+  ca_cert: /etc/agent-relay/ca.pem
+  server_cert: /etc/agent-relay/server.pem
+  server_key: /etc/agent-relay/server.key
+```
+
+### 8.3 Message Signing
+
+Each peer-to-peer message is signed:
+
+```typescript
+interface PeerEnvelope {
+  // ... existing fields
+  signature: string;  // Ed25519 signature of (type + id + ts + payload)
+}
+
+function signEnvelope(envelope: PeerEnvelope, privateKey: Key): string {
+  const payload = JSON.stringify({
+    type: envelope.type,
+    id: envelope.id,
+    ts: envelope.ts,
+    payload: envelope.payload
+  });
+  return ed25519.sign(payload, privateKey);
+}
+
+function verifyEnvelope(envelope: PeerEnvelope, publicKey: Key): boolean {
+  // Recompute the signed payload; reject if signature invalid - prevents spoofing
+  const payload = JSON.stringify({
+    type: envelope.type,
+    id: envelope.id,
+    ts: envelope.ts,
+    payload: envelope.payload
+  });
+  return ed25519.verify(payload, envelope.signature, publicKey);
+}
+```
+
+### 8.4 Transport Security
+
+**Mandatory TLS** for peer connections:
+
+```typescript
+const ws = new WebSocket(peerUrl, {
+  // TLS configuration
+  rejectUnauthorized: true,  // Verify peer certificate
+  ca: fs.readFileSync('/etc/agent-relay/ca.pem'),
+
+  // Client certificate (for mTLS)
+  cert: fs.readFileSync('/etc/agent-relay/client.pem'),
+  key: fs.readFileSync('/etc/agent-relay/client-key.pem'),
+});
+```
+
+### 8.5 Authorization
+
+Simple capability model:
+
+```typescript
+interface ServerCapabilities {
+  can_broadcast: boolean;      // Can send fleet-wide broadcasts
+  can_route_to: string[];      // Allowed target servers
+  max_message_rate: number;    // Rate limit (msgs/sec)
+  allowed_agents: string[];    // Can message these agents (or '*')
+}
+```
+
+### 8.6 Security Checklist
+
+| Control | Status | Notes |
+|---------|--------|-------|
+| TLS encryption | Required | wss:// only |
+| Peer authentication | Required | Ed25519 challenge-response (§8.1) |
+| mTLS (mutual TLS) | Optional | For high-security |
+| Message signing | Required | Ed25519, verify origin (§8.3) |
+| Rate limiting | Recommended | Prevent floods |
+| Audit logging | Recommended | Log all cross-server |
+
+---
+
+## 9. Flow Control & Backpressure *(NEW)*
+
+### 9.1 The Problem
+
+Without flow control, a fast sender can overwhelm a slow receiver:
+
+```
+Server A: sends 1000 msgs/sec to Server B
+Server B: can only inject 10 msgs/sec (agents busy)
+Server B: queue grows → memory exhaustion → OOM crash
+```
+
+### 9.2 Credit-Based Flow Control
+
+Each peer connection has a credit window:
+
+```typescript
+interface FlowControl {
+  // Sender side
+  credits: number;                     // How many messages we can send
+  pendingAcks: Map<string, Envelope>;  // Messages awaiting ACK
+
+  // Receiver side
+  windowSize: number;  // Max messages before ACK required
+  received: number;    // Messages received since last ACK
+}
+
+// Sender checks credits before sending
+function canSend(): boolean {
+  return this.credits > 0;
+}
+
+function send(envelope: Envelope): boolean {
+  if (!this.canSend()) {
+    this.queue.push(envelope);  // Queue locally
+    return false;
+  }
+
+  this.credits--;
+  this.pendingAcks.set(envelope.id, envelope);
+  this.peer.send(envelope);
+  return true;
+}
+
+// Receiver sends PEER_ACK to replenish credits
+function onReceive(envelope: Envelope): void {
+  this.received++;
+
+  if (this.received >= this.windowSize / 2) {
+    this.peer.send({
+      type: 'PEER_ACK',
+      payload: { credits: this.received }
+    });
+    this.received = 0;
+  }
+}
+
+// Sender receives ACK, replenishes credits
+function onAck(ack: PeerAckPayload): void {
+  this.credits += ack.credits;
+  this.flushQueue();  // Send queued messages
+}
+```
+
+### 9.3 Backpressure Signals
+
+When a receiver is overwhelmed, it sends PEER_BUSY:
+
+```typescript
+type PeerMessageType =
+  // ...
existing types + | 'PEER_ACK' // Replenish sender credits + | 'PEER_BUSY' // Receiver overwhelmed, stop sending + | 'PEER_READY'; // Receiver recovered, resume + +interface PeerBusyPayload { + reason: 'queue_full' | 'agent_busy' | 'rate_limited'; + retry_after_ms?: number; // Suggested wait time +} +``` + +### 9.4 Rate Limiting + +Per-peer and fleet-wide rate limits: + +```typescript +interface RateLimiter { + // Token bucket algorithm + tokens: number; + maxTokens: number; + refillRate: number; // tokens per second + + tryConsume(count: number): boolean { + this.refill(); + if (this.tokens >= count) { + this.tokens -= count; + return true; + } + return false; + } +} + +// Applied at multiple levels +const limits = { + perPeer: new RateLimiter({ maxTokens: 100, refillRate: 50 }), // 50/sec per peer + perAgent: new RateLimiter({ maxTokens: 20, refillRate: 10 }), // 10/sec per agent + fleetWide: new RateLimiter({ maxTokens: 1000, refillRate: 200 }), // 200/sec total +}; +``` + +### 9.5 Bounded Queues + +Queues have maximum sizes with drop policies: + +```typescript +interface BoundedQueue { + maxSize: number; + dropPolicy: 'oldest' | 'newest' | 'reject'; + + push(item: T): boolean { + if (this.items.length >= this.maxSize) { + switch (this.dropPolicy) { + case 'oldest': + this.items.shift(); // Drop oldest + break; + case 'newest': + return false; // Reject new item + case 'reject': + throw new QueueFullError(); + } + } + this.items.push(item); + return true; + } +} +``` + +### 9.6 Configuration + +```yaml +federation: + flow_control: + window_size: 100 # Messages before ACK required + max_queue_size: 1000 # Max queued messages per peer + queue_drop_policy: oldest # oldest | newest | reject + + rate_limits: + per_peer_per_second: 50 + per_agent_per_second: 10 + fleet_wide_per_second: 200 +``` + +--- + +## 10. 
Failure Handling & Resilience

+### 10.1 Connection Failures
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                  CONNECTION FAILURE HANDLING                    │
+│                                                                 │
+│  Peer disconnects                                               │
+│       │                                                         │
+│       ▼                                                         │
+│  Mark peer as DISCONNECTED                                      │
+│       │                                                         │
+│       ├──► Queue outbound messages                              │
+│       │                                                         │
+│       ├──► Start reconnection timer                             │
+│       │    (exponential backoff: 1s, 2s, 4s, ... 30s max)       │
+│       │                                                         │
+│       └──► Notify local agents (optional)                       │
+│            "@relay:* [SYSTEM] Lost connection to server-b"      │
+│                                                                 │
+│  On reconnect:                                                  │
+│       │                                                         │
+│       ├──► Re-authenticate (PEER_HELLO/WELCOME)                 │
+│       │                                                         │
+│       ├──► Sync registries (PEER_SYNC full_sync)                │
+│       │                                                         │
+│       └──► Flush queued messages                                │
+│                                                                 │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### 10.2 Split Brain Prevention
+
+If the fleet gets partitioned:
+
+1. **Agents remain addressable** within their partition
+2. **Cross-partition messages queue** until healed
+3. **No automatic conflict resolution** - messages deliver in order received
+4. **TTL expiration** - queued messages expire after 1 hour (configurable)
+
+### 10.3 Graceful Degradation
+
+```
+Fleet healthy:  A ◄──► B ◄──► C     (full connectivity)
+
+B goes down:    A ◄─X─► B ◄─X─► C
+                A ◄──────────────► C  (A-C still works)
+
+B comes back:   A ◄──► B ◄──► C     (queued messages flush)
+```
+
+### 10.4 Health Monitoring
+
+```typescript
+// Heartbeat every 30 seconds
+setInterval(() => {
+  for (const peer of this.peers.values()) {
+    if (peer.state === 'ACTIVE') {
+      peer.send({ type: 'PEER_PING', ts: Date.now() });
+
+      // If no PONG in 60s, consider dead
+      peer.setTimeout(() => {
+        if (!peer.lastPong || Date.now() - peer.lastPong > 60000) {
+          peer.reconnect();
+        }
+      }, 60000);
+    }
+  }
+}, 30000);
+```
+
+---
+
+## 11. Transport Abstraction (NATS Option) *(NEW)*
+
+### 11.1 Motivation
+
+The custom WebSocket protocol works for simple deployments, but production fleets may benefit from battle-tested message infrastructure.
NATS JetStream provides: + +- ✅ Persistent message queues (survive restarts) +- ✅ Exactly-once delivery semantics +- ✅ Built-in clustering and HA +- ✅ Backpressure and flow control +- ✅ Rich observability (metrics, tracing) +- ✅ Years of production hardening + +**Trade-off:** External dependency vs. implementation effort. + +### 14.2 Transport Interface + +Abstract the transport layer so implementations are swappable: + +```typescript +interface PeerTransport { + // Lifecycle + connect(config: TransportConfig): Promise; + disconnect(): Promise; + + // Messaging + send(serverId: string, envelope: PeerEnvelope): Promise; + broadcast(envelope: PeerEnvelope): Promise; + subscribe(handler: (from: string, envelope: PeerEnvelope) => void): void; + + // Discovery + getConnectedPeers(): string[]; + onPeerJoin(handler: (serverId: string) => void): void; + onPeerLeave(handler: (serverId: string) => void): void; +} +``` + +### 14.3 WebSocket Implementation (Default) + +```typescript +class WebSocketTransport implements PeerTransport { + private connections: Map; + + async connect(config: TransportConfig): Promise { + for (const peer of config.peers) { + const ws = new WebSocket(peer.url); + // ... handshake, auth + this.connections.set(peer.serverId, ws); + } + } + + async send(serverId: string, envelope: PeerEnvelope): Promise { + const ws = this.connections.get(serverId); + ws?.send(JSON.stringify(envelope)); + } + + // ... 
rest of implementation
+}
+```
+
+### 11.4 NATS Implementation (Optional)
+
+```typescript
+class NatsTransport implements PeerTransport {
+  private nc: NatsConnection;
+  private js: JetStreamClient;
+
+  async connect(config: TransportConfig): Promise<void> {
+    this.nc = await connect({ servers: config.natsUrl });
+    this.js = this.nc.jetstream();
+
+    // Create stream for fleet messages
+    await this.js.streams.add({
+      name: 'RELAY_FLEET',
+      subjects: ['relay.>'],
+      retention: RetentionPolicy.Limits,
+      max_age: 3600 * 1e9, // 1 hour
+    });
+  }
+
+  async send(serverId: string, envelope: PeerEnvelope): Promise<void> {
+    // Publish to server-specific subject
+    await this.js.publish(`relay.server.${serverId}`, encode(envelope));
+  }
+
+  async broadcast(envelope: PeerEnvelope): Promise<void> {
+    // Publish to broadcast subject
+    await this.js.publish('relay.broadcast', encode(envelope));
+  }
+
+  subscribe(handler: (from: string, envelope: PeerEnvelope) => void): void {
+    // Subscribe to our server subject + broadcast
+    const sub = this.nc.subscribe(`relay.server.${this.serverId}`);
+    const broadcastSub = this.nc.subscribe('relay.broadcast');
+
+    (async () => {
+      for await (const msg of sub) {
+        const envelope = decode(msg.data);
+        handler(envelope.from_server, envelope);
+      }
+    })();
+  }
+}
+```
+
+### 11.5 Configuration
+
+```yaml
+federation:
+  # Transport selection
+  transport: websocket   # websocket | nats
+
+  # WebSocket config (if transport: websocket)
+  websocket:
+    peers:
+      - url: wss://london.example.com:8765
+        server_id: london
+
+  # NATS config (if transport: nats)
+  nats:
+    url: nats://nats.example.com:4222
+    credentials: /etc/agent-relay/nats.creds
+    stream_name: RELAY_FLEET
+```
+
+### 11.6 When to Use Which
+
+| Scenario | Recommended Transport | Rationale |
+|----------|----------------------|-----------|
+| 2-5 servers, simple setup | WebSocket | No external deps |
+| Development/testing | WebSocket | Easy to run locally |
+| 10+ servers | NATS | Better scaling |
+| High 
reliability required | NATS | Persistence, HA |
+| Already have NATS | NATS | Leverage existing |
+| Air-gapped/restricted | WebSocket | No external deps |
+
+### 11.7 Migration Path
+
+Start with WebSocket, migrate to NATS when needed:
+
+1. Deploy NATS cluster
+2. Update config: `transport: nats`
+3. Restart daemons (one at a time)
+4. Messages route through NATS immediately
+
+No changes to agents or injection logic.
+
+---
+
+## 12. Configuration
+
+### 12.1 Configuration File
+
+```yaml
+# /etc/agent-relay/config.yaml (or ~/.agent-relay/config.yaml)
+
+# Server identity
+server:
+  id: nyc-prod-01             # Unique server ID
+  name: "NYC Production 01"   # Human-readable name
+
+# Local daemon settings (unchanged from current)
+local:
+  socket_path: /tmp/agent-relay/relay.sock
+  storage_path: /tmp/agent-relay/messages.sqlite
+
+# Federation settings (NEW)
+federation:
+  enabled: true
+
+  # Listen for peer connections
+  listen:
+    host: 0.0.0.0
+    port: 8765
+
+  # TLS configuration
+  tls:
+    enabled: true
+    cert: /etc/agent-relay/server.pem
+    key: /etc/agent-relay/server-key.pem
+    ca: /etc/agent-relay/ca.pem   # For client verification
+    mutual: false                 # Set true to require client certs (mTLS)
+
+  # Authentication
+  auth:
+    # This server's token (peers use this to connect to us)
+    server_token: "${RELAY_SERVER_TOKEN}"
+
+    # Tokens for connecting to peers
+    peer_tokens:
+      london-prod-01: "${RELAY_TOKEN_LONDON}"
+      tokyo-prod-01: "${RELAY_TOKEN_TOKYO}"
+
+  # Peer connections
+  peers:
+    - url: wss://london.example.com:8765
+      server_id: london-prod-01
+      auto_connect: true
+
+    - url: wss://tokyo.example.com:8765
+      server_id: tokyo-prod-01
+      auto_connect: true
+
+  # Optional hub for discovery
+  hub:
+    url: wss://hub.example.com:8765
+    token: "${RELAY_HUB_TOKEN}"
+    enabled: false   # Hub is optional
+
+  # Behavior settings
+  settings:
+    heartbeat_interval_ms: 30000    # Ping peers every 30s
+    reconnect_max_delay_ms: 30000   # Max backoff delay
+    message_queue_ttl_ms: 3600000   # 1 hour queue TTL
+    
sync_interval_ms: 60000         # Full registry sync
+    max_message_size_bytes: 1048576 # 1 MiB
+
+# Dashboard settings
+dashboard:
+  enabled: true
+  port: 3888
+  show_fleet: true   # Show all fleet agents
+```
+
+### 12.2 Environment Variables
+
+All config can be overridden via environment (peer IDs are uppercased with
+hyphens mapped to underscores, since shells reject hyphens in variable names):
+
+```bash
+# Server identity
+AGENT_RELAY_SERVER_ID=nyc-prod-01
+AGENT_RELAY_SERVER_NAME="NYC Production 01"
+
+# Federation
+AGENT_RELAY_FEDERATION_ENABLED=true
+AGENT_RELAY_FEDERATION_PORT=8765
+AGENT_RELAY_SERVER_TOKEN=secret-token
+AGENT_RELAY_PEER_LONDON_PROD_01_TOKEN=london-token
+AGENT_RELAY_PEER_LONDON_PROD_01_URL=wss://london.example.com:8765
+```
+
+### 12.3 Minimal Configuration
+
+For simple two-server setup:
+
+```yaml
+# Server A (nyc)
+server:
+  id: nyc
+federation:
+  enabled: true
+  listen:
+    port: 8765
+  auth:
+    server_token: "shared-secret"
+  peers:
+    - url: wss://london.example.com:8765
+      server_id: london
+```
+
+```yaml
+# Server B (london)
+server:
+  id: london
+federation:
+  enabled: true
+  listen:
+    port: 8765
+  auth:
+    server_token: "shared-secret"
+  peers:
+    - url: wss://nyc.example.com:8765
+      server_id: nyc
+```
+
+---
+
+## 13. 
CLI Interface
+
+### 13.1 New Commands
+
+```bash
+# Start daemon with federation
+agent-relay up [--peer-port 8765] [--config /path/to/config.yaml]
+
+# Peer management
+agent-relay peer list                         # List connected peers
+agent-relay peer add <url> [--token <token>]  # Add peer dynamically
+agent-relay peer remove <server-id>           # Remove peer
+agent-relay peer status                       # Detailed peer status
+
+# Fleet-wide agent listing
+agent-relay agents                       # Local agents only (default)
+agent-relay agents --fleet               # All agents in fleet
+agent-relay agents --server <server-id>  # Agents on specific server
+
+# Send to remote agent
+agent-relay send <agent>[@<server>] <message>
+
+# Fleet status
+agent-relay fleet status     # Overview of all servers
+agent-relay fleet topology   # Show connection graph
+
+# Debugging
+agent-relay fleet ping <server-id>   # Ping a peer
+agent-relay fleet trace <agent>      # Trace route to agent
+```
+
+### 13.2 Example Session
+
+```bash
+# On Server NYC
+$ agent-relay up --peer-port 8765
+Daemon started (federation enabled)
+Listening for peers on :8765
+Connecting to peer: london.example.com:8765...
+Connected to london (2 agents)
+Connecting to peer: tokyo.example.com:8765...
+Connected to tokyo (1 agent)
+
+$ agent-relay agents --fleet
+AGENT    SERVER   CLI      STATUS   CONNECTED
+Alice    nyc      claude   online   2 min ago
+Bob      nyc      codex    online   5 min ago
+Carol    london   claude   online   1 min ago
+Dave     london   gemini   idle     10 min ago
+Eve      tokyo    claude   online   3 min ago
+
+$ agent-relay fleet status
+SERVER   STATUS      AGENTS   LATENCY   LAST SEEN
+nyc      local       2        -         -
+london   connected   2        45ms      just now
+tokyo    connected   1        120ms     2s ago
+
+# Start an agent that can message across servers
+$ agent-relay -n Alice claude
+
+# Inside Claude session:
+# Alice> @relay:Carol Can you help with the auth module?
+# [relay:Alice] → Carol@london: Can you help with the auth module?
+
+# Carol (on london server) automatically receives:
+# "Relay message from Alice@nyc [abc123]: Can you help with the auth module?"
+``` + +### 13.3 Addressing Examples + +```bash +# From Alice on NYC server: + +@relay:Bob hello # Bob is local (NYC) → local delivery +@relay:Carol hello # Carol is on london → routes to london +@relay:Carol@london hello # Explicit: route to Carol on london +@relay:Eve@tokyo hello # Explicit: route to Eve on tokyo +@relay:* Status update # Broadcast to ALL agents in fleet +@relay:*@local Status update # Broadcast only to NYC agents +@relay:*@london Status update # Broadcast to all london agents +``` + +--- + +## 14. Implementation Plan + +### 14.1 Phases + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 1: Foundation (1 week) │ +│ │ +│ • Define peer protocol types (src/protocol/peer-types.ts) │ +│ • Implement PeerConnection class (src/federation/peer-connection.ts) │ +│ • Implement basic HELLO/WELCOME handshake │ +│ • Add peer WebSocket listener to daemon │ +│ • Unit tests for protocol │ +│ │ +│ Deliverable: Two daemons can connect and handshake │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 2: Registry & Discovery (1 week) │ +│ │ +│ • Implement FleetRegistry (src/federation/registry.ts) │ +│ • Implement PEER_SYNC message handling │ +│ • Add registry to Router for lookups │ +│ • Implement name resolution (local → fleet) │ +│ • CLI: `agent-relay agents --fleet` │ +│ │ +│ Deliverable: Agents visible across servers │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 3: Message Routing (1 week) │ +│ │ +│ • Implement PEER_ROUTE handling │ +│ • Implement PEER_BROADCAST handling │ +│ • Integrate with existing Router │ +│ • Cross-server message delivery │ +│ • Local tmux injection on receipt (existing code!) 
│ +│ │ +│ Deliverable: Alice@NYC can message Bob@London automatically │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 4: Resilience (1 week) │ +│ │ +│ • Implement reconnection logic │ +│ • Implement message queue for disconnected peers │ +│ • Implement heartbeat (PING/PONG) │ +│ • Handle graceful shutdown (PEER_BYE) │ +│ • CLI: `agent-relay peer status` │ +│ │ +│ Deliverable: Fleet survives server restarts │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 5: Security & Polish (1 week) │ +│ │ +│ • Add TLS support for peer connections │ +│ • Add token-based authentication │ +│ • Add configuration file support │ +│ • Update dashboard for fleet view │ +│ • Documentation │ +│ │ +│ Deliverable: Production-ready federation │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 14.2 File Structure + +``` +src/ +├── federation/ # NEW: Federation module +│ ├── index.ts # Exports +│ ├── peer-connection.ts # WebSocket connection to peer +│ ├── peer-manager.ts # Manages all peer connections +│ ├── peer-server.ts # WebSocket server for incoming +│ ├── registry.ts # Fleet-wide agent registry +│ ├── message-queue.ts # Queue for disconnected peers +│ └── config.ts # Federation configuration +│ +├── protocol/ +│ ├── types.ts # Existing types +│ └── peer-types.ts # NEW: Peer protocol types +│ +├── daemon/ +│ ├── server.ts # Modified: integrate federation +│ ├── router.ts # Modified: federated routing +│ └── ... +│ +└── cli/ + └── index.ts # Modified: new commands +``` + +### 14.3 Estimated Effort (Revised) + +**Original estimate was too optimistic.** Distributed systems are hard. 
Realistic timeline: + +| Phase | Optimistic | Realistic | Notes | +|-------|------------|-----------|-------| +| Phase 1: Foundation | 1 week | 1.5-2 weeks | WebSocket edge cases | +| Phase 2: Registry | 1 week | 2 weeks | Consistency is hard | +| Phase 3: Routing | 1 week | 1.5 weeks | Broadcast complexity | +| Phase 4: Resilience | 1 week | 2-3 weeks | Reconnection, testing | +| Phase 5: Security | 1 week | 2 weeks | TLS setup, key mgmt | +| Phase 6: Stabilization | - | 2 weeks | Bug fixes, edge cases | +| **Total** | 4-5 weeks | **8-10 weeks** | | + +### 14.4 MVP Option + +To ship faster, consider a reduced-scope MVP: + +**MVP Scope (4 weeks):** +- Static peer list only (no hub) +- No TLS (rely on VPN/private network) +- Single fleet token (not per-pair) +- Require unique names (no conflict resolution) +- Memory-only queues (no persistence) +- No message priorities + +**Post-MVP (add incrementally):** +- TLS + asymmetric key auth +- Hub for discovery +- Queue persistence +- Delivery confirmation +- NATS transport option +- Observability + +--- + +## 15. Migration Path + +### 15.1 Backward Compatibility + +Existing single-server deployments work without changes: + +```yaml +# No federation block = single-server mode (current behavior) +local: + socket_path: /tmp/agent-relay/relay.sock +``` + +### 15.2 Upgrade Path + +1. **Update agent-relay** to federation-capable version +2. **Add federation config** to enable cross-server +3. **Start daemons** - they auto-connect to peers +4. **Agents just work** - no changes needed + +### 15.3 Rollback + +If issues arise: +1. Set `federation.enabled: false` +2. Restart daemon +3. Back to single-server mode + +--- + +## Summary + +This proposal extends agent-relay to support federated multi-server deployments while **preserving the core differentiator**: automatic message injection via tmux. + +**Key points:** + +1. **Injection stays local** - Each server runs tmux sessions, does local send-keys +2. 
**Routing goes network** - Daemons connect via WebSocket for cross-server +3. **Progressive enhancement** - Single-server still works, federation is opt-in +4. **Simple operations** - Static peer config works, hub optional +5. **Resilient** - Reconnection, message queuing, graceful degradation + +**What we preserve:** +- Zero-config agent integration (@relay: pattern) +- Automatic message delivery (no polling) +- Low latency (<5ms local, +network RTT remote) +- Simple mental model + +**What we add:** +- Cross-server messaging +- Fleet-wide agent discovery +- Peer-to-peer daemon connections +- Message queuing for resilience + +--- + +## 16. Open Questions *(NEW)* + +These questions remain unresolved and need input before/during implementation: + +### Architecture + +1. **Hub vs. Mesh for MVP?** + - Hub is simpler but single point of failure + - Mesh is resilient but more complex + - Recommendation: Start with mesh (static peers), add hub later + +2. **Queue persistence?** + - Memory-only: Simple, but loses messages on crash + - SQLite: Survives restarts, but adds complexity + - Recommendation: Memory for MVP, SQLite for v2 + +3. **NATS priority?** + - Implement WebSocket first, NATS later? + - Or start with NATS to avoid reimplementing? + - Recommendation: WebSocket MVP, NATS for production scale + +### Protocol + +4. **Message ordering guarantees?** + - Per-agent FIFO? Global ordering? Best-effort? + - Strict ordering adds latency and complexity + - Recommendation: Document best-effort, no guarantees + +5. **Broadcast scalability?** + - O(n) messages for n agents - acceptable? + - Need gossip-style fan-out for large fleets? + - Recommendation: Direct broadcast for <50 agents, revisit at scale + +### Security + +6. **Key distribution method?** + - Static config, TOFU, or CA? + - Trade-off: security vs. operational simplicity + - Recommendation: TOFU with approval for dev, CA for enterprise + +7. 
**mTLS required?** + - Adds complexity but strong authentication + - Alternative: TLS + Ed25519 challenge-response + - Recommendation: TLS + challenge-response for MVP + +### Operations + +8. **Testing strategy?** + - How to test multi-server locally? + - Need chaos testing framework? + - Recommendation: Docker Compose for integration tests + +9. **Observability from day one?** + - Add Prometheus metrics in MVP? + - Or defer to post-MVP? + - Recommendation: Basic metrics (connections, messages) in MVP + +10. **NAT traversal?** + - Support servers behind NAT? + - Requires connection reversal or TURN relay + - Recommendation: Document requirement for direct connectivity, defer NAT to v2 + +--- + +## 17. Storage Architecture *(NEW)* + +Federation introduces storage requirements for **ephemeral message routing**. Durable storage for trajectories and work history is handled by the separate [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) project. + +### 17.1 Separation of Concerns + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ STORAGE RESPONSIBILITY │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────────────┐ ┌─────────────────────────────────────┐ │ +│ │ agent-relay │ │ agent-trajectories │ │ +│ │ (This Project) │ │ (Separate Project) │ │ +│ │ │ │ │ │ +│ │ EPHEMERAL STORAGE │ │ DURABLE STORAGE │ │ +│ │ • Peer message queues │ │ • Agent work history │ │ +│ │ • Pending ACKs │ │ • Decisions & retrospectives │ │ +│ │ • Flow control credits │ │ • Knowledge workspace │ │ +│ │ • Connection state │ │ • Exported artifacts │ │ +│ │ • Registry cache │ │ • Semantic search / RAG │ │ +│ │ │ │ │ │ +│ │ Lifetime: minutes/hours│ │ Lifetime: months/years │ │ +│ │ Backend: Memory, NATS │ │ Backend: File, SQLite, PG, S3 │ │ +│ └─────────────────────────┘ └─────────────────────────────────────┘ │ +│ │ +│ Integration: agent-relay emits events → agent-trajectories 
captures them   │
+│                                                                             │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+> **See:** [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) for trajectory format, storage backends, and knowledge workspace.
+>
+> **Memory Layer:** agent-trajectories uses [Mem0](https://github.com/mem0ai/mem0) as the memory substrate. See [MEMORY_STACK_DECISION.md](./MEMORY_STACK_DECISION.md) for rationale.
+
+### 17.2 Ephemeral Storage (Message Routing)
+
+For federation's real-time message routing, **memory is the default**. Messages are transient—they matter for delivery, not history.
+
+#### In-Memory Queues (Default)
+
+```typescript
+class EphemeralStore {
+  // Per-peer outbound queues (for disconnected peers)
+  peerQueues: Map<string, QueuedMessage[]>;
+
+  // Pending delivery confirmations
+  pendingAcks: Map<string, PendingAck>;
+
+  // Flow control state
+  peerCredits: Map<string, number>;
+
+  // Configuration
+  config: {
+    maxQueueSize: 1000;    // Bounded to prevent OOM
+    ackTimeoutMs: 30000;   // Expire pending ACKs after 30s
+    queueTtlMs: 3600000;   // Drop queued messages after 1 hour
+  };
+}
+```
+
+**Properties:**
+- ✅ Fast (no I/O)
+- ✅ Simple (no external deps)
+- ❌ Lost on daemon restart
+- ❌ Limited by available memory
+
+**When this is fine:**
+- Most messages deliver immediately
+- Disconnections are brief (seconds to minutes)
+- Acceptable to lose queued messages on crash
+
+#### NATS JetStream (Optional Upgrade)
+
+When using NATS transport (Section 11), streams provide ephemeral persistence:
+
+```typescript
+// NATS stream for routing messages
+const routingStream = {
+  name: 'RELAY_ROUTING',
+  subjects: ['relay.route.*', 'relay.broadcast'],
+  retention: RetentionPolicy.Limits,
+  max_age: 3600 * 1e9,            // 1 hour retention
+  max_bytes: 100 * 1024 * 1024,   // 100 MB max
+  discard: DiscardPolicy.Old,     // Drop oldest on limit
+};
+```
+
+**Properties:**
+- ✅ Survives daemon restarts
+- ✅ Shared across peers (no per-peer queuing)
+- ✅ Built-in flow control and backpressure
+- ❌ 
External dependency +- ❌ Additional operational complexity + +**When to use NATS:** +- High message volume +- Long disconnection tolerance needed +- Already have NATS infrastructure + +### 17.3 Integration with agent-trajectories + +agent-relay emits events that [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) can capture: + +```typescript +// agent-relay daemon emits events +interface RelayEvent { + type: 'message_sent' | 'message_received' | 'broadcast'; + from: string; + to: string; + content: string; + ts: number; + messageId: string; +} + +// Event emitter in daemon +this.events.emit('relay:message', { + type: 'message_sent', + from: 'Alice', + to: 'Bob', + content: 'Can you review auth.ts?', + ts: Date.now(), + messageId: 'msg-abc123' +}); +``` + +agent-trajectories subscribes to these events and incorporates them into task trajectories as inter-agent communication records. + +### 17.4 Storage Configuration + +```yaml +# /etc/agent-relay/config.yaml + +storage: + # Ephemeral (routing) - managed by agent-relay + ephemeral: + type: memory # memory | nats + max_queue_per_peer: 1000 + queue_ttl_ms: 3600000 + + # If using NATS + nats: + stream: RELAY_ROUTING + max_age_seconds: 3600 + max_bytes: 104857600 # 100 MB + + # Event emission for agent-trajectories integration + events: + enabled: true + emit_to: 'unix:///tmp/agent-trajectories/events.sock' +``` + +### 17.5 Federation Impact on Storage + +When federation is enabled: + +| Concern | Single Server | Federated Fleet | +|---------|---------------|-----------------| +| **Routing queues** | Per-agent | Per-peer + per-agent | +| **Registry** | Local only | Fleet-wide sync | +| **Event emission** | Optional | Recommended (for trajectories) | + +**Recommendations:** + +1. **Routing:** Use NATS if available, otherwise memory with bounded queues +2. **Registry:** Memory + periodic persistence (survive restarts) +3. 
**Events:** Enable emission to agent-trajectories for cross-agent history + +--- + +## Summary (v2) + +This revised proposal addresses the critical issues identified in review: + +| Issue | Resolution | +|-------|------------| +| No end-to-end ACK | Added delivery confirmation (Section 7) | +| Registry split-brain | Fleet-wide unique names + quorum (Section 5.3) | +| Token scaling | Asymmetric keys (Section 8.1) | +| No backpressure | Credit-based flow control (Section 9) | +| Timeline unrealistic | Revised to 8-10 weeks (Section 14.3) | +| NATS consideration | Pluggable transport layer (Section 11) | +| Trajectory storage | Delegated to [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) (Section 17) | + +**Key additions in v2:** +- End-to-end delivery confirmation +- Fleet-wide unique name enforcement +- Ed25519 authentication (scales better) +- Credit-based flow control + rate limiting +- Transport abstraction for NATS option +- Ephemeral storage for routing, events emitted to agent-trajectories +- Realistic timeline with MVP option +- Open questions for discussion + +--- + +## Next Steps + +1. Review v2 proposal, discuss open questions +2. Decide on MVP scope +3. Create implementation tasks in Beads +4. Begin Phase 1: Foundation diff --git a/docs/FEDERATION_PROPOSAL_REVIEW.md b/docs/FEDERATION_PROPOSAL_REVIEW.md new file mode 100644 index 00000000..833b65e6 --- /dev/null +++ b/docs/FEDERATION_PROPOSAL_REVIEW.md @@ -0,0 +1,584 @@ +# Federation Proposal: Critical Review + +A critical analysis of FEDERATION_PROPOSAL.md identifying gaps, risks, and areas needing more thought. 
+ +--- + +## Executive Summary: Major Concerns + +| Category | Severity | Issue | +|----------|----------|-------| +| **Delivery Guarantees** | 🔴 High | No end-to-end acknowledgment | +| **Registry Consistency** | 🔴 High | Race conditions in name resolution | +| **Security** | 🟡 Medium | Token management doesn't scale | +| **Operational** | 🟡 Medium | Debugging distributed failures is hard | +| **Timeline** | 🟡 Medium | 4-5 weeks is optimistic | +| **NAT/Firewall** | 🟡 Medium | Assumes direct connectivity | + +--- + +## 1. Fundamental Architecture Issues + +### 1.1 🔴 No End-to-End Delivery Guarantee + +**The Problem:** + +``` +Alice@A → Daemon A → Daemon B → ??? → Bob receives? + ↑ ↑ + ACK ACK + (local) (peer) + +But does Bob's agent actually SEE the message? +``` + +The proposal has ACKs between daemons, but: +- No confirmation that `tmux send-keys` succeeded +- No confirmation that the agent processed the message +- Sender Alice has no idea if Bob actually received it + +**Real failure modes:** +- Bob's tmux session crashed between delivery and injection +- Bob's agent is in a blocking state (waiting for human input) +- Injection happened but Bob's agent ignored it (prompt too long, agent confused) + +**Recommendation:** +Add optional end-to-end ACK pattern: +``` +Alice sends → Bob receives → Bob's daemon detects "Relay message" in output + → Bob's daemon sends DELIVERY_CONFIRMED back to Alice +``` + +### 1.2 🔴 Registry Consistency Race Conditions + +**The Problem:** + +The proposal says "first-registered wins" for name collisions, but with async gossip: + +``` +Time 0: Server A has no "Bob" + Server B has no "Bob" + +Time 1: Alice on A starts "Bob" agent + Carol on B starts "Bob" agent + +Time 2: A sends PEER_SYNC: "Bob joined" + B sends PEER_SYNC: "Bob joined" + (messages cross in flight) + +Time 3: Both A and B think THEIR Bob is the "real" Bob + Fleet has split-brain on who "Bob" is +``` + +**Result:** `@relay:Bob` from Server C routes to different agents 
depending on which PEER_SYNC arrived first. Completely non-deterministic. + +**Recommendation:** +- Use Lamport timestamps or vector clocks for ordering +- Or: require unique names fleet-wide (reject registration if name exists anywhere) +- Or: always require qualified names (`Bob@server-a`) + +### 1.3 🟡 Message Ordering Not Guaranteed + +**The Problem:** + +``` +Alice sends M1 to Bob +Alice sends M2 to Bob +Network hiccup: M1 queued, M2 takes different path +Bob receives: M2, then M1 + +Bob: "Why is Alice saying 'yes' before asking the question?" +``` + +The proposal mentions sequence numbers per-topic but doesn't specify: +- Are they enforced on delivery? +- What happens to out-of-order messages? +- How do sequence numbers work across server restarts? + +**Recommendation:** +- Add explicit ordering guarantees (or document that there are none) +- Consider per-conversation sequence numbers +- Or: accept eventual consistency and document it clearly + +--- + +## 2. Security Issues + +### 2.1 🟡 Token Management Doesn't Scale + +**The Problem:** + +With N servers, you need N² tokens (each pair needs a shared secret): + +``` +3 servers: 3 tokens (A-B, A-C, B-C) +5 servers: 10 tokens +10 servers: 45 tokens +20 servers: 190 tokens +``` + +Managing 190 tokens across 20 servers is operational nightmare. + +**Additional concerns:** +- No token rotation mechanism specified +- Tokens in config files can leak +- No revocation process + +**Recommendation:** +- Use asymmetric keys (each server has keypair, sign challenges) +- Or: single CA, mTLS with auto-rotation +- Or: hub-based auth (servers auth to hub, hub vouches for peers) + +### 2.2 🟡 No Message-Level Authentication + +**The Problem:** + +Once a peer connection is established, any message from that peer is trusted: + +``` +Malicious/compromised Server B: + - Connects legitimately to A + - Sends: PEER_ROUTE { from: "Alice@A", to: "Bob@A", ... 
} + - Bob thinks Alice sent it, but B fabricated it +``` + +The `from_server` field isn't cryptographically verified per-message. + +**Recommendation:** +- Sign each message with server's private key +- Include HMAC of message content +- Verify signature before processing + +### 2.3 🟡 No Rate Limiting Specified + +**The Problem:** + +A misbehaving peer can flood the fleet: + +``` +Server B: for i in 1..1000000: send(PEER_BROADCAST, "spam") + +Result: All agents on all servers get 1M messages injected + Fleet is DoS'd +``` + +**Recommendation:** +- Per-peer rate limits +- Per-agent rate limits +- Backpressure signaling (BUSY message already exists locally, extend to peers) + +--- + +## 3. Operational Issues + +### 3.1 🟡 Debugging Distributed Failures is Hard + +**The Problem:** + +"My message from Alice@NYC to Bob@London never arrived. Why?" + +The proposal has no: +- Distributed tracing (correlation IDs across servers) +- Message tracking ("where is message X right now?") +- Visibility into queue depths +- Alerting on delivery failures + +**Scenario:** +``` +1. Alice sends message +2. NYC daemon routes to London +3. London daemon queues (Bob's agent busy) +4. London daemon crashes, loses queue +5. Bob never receives +6. Nobody knows what happened +``` + +**Recommendation:** +- Add correlation ID to all messages +- Log all routing decisions with correlation ID +- Add `agent-relay trace ` command +- Persist queue to disk (not just memory) + +### 3.2 🟡 No Graceful Fleet Operations + +**The Problem:** + +How do you: +- Drain a server (migrate agents before shutdown)? +- Rolling upgrade the fleet? +- Add capacity without disruption? 
+ +The proposal doesn't address: +- Agent migration between servers +- Planned maintenance mode +- Capacity planning + +**Recommendation:** +- Add DRAINING state (accept no new agents, continue routing) +- Add agent handoff protocol +- Document operational runbooks + +### 3.3 🟡 Configuration Drift + +**The Problem:** + +With config files per server: +- Server A thinks B's token is X +- Server B rotated to token Y +- Connection fails, hard to debug + +No central config management, no config validation. + +**Recommendation:** +- Add `agent-relay config validate` command +- Add config sync mechanism (or document best practices) +- Health check should verify peer auth before claiming "healthy" + +--- + +## 4. Protocol Issues + +### 4.1 🟡 No Backpressure Across Servers + +**The Problem:** + +Local daemon has BUSY message for backpressure. But across servers: + +``` +Server A sends 1000 msgs/sec to Server B +Server B can only inject 10 msgs/sec (agents are slow) +Server B's queue grows unbounded → OOM +``` + +**Recommendation:** +- Add PEER_BUSY message +- Flow control with credits/windows +- Bounded queues with drop policy (oldest? newest? random?) + +### 4.2 🟡 Broadcast Scalability + +**The Problem:** + +`@relay:*` with 50 agents across 10 servers: + +``` +Origin server: + - 5 local agents → 5 local deliveries + - 9 peers → 9 PEER_BROADCAST messages + +Each peer: + - 5 local agents → 5 local deliveries + +Total: 5 + (9 × 5) = 50 deliveries (correct) +But: 9 WebSocket messages sent simultaneously +``` + +For larger fleets: +- 100 agents, 20 servers → 19 peer broadcasts +- Each broadcast must be processed fully + +No fan-out optimization, no multicast. + +**Recommendation:** +- For hub topology: single broadcast to hub, hub fans out +- For mesh: consider gossip-style propagation +- Rate limit broadcasts + +### 4.3 🟡 Large Message Handling + +**The Problem:** + +Max message size is 1 MiB. But: +- What if an agent tries to send 5 MiB? +- Silent truncation? Error? Split? 
+ +Not specified. + +**Recommendation:** +- Return NACK with PAYLOAD_TOO_LARGE +- Or: implement message chunking +- Document limits clearly to agent implementers + +--- + +## 5. Edge Cases Not Addressed + +### 5.1 🟡 NAT and Firewall Traversal + +**The Problem:** + +The proposal assumes direct connectivity: + +``` +Server A (public IP) ──► Server B (behind NAT) + ↑ + Cannot initiate inbound +``` + +Many production servers are behind NATs, firewalls, or in private VPCs. + +**Recommendation:** +- Document network requirements explicitly +- Consider connection reversal (B connects to A) +- Consider TURN-style relay for NAT traversal +- Or: explicitly require hub topology for NAT scenarios + +### 5.2 🟡 Clock Skew + +**The Problem:** + +TTL expiration uses timestamps: +```typescript +expires_at: Date.now() + 3600000 // 1 hour +``` + +But if Server A's clock is 30 minutes ahead of B's: +- A queues message with expires_at = A.now + 1hr +- A reconnects to B after 45 min (A's time) +- B receives message, checks expires_at against B.now +- B.now is only 15 min past message creation (from B's perspective) +- Message still valid... but conceptually stale + +Or worse: clocks very far off could cause immediate expiration. + +**Recommendation:** +- Use relative TTL in message (ttl_ms: 3600000) +- Receiving server applies TTL from receipt time +- Or: require NTP sync, document assumption + +### 5.3 🟡 Server ID Collisions + +**The Problem:** + +Two servers configured with same ID: + +```yaml +# Server in NYC +server: + id: production + +# Server in London (copy-paste error) +server: + id: production +``` + +Both connect to hub. Registry confused. Routing broken. + +**Recommendation:** +- Validate ID uniqueness on connection +- Reject PEER_HELLO if server_id already registered +- Generate default ID from hostname/MAC if not configured + +### 5.4 🟡 Message Replay + +**The Problem:** + +No replay protection: +``` +1. Attacker captures PEER_ROUTE message +2. 
Attacker replays it 1000 times +3. Bob gets same message 1000 times +``` + +**Recommendation:** +- Add nonce/message ID to dedup +- Track seen message IDs (with expiry) +- Already have `id` field, just need dedup + +--- + +## 6. Missing Features + +### 6.1 🟡 No Message Priorities + +**The Problem:** + +All messages treated equally. But: +- System messages (peer down notifications) should be urgent +- Bulk status updates can wait +- User-initiated messages more important than background sync + +**Recommendation:** +- Add priority field (LOW, NORMAL, HIGH, SYSTEM) +- Separate queues per priority +- Process high priority first + +### 6.2 🟡 No Metrics or Observability + +**The Problem:** + +How do you know if federation is healthy? + +No specified: +- Message latency histograms +- Delivery success rates +- Queue depths +- Peer connection status +- Error rates by type + +**Recommendation:** +- Add Prometheus metrics endpoint +- Key metrics: peer_connection_state, messages_routed_total, messages_queued, routing_latency_seconds +- Integrate with dashboard + +### 6.3 🟡 No Testing Strategy + +**The Problem:** + +How do you test federation? + +- Unit tests for protocol parsing ✓ (mentioned) +- Integration tests across "servers"? Not mentioned +- Chaos testing (network partitions, slow peers)? Not mentioned +- Performance benchmarks? Not mentioned + +**Recommendation:** +- Add multi-daemon integration test harness +- Simulate network conditions (latency, packet loss) +- Chaos tests: kill peers, corrupt messages, reorder +- Benchmark: messages/sec at various fleet sizes + +--- + +## 7. Timeline Concerns + +### 7.1 🟡 4-5 Weeks is Optimistic + +**The Reality:** + +Distributed systems are hard. 
The proposal underestimates: + +| Phase | Proposed | Realistic | +|-------|----------|-----------| +| Foundation | 1 week | 1.5-2 weeks (WebSocket edge cases) | +| Registry | 1 week | 2 weeks (consistency is hard) | +| Routing | 1 week | 1.5 weeks (broadcast complexity) | +| Resilience | 1 week | 2-3 weeks (reconnection, queuing, testing) | +| Security | 1 week | 2 weeks (TLS setup, token management) | +| **Total** | 4-5 weeks | **8-10 weeks** | + +Plus: +- Integration testing +- Documentation +- Bug fixes from early testing +- Edge cases discovered in use + +**Recommendation:** +- Double the estimate +- Plan for Phase 6: Stabilization (2 weeks of bug fixes) +- MVP first: mesh without hub, no TLS, basic routing + +--- + +## 8. Alternative Approaches Worth Considering + +### 8.1 Why Not Use NATS/Redis? + +**The proposal dismisses external dependencies but doesn't fully justify.** + +NATS JetStream provides: +- ✅ Persistent queues (survive restart) +- ✅ Exactly-once delivery +- ✅ Clustering/HA built-in +- ✅ Backpressure +- ✅ Observability +- ✅ Battle-tested + +Custom implementation provides: +- ✅ No external dependency +- ✅ Full control +- ❌ Must implement all of the above + +**Honest trade-off:** +- Custom: 8-10 weeks dev, ongoing maintenance, custom bugs +- NATS: 1 week integration, proven reliability, learning curve + +**Recommendation:** +At minimum, document why custom is preferred. Consider NATS for production, custom for dev/simple cases. + +### 8.2 Simpler Alternative: SSH Tunnels + +For small fleets, SSH tunnels might be simpler: + +```bash +# On NYC server, create tunnel to London +ssh -L 8765:localhost:8765 london.example.com + +# Local daemon connects to localhost:8765 +# Tunnel forwards to London's daemon +``` + +Benefits: +- Auth handled by SSH (keys, etc.) 
+- Encryption handled by SSH +- No new code needed +- Operationally familiar + +Downsides: +- Manual tunnel management +- Single point of failure per tunnel +- Doesn't scale to large fleets + +**Recommendation:** +Document SSH tunnel option for simple 2-3 server setups. + +--- + +## 9. Recommendations Summary + +### Must Fix Before Implementation + +1. **End-to-end delivery confirmation** - Sender must know message was injected +2. **Registry consistency** - Define conflict resolution, prevent split-brain +3. **Message deduplication** - Prevent replays using message ID + +### Should Address in v1 + +4. **Bounded queues** - Prevent OOM from slow peers +5. **Distributed tracing** - Correlation IDs for debugging +6. **Token rotation** - Or switch to asymmetric auth +7. **Rate limiting** - Prevent flood attacks + +### Can Defer to v2 + +8. **Message priorities** +9. **Graceful drain/migration** +10. **Metrics/observability** +11. **Hub HA** + +### Revise Estimates + +- **Realistic timeline: 8-10 weeks** +- Plan stabilization phase +- Consider MVP with reduced scope + +--- + +## 10. Suggested MVP Scope + +To ship something useful faster, consider this reduced scope: + +**MVP (4 weeks):** +- Static peer list only (no hub) +- No TLS (rely on VPN/private network) +- Single token per fleet (not per-pair) +- Basic registry (no conflict handling - require unique names) +- No queue persistence (memory only) +- No message priorities + +**Post-MVP:** +- TLS + proper auth +- Hub for discovery +- Queue persistence +- Conflict resolution +- Observability + +This gets cross-server messaging working quickly, then hardens iteratively. + +--- + +## Conclusion + +The federation proposal has solid architectural bones—separation of routing and injection is the right call. However, it underestimates the complexity of distributed systems and glosses over critical details around consistency, delivery guarantees, and operations. 
+ +**Verdict:** Good foundation, needs refinement before implementation. Address the 🔴 High severity issues, revise timeline, and consider an MVP approach. diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md new file mode 100644 index 00000000..a0c3da10 --- /dev/null +++ b/docs/HOOKS_API.md @@ -0,0 +1,1138 @@ +# Agent Relay Hooks API + +**Date:** 2025-12-21 +**Status:** Proposed + +## Overview + +Hooks are a core primitive in agent-relay that allow: +1. **Intercepting agent output** - React to patterns, events, session lifecycle +2. **Injecting prompts** - Guide agent behavior automatically +3. **Extending with namespaces** - User-defined `@pattern:` handlers + +## Pattern Namespaces + +agent-relay intercepts output patterns in the format `@namespace:target message`. + +### Built-in Namespaces + +| Namespace | Purpose | Example | +|-----------|---------|---------| +| `@relay:` | Inter-agent messaging | `@relay:Alice Check the tests` | +| `@memory:` | Memory operations | `@memory:save User prefers dark mode` | +| `@broadcast:` | Broadcast to all | `@relay:* Status update` | + +### Memory Namespace + +``` +@memory:save # Store a memory +@memory:search # Retrieve relevant memories +@memory:forget # Delete a memory +@memory:list # List recent memories +``` + +### User-Defined Namespaces + +Users can register custom pattern handlers: + +```typescript +// relay.config.ts +export default { + patterns: { + // Custom namespace: @deploy: + deploy: { + handler: async (target, message, context) => { + if (target === 'staging') { + await exec('npm run deploy:staging'); + return { inject: 'Deployed to staging successfully' }; + } + } + }, + + // Custom namespace: @notify: + notify: { + handler: async (target, message, context) => { + await fetch('https://slack.com/api/post', { + body: JSON.stringify({ channel: target, text: message }) + }); + } + } + } +}; +``` + +Usage in agent output: +``` +@deploy:staging Release v1.2.3 +@notify:#engineering Build complete +``` + +## Hook 
Lifecycle + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ HOOK LIFECYCLE │ +│ │ +│ SESSION START │ +│ │ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ onSessionStart │ → Inject initial context, load memories │ +│ └────────┬────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ AGENT RUNNING │ │ +│ │ │ │ +│ │ Agent Output ──► onOutput ──► Pattern Match? ──► Handler │ │ +│ │ │ │ │ │ +│ │ │ ▼ │ │ +│ │ │ @relay: → route message │ │ +│ │ │ @memory: → store/search │ │ +│ │ │ @custom: → user handler │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ onToolCall ──► Before/after tool execution │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ onMessageReceived ──► Inject incoming relay messages │ │ +│ │ │ │ │ +│ │ ▼ │ │ +│ │ onIdle ──► Periodic prompts (memory review, status) │ │ +│ │ │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────┐ │ +│ │ onSessionEnd │ → Prompt for memory save, cleanup │ +│ └─────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## Lifecycle Events: Detailed Specification + +### onSessionStart + +**When:** Immediately after tmux session is created, before agent CLI starts producing output. + +**Trigger point:** `TmuxWrapper.start()` after spawn, before first `pollOutput()`. 
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+async start(command: string) {
+  await this.spawnTmuxSession(command);
+
+  // TRIGGER: onSessionStart
+  const result = await this.hooks.emit('sessionStart', {
+    agentId: this.agentId,
+    agentName: this.agentName,
+    sessionId: this.sessionId,
+    workingDir: process.cwd(),
+  });
+
+  // Inject any returned text (e.g., loaded memories)
+  if (result?.inject) {
+    await this.injectText(result.inject);
+  }
+
+  this.startPolling();
+}
+```
+
+**Use cases:**
+- Load relevant memories from Mem0 based on project/directory
+- Inject user preferences ("User prefers TypeScript")
+- Set up agent context ("You are working on the auth module")
+
+**Handler signature:**
+```typescript
+onSessionStart: (ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Example - Inject project context:**
+```typescript
+onSessionStart: async (ctx) => {
+  // Greet agent with project info
+  return {
+    inject: `[CONTEXT] You are working in ${ctx.projectName}.
+Working directory: ${ctx.workingDir}
+Remember to save important learnings with @memory:save`,
+    log: `Session started: ${ctx.agentName} in ${ctx.projectName}`
+  };
+}
+```
+
+**Example - Role-based context:**
+```typescript
+onSessionStart: async (ctx) => {
+  const roles = {
+    'Reviewer': 'Focus on code quality, security, and best practices.',
+    'Architect': 'Design systems, make technical decisions.',
+    'Developer': 'Implement features, fix bugs, write tests.',
+  };
+  const role = roles[ctx.agentName] || 'General assistant.';
+  return { inject: `[ROLE] ${role}` };
+}
+```
+
+---
+
+### onOutput
+
+**When:** Every time new output is captured from the agent (polled every 100ms, fires on diff).
+
+**Trigger point:** `TmuxWrapper.pollOutput()` when `newOutput !== lastOutput`.
+ +```typescript +// In src/wrapper/tmux-wrapper.ts +async pollOutput() { + const paneContent = await this.capturePane(); + const newOutput = this.diffOutput(paneContent, this.lastContent); + + if (newOutput) { + this.lastContent = paneContent; + this.lastOutputTime = Date.now(); + + // TRIGGER: onOutput + await this.hooks.emit('output', newOutput, this.context); + + // Check for @pattern: matches + await this.matchPatterns(newOutput); + } +} +``` + +**Use cases:** +- Log all agent output to file/database +- Detect errors and alert +- Track progress metrics +- Custom pattern matching beyond @namespace: + +**Handler signature:** +```typescript +onOutput: (output: string, ctx: HookContext) => Promise +``` + +**Note:** This fires frequently. Keep handlers fast. Don't inject on every output. + +**Example - Error detection:** +```typescript +onOutput: async (output, ctx) => { + // Alert coordinator when errors occur + if (output.includes('Error:') || output.includes('FATAL')) { + return { + sendMessage: { + to: 'Coordinator', + content: `[ALERT] ${ctx.agentName} hit error: ${output.slice(0, 200)}` + }, + log: `Error in ${ctx.agentName}: ${output.slice(0, 100)}` + }; + } +} +``` + +**Example - Progress tracking:** +```typescript +onOutput: async (output, ctx) => { + // Log test results + if (output.includes('PASS') || output.includes('FAIL')) { + return { log: `[TEST] ${ctx.agentName}: ${output.slice(0, 150)}` }; + } +} +``` + +**Example - Keyword alerting:** +```typescript +onOutput: async (output, ctx) => { + // Notify on security-related output + const keywords = ['vulnerability', 'CVE-', 'security', 'exploit']; + if (keywords.some(k => output.toLowerCase().includes(k))) { + return { + sendMessage: { to: 'Security', content: `Review needed: ${output.slice(0, 300)}` } + }; + } +} +``` + +--- + +### onIdle + +**When:** Agent has produced no output for `idleThreshold` (default 30 seconds). + +**Trigger point:** `TmuxWrapper.pollOutput()` when idle time exceeds threshold. 
+ +```typescript +// In src/wrapper/tmux-wrapper.ts +private idleThreshold = 30000; // 30 seconds +private lastIdleNotification = 0; + +async pollOutput() { + // ... capture and diff ... + + const idleTime = Date.now() - this.lastOutputTime; + + // TRIGGER: onIdle (once per idle period, not continuously) + if (idleTime > this.idleThreshold && + Date.now() - this.lastIdleNotification > this.idleThreshold) { + this.lastIdleNotification = Date.now(); + + const result = await this.hooks.emit('idle', this.context); + if (result?.inject) { + await this.injectText(result.inject); + } + } +} +``` + +**Use cases:** +- Prompt agent for status update +- Ask if agent is stuck or needs help +- Suggest next steps +- Trigger auto-save of work in progress + +**Handler signature:** +```typescript +onIdle: (ctx: HookContext) => Promise +``` + +**Configuration:** +```typescript +// relay.config.ts +export default { + hooks: { + onIdle: async (ctx) => { + return { inject: '[STATUS CHECK] Are you making progress?' }; + } + }, + options: { + idleThreshold: 60000, // 60 seconds instead of default 30 + } +}; +``` + +**Example - Escalating idle prompts:** +```typescript +onIdle: async (ctx) => { + // Gentle prompt after 30s, escalate after 2min + if (ctx.idleSeconds > 120) { + return { + inject: '[STUCK?] No activity for 2+ minutes. Need help?', + sendMessage: { to: 'Coordinator', content: `${ctx.agentName} idle for ${ctx.idleSeconds}s` } + }; + } else if (ctx.idleSeconds > 30) { + return { inject: '[STATUS] Still working? Update with @relay:* STATUS: ...' 
}; + } +} +``` + +**Example - Auto-save reminder:** +```typescript +onIdle: async (ctx) => { + return { + inject: '[REMINDER] Consider saving progress: @memory:save ' + }; +} +``` + +**Example - Notify coordinator only (no injection):** +```typescript +onIdle: async (ctx) => { + // Silent monitoring - don't interrupt agent + if (ctx.idleSeconds > 60) { + return { + sendMessage: { to: 'Coordinator', content: `${ctx.agentName} idle ${ctx.idleSeconds}s` }, + log: `Idle alert: ${ctx.agentName}` + }; + } +} +``` + +--- + +### onMessageReceived + +**When:** A relay message arrives for this agent from another agent or broadcast. + +**Trigger point:** `TmuxWrapper.handleIncomingMessage()` when daemon delivers message. + +```typescript +// In src/wrapper/tmux-wrapper.ts +async handleIncomingMessage(message: RelayMessage) { + // TRIGGER: onMessageReceived (before injection) + const result = await this.hooks.emit('messageReceived', message, this.context); + + // Allow handler to modify or suppress injection + if (result?.suppress) { + return; // Don't inject this message + } + + const textToInject = result?.inject || this.formatMessage(message); + await this.injectText(textToInject); +} +``` + +**Use cases:** +- Custom message formatting +- Filter/suppress certain messages +- Log incoming messages +- Transform message content +- Route to different handlers based on sender + +**Handler signature:** +```typescript +onMessageReceived: (message: RelayMessage, ctx: HookContext) => Promise +``` + +**Example - Custom formatting:** +```typescript +onMessageReceived: async (msg, ctx) => { + // Add priority indicator + const priority = msg.metadata?.urgent ? 
'🚨 URGENT' : '📨'; + return { + inject: `${priority} Message from ${msg.from}: ${msg.content}` + }; +} +``` + +**Example - Suppress broadcasts while focused:** +```typescript +onMessageReceived: async (msg, ctx) => { + // Suppress status broadcasts, keep direct messages + if (msg.from === '*' && msg.content.startsWith('STATUS:')) { + return { suppress: true, log: `Suppressed broadcast: ${msg.content.slice(0, 50)}` }; + } +} +``` + +**Example - Filter by sender:** +```typescript +onMessageReceived: async (msg, ctx) => { + // Only accept messages from Coordinator and Reviewer + const allowedSenders = ['Coordinator', 'Reviewer']; + if (!allowedSenders.includes(msg.from)) { + return { suppress: true, log: `Blocked message from ${msg.from}` }; + } +} +``` + +**Example - Transform task assignments:** +```typescript +onMessageReceived: async (msg, ctx) => { + // Reformat task assignments with clear structure + if (msg.content.startsWith('TASK:')) { + const task = msg.content.replace('TASK:', '').trim(); + return { + inject: ` +─────────────────────────────── +NEW TASK from ${msg.from}: +${task} +───────────────────────────────` + }; + } +} +``` + +--- + +### onSessionEnd + +**When:** Agent session is ending (user pressed Ctrl+C, agent exited, or explicit stop). + +**Trigger point:** `TmuxWrapper.stop()` or SIGINT/SIGTERM handler. 
+ +```typescript +// In src/wrapper/tmux-wrapper.ts +async stop() { + // TRIGGER: onSessionEnd (before cleanup) + const result = await this.hooks.emit('sessionEnd', this.context); + + if (result?.inject) { + await this.injectText(result.inject); + // Give agent time to process and respond + await this.waitForResponse(5000); + } + + await this.cleanup(); +} + +// Also in signal handlers +process.on('SIGINT', async () => { + await wrapper.stop(); + process.exit(0); +}); +``` + +**Use cases:** +- Prompt agent to save important learnings +- Capture final summary +- Cleanup resources +- Save session transcript + +**Handler signature:** +```typescript +onSessionEnd: (ctx: HookContext) => Promise +``` + +**Example - Memory prompt:** +```typescript +onSessionEnd: async (ctx) => { + return { + inject: ` +[SESSION ENDING] +Before you go, save any important learnings: + @memory:save +` + }; +} +``` + +**Example - Notify team of departure:** +```typescript +onSessionEnd: async (ctx) => { + const duration = Math.round((Date.now() - ctx.sessionStartTime) / 60000); + return { + inject: '[ENDING] Save your progress!', + sendMessage: { + to: '*', + content: `${ctx.agentName} signing off after ${duration} minutes` + }, + log: `Session ended: ${ctx.agentName} (${duration}m)` + }; +} +``` + +**Example - Request summary before exit:** +```typescript +onSessionEnd: async (ctx) => { + return { + inject: `[SESSION END] Please provide a brief summary: +1. What did you accomplish? +2. What's left to do? +3. Any blockers for the next agent? + +Reply, then I'll save your response.` + }; +} +``` + +**Example - Silent logging only:** +```typescript +onSessionEnd: async (ctx) => { + // No injection, just audit log + return { + log: `END: ${ctx.agentName} | project: ${ctx.projectName} | messages: ${ctx.recentMessages.length}` + }; +} +``` + +--- + +### onToolCall (Future) + +**When:** Agent invokes a tool (requires parsing tool calls from output). 
+
+**Status:** Future enhancement - requires understanding agent's tool output format.
+
+```typescript
+onToolCall: (tool: string, args: any, ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Use cases:**
+- Audit tool usage
+- Block dangerous operations
+- Inject additional context before tool runs
+
+---
+
+## Implementation Location
+
+All lifecycle events are triggered from `src/wrapper/tmux-wrapper.ts`:
+
+```typescript
+// src/wrapper/tmux-wrapper.ts - Current structure
+export class TmuxWrapper {
+  // ADD: Hook emitter
+  private hooks: HookEmitter;
+
+  constructor(config: WrapperConfig) {
+    this.hooks = new HookEmitter(config.hooks);
+  }
+
+  async start() { /* triggers onSessionStart */ }
+  async pollOutput() { /* triggers onOutput, onIdle */ }
+  async handleIncomingMessage() { /* triggers onMessageReceived */ }
+  async stop() { /* triggers onSessionEnd */ }
+}
+```
+
+New file needed: `src/hooks/emitter.ts`:
+
+```typescript
+// src/hooks/emitter.ts
+export class HookEmitter {
+  private handlers: Map<HookEvent, HookHandler[]>;
+
+  constructor(config?: HooksConfig) {
+    this.handlers = new Map();
+    if (config) this.loadFromConfig(config);
+  }
+
+  on(event: HookEvent, handler: HookHandler) {
+    const existing = this.handlers.get(event) || [];
+    this.handlers.set(event, [...existing, handler]);
+  }
+
+  async emit(event: HookEvent, ...args: any[]): Promise<HookResult | void> {
+    const handlers = this.handlers.get(event) || [];
+    let result: HookResult | void;
+
+    for (const handler of handlers) {
+      result = await handler(...args);
+      if (result?.stop) break; // Stop propagation
+    }
+
+    return result;
+  }
+}
+```
+
+## Event Summary Table
+
+| Event | Trigger | Frequency | Can Inject? 
| Use Case | +|-------|---------|-----------|-------------|----------| +| `onSessionStart` | Session spawn | Once | ✅ Yes | Load memories, set context | +| `onOutput` | New output captured | Many (100ms poll) | ⚠️ Rarely | Logging, error detection | +| `onIdle` | No output for 30s | Periodic | ✅ Yes | Status prompts | +| `onMessageReceived` | Relay message arrives | Per message | ✅ Yes | Custom formatting | +| `onSessionEnd` | Session closing | Once | ✅ Yes | Save memories, cleanup | +| `onToolCall` | Tool invoked | Per tool | Future | Audit, block | + +## Hook API + +### Configuration File + +```typescript +// relay.config.ts (in project root or ~/.config/agent-relay/) +import type { RelayConfig } from 'agent-relay'; + +export default { + // Pattern handlers (namespaces) + patterns: { + memory: 'builtin', // Use built-in memory handler + deploy: { handler: myDeployHandler }, + }, + + // Lifecycle hooks + hooks: { + onSessionStart: async (ctx) => { + // Load relevant memories + const memories = await ctx.memory.search(ctx.workingDir); + return { inject: `Relevant context:\n${memories}` }; + }, + + onSessionEnd: async (ctx) => { + return { + inject: `Session ending. 
Save any important learnings with @memory:save` + }; + }, + + onOutput: async (output, ctx) => { + // Custom output processing + if (output.includes('ERROR')) { + await ctx.notify('errors', output); + } + }, + + onIdle: async (ctx) => { + // Called after 30s of no output + // Could prompt for status update + }, + }, + + // Memory configuration + memory: { + backend: 'mem0', // or 'qdrant', 'custom' + autoSave: false, // Don't auto-extract, let agent decide + promptOnEnd: true, // Prompt to save at session end + }, +} satisfies RelayConfig; +``` + +### Programmatic API + +```typescript +import { Relay } from 'agent-relay'; + +const relay = new Relay({ + name: 'MyAgent', +}); + +// Register pattern handler +relay.pattern('deploy', async (target, message, ctx) => { + console.log(`Deploying to ${target}: ${message}`); + await deploy(target); + return { inject: `Deployed to ${target}` }; +}); + +// Register lifecycle hook +relay.on('sessionStart', async (ctx) => { + const memories = await loadMemories(ctx.agentId); + ctx.inject(`Your memories:\n${memories}`); +}); + +relay.on('sessionEnd', async (ctx) => { + ctx.inject('Save important learnings with @memory:save'); +}); + +// Start with wrapped command +relay.wrap('claude'); +``` + +### Hook Context (Read-Only) + +Hooks receive a **read-only context** with limited information. They cannot take actions directly - they return a `HookResult` that the relay system interprets. 
+
+```typescript
+interface HookContext {
+  // Agent identity (read-only)
+  readonly agentId: string;
+  readonly agentName: string;
+  readonly sessionId: string;
+
+  // Environment (read-only)
+  readonly workingDir: string;
+  readonly projectName: string;
+
+  // Session state (read-only, last N items)
+  readonly recentOutput: readonly string[]; // Last 50 output chunks
+  readonly recentMessages: readonly Message[]; // Last 20 messages
+
+  // Timing (read-only)
+  readonly sessionStartTime: number;
+  readonly lastOutputTime: number;
+  readonly idleSeconds: number;
+}
+```
+
+### Hook Result (Allowed Actions)
+
+Hooks communicate intent via return value. The relay system executes allowed actions.
+
+```typescript
+interface HookResult {
+  // Text injection (max 2000 chars, sanitized)
+  inject?: string;
+
+  // Suppress default behavior (onMessageReceived only)
+  suppress?: boolean;
+
+  // Stop other handlers from running
+  stop?: boolean;
+
+  // Send relay message (limited to 1 per hook invocation)
+  sendMessage?: {
+    to: string; // Agent name or "*" for broadcast
+    content: string; // Max 5000 chars
+  };
+
+  // Log to relay audit log (optional)
+  log?: string;
+}
+```
+
+### What Hooks CANNOT Do
+
+| Prohibited | Why |
+|------------|-----|
+| File system access | Use @pattern: handlers for controlled file ops |
+| Shell execution | Security risk - use dedicated patterns |
+| Network requests | Use @pattern: handlers with explicit config |
+| Modify env vars | Read-only context |
+| Access full output | Memory bounded - only recent chunks |
+| Unlimited injection | Max 2000 chars per injection |
+| Multiple messages | One sendMessage per invocation |
+
+### Sandboxing
+
+Hooks run in a restricted context:
+
+```typescript
+// Hooks are pure functions: context in, result out
+type HookHandler = (ctx: HookContext) => Promise<HookResult | void>;
+
+// The relay system enforces limits
+async function executeHook(handler: HookHandler, ctx: HookContext): Promise<HookResult | void> {
+  const result = await 
handler(Object.freeze(ctx)); // Immutable context + + // Enforce limits + if (result?.inject && result.inject.length > 2000) { + result.inject = result.inject.slice(0, 2000) + '... [truncated]'; + } + + if (result?.sendMessage?.content.length > 5000) { + throw new HookError('Message exceeds 5000 char limit'); + } + + return result; +} +``` + +### Capability Escalation + +For advanced use cases, users can enable additional capabilities in config: + +```typescript +// relay.config.ts +export default { + hooks: { + onSessionEnd: myHandler, + }, + + // Explicit capability grants (default: all false) + capabilities: { + allowNetworkInHooks: false, // Permit fetch() in handlers + allowFileReadInHooks: false, // Permit fs.readFile() + unlimitedInjection: false, // Remove 2000 char limit + } +}; +``` + +**Default is locked down.** Users must explicitly opt-in to dangerous capabilities. + +--- + +## Examples + +### Example 1: Memory Integration (Complete) + +Load memories at session start, prompt to save at session end. + +```typescript +// relay.config.ts +import type { RelayConfig } from 'agent-relay'; + +export default { + hooks: { + onSessionStart: async (ctx) => { + // Inject project context from memory + return { + inject: `[CONTEXT] Working in ${ctx.projectName} (${ctx.workingDir})`, + log: `Session started for ${ctx.agentName}` + }; + }, + + onSessionEnd: async (ctx) => { + return { + inject: `[SESSION END] Save learnings with @memory:save ` + }; + }, + + onIdle: async (ctx) => { + // Only prompt if idle for extended period + if (ctx.idleSeconds > 60) { + return { + inject: '[STATUS] Are you blocked? Need help?' + }; + } + } + } +} satisfies RelayConfig; +``` + +--- + +### Example 2: Error Detection and Alerting + +Detect errors in output and notify another agent. 
+ +```typescript +// relay.config.ts +export default { + hooks: { + onOutput: async (output, ctx) => { + // Check for error patterns + if (output.includes('Error:') || output.includes('FAILED')) { + return { + sendMessage: { + to: 'Coordinator', + content: `Alert: ${ctx.agentName} encountered error in ${ctx.projectName}` + }, + log: `Error detected: ${output.slice(0, 100)}` + }; + } + // No injection - just observing + } + } +}; +``` + +--- + +### Example 3: Message Filtering + +Suppress low-priority messages, format high-priority ones. + +```typescript +// relay.config.ts +export default { + hooks: { + onMessageReceived: async (msg, ctx) => { + // Suppress status broadcasts while busy + if (msg.from === '*' && msg.content.startsWith('STATUS:')) { + return { suppress: true, log: `Suppressed broadcast from ${msg.from}` }; + } + + // Highlight urgent messages + if (msg.content.includes('URGENT')) { + return { + inject: `\n>>> URGENT from ${msg.from}: ${msg.content} <<<\n` + }; + } + + // Default formatting (return nothing to use default) + } + } +}; +``` + +--- + +### Example 4: Custom Pattern Handler + +Define `@ticket:` pattern to create tickets. + +```typescript +// relay.config.ts +export default { + patterns: { + ticket: { + handler: async (target, message, ctx) => { + // target = priority (high, medium, low) + // message = ticket description + + // Just log and acknowledge - no external calls in default sandbox + return { + inject: `Ticket logged: [${target}] ${message.slice(0, 50)}...`, + log: `ticket:${target} - ${message}` + }; + } + } + } +}; + +// Agent usage: +// @ticket:high Fix authentication timeout in login flow +// @ticket:low Update README with new CLI options +``` + +--- + +### Example 5: Coordinator Agent Hooks + +Special hooks for a coordinating agent that manages others. 
+ +```typescript +// relay.config.ts +export default { + hooks: { + onSessionStart: async (ctx) => { + if (ctx.agentName === 'Coordinator') { + return { + inject: `[COORDINATOR MODE] +You are managing the following agents. Monitor their status. +Use @relay:AgentName to communicate. +Use @relay:* to broadcast.`, + sendMessage: { + to: '*', + content: 'Coordinator online. Report status.' + } + }; + } + }, + + onMessageReceived: async (msg, ctx) => { + // Log all incoming messages for coordinator + if (ctx.agentName === 'Coordinator') { + return { + log: `[${msg.from}] ${msg.content.slice(0, 200)}` + }; + } + } + } +}; +``` + +--- + +### Example 6: Minimal Config (Just Memory Prompts) + +Simplest useful configuration. + +```typescript +// relay.config.ts +export default { + hooks: { + onSessionEnd: async () => ({ + inject: 'Save anything important: @memory:save ' + }) + } +}; +``` + +--- + +### Example 7: Debug Mode + +Log everything for troubleshooting. + +```typescript +// relay.config.ts +export default { + hooks: { + onSessionStart: async (ctx) => ({ + log: `START: ${ctx.agentName} in ${ctx.workingDir}` + }), + + onOutput: async (output, ctx) => ({ + log: `OUTPUT[${ctx.agentName}]: ${output.slice(0, 100)}` + }), + + onMessageReceived: async (msg, ctx) => ({ + log: `MSG[${ctx.agentName}] from ${msg.from}: ${msg.content.slice(0, 100)}` + }), + + onIdle: async (ctx) => ({ + log: `IDLE: ${ctx.agentName} for ${ctx.idleSeconds}s` + }), + + onSessionEnd: async (ctx) => ({ + log: `END: ${ctx.agentName}` + }) + } +}; +``` + +--- + +## Built-in Pattern Handlers + +### @relay: (Messaging) + +```typescript +// Built-in, always available +relay.pattern('relay', async (target, message, ctx) => { + if (target === '*') { + await ctx.relay.broadcast(message); + } else { + await ctx.relay.send(target, message); + } +}); +``` + +### @memory: (Memory Operations) + +```typescript +// Built-in when memory is configured +relay.pattern('memory', async (action, content, ctx) => { + switch 
(action) { + case 'save': + await ctx.memory.add(content, { agentId: ctx.agentId }); + return { inject: `✓ Saved to memory` }; + + case 'search': + const results = await ctx.memory.search(content); + return { inject: `Memories:\n${format(results)}` }; + + case 'forget': + await ctx.memory.delete(content); + return { inject: `✓ Forgotten` }; + } +}); +``` + +## Example: Full Memory Integration + +```typescript +// relay.config.ts +export default { + patterns: { + memory: 'builtin', + }, + + hooks: { + onSessionStart: async (ctx) => { + // Search for relevant context based on current directory/project + const projectMemories = await ctx.memory.search( + `project: ${ctx.workingDir}` + ); + const userPrefs = await ctx.memory.search('user preferences'); + + if (projectMemories.length || userPrefs.length) { + return { + inject: ` +[CONTEXT FROM MEMORY] +${projectMemories.map(m => `- ${m.content}`).join('\n')} + +[USER PREFERENCES] +${userPrefs.map(m => `- ${m.content}`).join('\n')} +` + }; + } + }, + + onSessionEnd: async (ctx) => { + return { + inject: ` +[SESSION ENDING] +If you learned anything important, save it: + @memory:save + +Examples: + @memory:save User prefers TypeScript over JavaScript + @memory:save This project uses Prisma for database access + @memory:save Auth tokens stored in httpOnly cookies +` + }; + }, + }, + + memory: { + backend: 'mem0', + config: { + vectorStore: { provider: 'qdrant', url: 'http://localhost:6333' }, + embedder: { provider: 'ollama', model: 'nomic-embed-text' }, + }, + }, +}; +``` + +## Escaping Patterns + +To output literal `@namespace:` without triggering handlers: + +``` +\@relay:AgentName # Outputs literally, not routed +\\@relay:AgentName # Outputs \@relay:AgentName +``` + +## Priority & Order + +1. Patterns are matched in order of specificity +2. Built-in patterns run before user patterns (unless overridden) +3. Multiple handlers for same pattern run in registration order +4. 
Return `{ stop: true }` to prevent further handlers + +## Next Steps + +1. Implement pattern registry in agent-relay daemon +2. Add hook lifecycle events to wrapper +3. Implement @memory: built-in handler +4. Create relay.config.ts loader +5. Add documentation and examples + +## Related + +- [MEMORY_STACK_DECISION.md](./MEMORY_STACK_DECISION.md) - Memory backend choice +- [FEDERATION_PROPOSAL.md](./FEDERATION_PROPOSAL.md) - Cross-server messaging diff --git a/docs/MEMORY_STACK_DECISION.md b/docs/MEMORY_STACK_DECISION.md new file mode 100644 index 00000000..6f2f4cdc --- /dev/null +++ b/docs/MEMORY_STACK_DECISION.md @@ -0,0 +1,342 @@ +# Memory Stack Decision: Mem0 as Foundation + +**Date:** 2025-12-21 +**Status:** Proposed + +## Decision + +Use [Mem0](https://github.com/mem0ai/mem0) as the memory substrate for agent-trajectories rather than building from scratch. + +## Context + +We evaluated cross-platform memory solutions before building our own: + +| Solution | Stars | Focus | Multi-Agent | MCP Support | +|----------|-------|-------|-------------|-------------| +| [Mem0](https://github.com/mem0ai/mem0) | 25k+ | Universal memory API | ✅ | ✅ | +| [Zep](https://github.com/getzep/zep) | 3k+ | Temporal knowledge graph | ✅ | ❓ | +| [Letta](https://github.com/letta-ai/letta) | 20k+ | Stateful agents | ✅ | ❓ | +| [Cognee](https://github.com/topoteretes/cognee) | 4k+ | Document → graph | ⚠️ | ✅ | +| [claude-mem](https://github.com/thedotmack/claude-mem) | Popular | Claude Code memory | ❌ Single agent | ❌ Claude only | + +## Why Mem0 + +1. **Most popular** - 25k+ stars, active development, YC-backed +2. **Multi-LLM support** - Not locked to OpenAI (works with Anthropic, etc.) +3. **MCP integration exists** - Works with Claude Code today via [Composio MCP](https://mcp.composio.dev/mem0) +4. **Self-hosted option** - Apache 2.0 license +5. **Python + TypeScript SDKs** - Matches our stack +6. 
**Performance claims** - +26% accuracy vs OpenAI Memory, 91% faster, 90% fewer tokens + +## Why Not Others + +| Solution | Why Not Primary | +|----------|-----------------| +| **Zep** | More complex (Graphiti), cloud-first pivot | +| **Letta** | Full agent framework, not just memory | +| **Cognee** | Document-focused, less mature | +| **claude-mem** | Claude Code only, not multi-agent | + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AGENT MEMORY STACK │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ agent-trajectories (our layer) │ │ +│ │ │ │ +│ │ BUILDS ON MEM0: ADDS: │ │ +│ │ • Uses Mem0 for observation storage • Task-based grouping │ │ +│ │ • Uses Mem0 for semantic search • Inter-agent events │ │ +│ │ • Uses Mem0's multi-user isolation • Fleet knowledge workspace │ │ +│ │ • .trajectory export │ │ +│ │ • Decisions & patterns │ │ +│ └──────────────────────────────────┬──────────────────────────────────┘ │ +│ │ uses │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Mem0 (memory substrate) │ │ +│ │ │ │ +│ │ • Observation storage + retrieval │ │ +│ │ • Semantic search (vector + hybrid) │ │ +│ │ • Multi-user/agent isolation │ │ +│ │ • MCP integration for Claude Code │ │ +│ │ • Self-hosted or cloud │ │ +│ └──────────────────────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌──────────────────────────┼──────────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Claude agent │ │ Codex agent │ │ Gemini agent │ │ +│ │ (MCP→Mem0) │ │ (SDK→Mem0) │ │ (SDK→Mem0) │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +│ ◄──────────────────── agent-relay provides messaging ──────────────────► │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## What We Build vs Use + +| Component | Build or Use | Owner | 
+|-----------|--------------|-------| +| Observation storage | **USE Mem0** | Mem0 | +| Semantic search | **USE Mem0** | Mem0 | +| Vector database | **USE Mem0** | Mem0 | +| Task-based grouping | **BUILD** | agent-trajectories | +| Trajectory format (.trajectory) | **BUILD** | agent-trajectories | +| Knowledge workspace | **BUILD** | agent-trajectories | +| Inter-agent event capture | **BUILD** | agent-trajectories | +| Fleet-wide patterns/decisions | **BUILD** | agent-trajectories | +| Message routing | **USE** | agent-relay | + +## Constraint: Claude Code Auth Only (No Direct SDK) + +**Problem:** We use Claude Code via auth, not direct Anthropic SDK access. Mem0's TypeScript SDK requires LLM API access for memory extraction/compression. + +**Solution:** Use MCP approach where **Claude Code IS the intelligence layer**. + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MCP-BASED MEMORY (Recommended) │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Claude Code Agent │ │ +│ │ │ │ +│ │ 1. Agent decides what to remember (intelligence here) │ │ +│ │ 2. Agent calls MCP tool: add_memory("user prefers dark mode") │ │ +│ │ 3. Later: Agent calls search_memories("user preferences") │ │ +│ │ 4. 
Agent uses retrieved memories in context │ │ +│ └────────────────────────────────────┬────────────────────────────────┘ │ +│ │ MCP calls │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Mem0 MCP Server │ │ +│ │ │ │ +│ │ • add_memory(content, metadata) → Store to vector DB │ │ +│ │ • search_memories(query) → Vector search (no LLM needed) │ │ +│ │ • delete_memory(id) → Remove from storage │ │ +│ │ │ │ +│ │ NO LLM CALLS - Pure storage + retrieval │ │ +│ └────────────────────────────────────┬────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌──────────────────┐ │ +│ │ Qdrant / Redis │ │ +│ │ (Vector Store) │ │ +│ └──────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### Why This Works + +| Approach | LLM Needed? | Works with Claude Code Auth? | +|----------|-------------|------------------------------| +| Mem0 TypeScript SDK (full) | ✅ Yes, for extraction | ❌ No - requires API key | +| Mem0 MCP Server | ❌ No, agent is the LLM | ✅ Yes | +| Mem0 with `infer: false` | ❌ No, raw storage | ✅ Yes | + +**Key Insight:** With MCP, the agent (Claude Code) does the "thinking" about what to remember. Mem0 becomes a dumb storage layer with smart retrieval (vector search). + +### Integration Modes + +| Mode | Use Case | LLM Required | +|------|----------|--------------| +| **MCP (Primary)** | Claude Code agents | No - agent is the LLM | +| **SDK with infer:false** | Programmatic storage | No - raw storage | +| **SDK with LLM** | Non-Claude agents with API keys | Yes - for extraction | + +### MCP Server Options + +1. **Mem0 Official MCP** - `@mem0/mcp-server` +2. **Composio MCP** - Third-party wrapper +3. **Custom MCP** - Build thin layer over Qdrant + +### Embeddings Without API Keys + +Vector search needs embeddings. 
Options without paid API: + +| Option | Pros | Cons | +|--------|------|------| +| **Ollama** (local) | Free, private, fast | Requires local GPU/CPU | +| **HuggingFace TEI** | Free, self-hosted | Setup complexity | +| **Qdrant FastEmbed** | Built-in, no setup | Limited models | +| **OpenAI API** | Best quality | Requires API key + cost | + +Recommended: **Ollama + nomic-embed-text** for local development, option to swap to OpenAI embeddings in production if quality matters. + +## Integration Points + +### 1. Mem0 as Storage Backend (No LLM Required) + +```typescript +// agent-trajectories uses Mem0 for observation storage +// NOTE: Using infer:false - no LLM API needed +import { Memory } from 'mem0ai/oss'; + +const memory = new Memory({ + // Vector store only - no LLM config needed + vectorStore: { + provider: 'qdrant', + config: { + url: process.env.QDRANT_URL || 'http://localhost:6333', + collectionName: 'trajectories', + }, + }, + // Optional: embeddings can use local model or API + embedder: { + provider: 'ollama', // Local embeddings, no API key + config: { model: 'nomic-embed-text' }, + }, +}); + +// Store trajectory events - raw storage, no extraction +async function storeTrajectoryEvent(event: TrajectoryEvent) { + await memory.add( + [{ role: 'assistant', content: event.content }], + { + user_id: event.agentId, + metadata: { + trajectory_id: event.trajectoryId, + task_id: event.taskId, + event_type: event.type, + ts: event.ts, + }, + infer: false, // Skip LLM extraction - store as-is + } + ); +} + +// Retrieve relevant context - vector search only +async function getAgentContext(agentId: string, query: string) { + return memory.search(query, { + user_id: agentId, + limit: 10, + }); +} +``` + +### 1b. 
Alternative: Direct Qdrant (Simpler) + +If Mem0 adds complexity, use Qdrant directly: + +```typescript +import { QdrantClient } from '@qdrant/js-client-rest'; + +const qdrant = new QdrantClient({ url: 'http://localhost:6333' }); + +// Store with pre-computed embeddings (from local model) +async function storeEvent(event: TrajectoryEvent, embedding: number[]) { + await qdrant.upsert('trajectories', { + points: [{ + id: event.id, + vector: embedding, + payload: { + agentId: event.agentId, + content: event.content, + trajectoryId: event.trajectoryId, + ts: event.ts, + }, + }], + }); +} + +// Search by vector similarity +async function search(queryEmbedding: number[], agentId: string) { + return qdrant.search('trajectories', { + vector: queryEmbedding, + filter: { must: [{ key: 'agentId', match: { value: agentId } }] }, + limit: 10, + }); +} +``` + +### 2. MCP for Claude Code Agents + +```json +// Claude Code MCP config (~/.claude/mcp.json) +{ + "mcpServers": { + "mem0": { + "command": "npx", + "args": ["-y", "@mem0/mcp-server"], + "env": { + "MEM0_API_KEY": "${MEM0_API_KEY}" + } + } + } +} +``` + +### 3. 
agent-relay Event Emission + +```typescript +// agent-relay emits events +relay.on('message', (msg) => { + // Forward to agent-trajectories + trajectories.captureEvent({ + type: 'inter_agent_message', + from: msg.from, + to: msg.to, + content: msg.content, + ts: msg.ts, + }); +}); +``` + +## Alternatives Considered + +### Option A: Build Everything (Rejected) +- SQLite + FTS5 + Chroma from scratch +- **Rejected:** 3-4 weeks of work Mem0 already does + +### Option B: Fork claude-mem (Rejected) +- Extend claude-mem for multi-agent +- **Rejected:** Too Claude-specific, massive refactor needed + +### Option C: Use Zep (Considered) +- Temporal knowledge graph is powerful +- **Deferred:** More complex, can add later if needed + +### Option D: Use Mem0 + Build On Top (Selected) +- Best of both worlds +- Use mature memory infra, add our task/trajectory layer + +## Migration Path + +If Mem0 doesn't meet needs, the abstraction allows swapping: + +```typescript +interface MemoryBackend { + add(memory: Memory): Promise<void>; + search(query: string, options: SearchOptions): Promise<Memory[]>; + delete(id: string): Promise<void>; +} + +// Default: Mem0 +class Mem0Backend implements MemoryBackend { ... } + +// Alternative: Zep (if we need temporal graphs) +class ZepBackend implements MemoryBackend { ... } + +// Fallback: Custom SQLite + Chroma +class LocalBackend implements MemoryBackend { ... } +``` + +## Next Steps + +1. **Set up Qdrant** - Local vector store (docker or binary) +2. **Set up Ollama** - Local embeddings (nomic-embed-text) +3. **Configure Mem0 MCP** - For Claude Code agents +4. **Implement MemoryBackend** - With `infer: false` for programmatic use +5. **Build trajectory layer** - Task grouping, patterns on top +6. 
**Integrate with agent-relay** - Event emission to trajectories + +## References + +- [Mem0 GitHub](https://github.com/mem0ai/mem0) +- [Mem0 Documentation](https://docs.mem0.ai/) +- [Mem0 MCP Integration](https://mcp.composio.dev/mem0) +- [Collaborative Memory Paper](https://arxiv.org/html/2505.18279v1) +- [MemEngine Paper](https://arxiv.org/html/2505.02099v1) diff --git a/docs/competitive-analysis-ai-maestro.md b/docs/competitive-analysis-ai-maestro.md new file mode 100644 index 00000000..a12404cb --- /dev/null +++ b/docs/competitive-analysis-ai-maestro.md @@ -0,0 +1,249 @@ +# Competitive Analysis: ai-maestro vs agent-relay + +**Date:** 2025-12-21 +**Source:** https://github.com/23blocks-OS/ai-maestro (v0.17.7) + +## Executive Summary + +ai-maestro is a mature orchestration dashboard for multi-agent coordination. It takes a **human-as-orchestrator** approach with rich visualization, while agent-relay focuses on **automatic agent-to-agent communication**. Both use tmux for agent sessions, but differ fundamentally in message delivery and autonomy. 
+ +--- + +## Architecture Comparison + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AI-MAESTRO │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Web Dashboard (localhost:23000) │ │ +│ │ Next.js + TypeScript + Tailwind │ │ +│ └────────────────────────────────┬────────────────────────────────────┘ │ +│ │ WebSocket │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Manager Node │ │ +│ │ (Coordinates workers, proxies connections) │ │ +│ └────────────────────────────────┬────────────────────────────────────┘ │ +│ │ Tailscale VPN / Local │ +│ ┌────────────────────────┼────────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ Worker Node │ │ Worker Node │ │ Worker Node │ │ +│ │ (tmux agents)│ │ (tmux agents)│ │ (tmux agents)│ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +│ Communication: FILE-BASED (read from disk, human relays) │ +│ Human Role: ORCHESTRATOR (assigns tasks, relays messages) │ +└─────────────────────────────────────────────────────────────────────────────┘ + +┌─────────────────────────────────────────────────────────────────────────────┐ +│ AGENT-RELAY │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Dashboard (localhost:3888) │ │ +│ │ + CLI Interface │ │ +│ └────────────────────────────────┬────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────────────────────────────────────────────────┐ │ +│ │ Daemon (per server) │ │ +│ │ Unix socket (local) + WebSocket (federation - planned) │ │ +│ └────────────────────────────────┬────────────────────────────────────┘ │ +│ │ tmux send-keys │ +│ ┌────────────────────────┼────────────────────────┐ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ tmux: Alice │◄──────►│ tmux: Bob │◄──────►│ tmux: Carol │ │ +│ │ (auto-inject)│ │ (auto-inject)│ │ 
(auto-inject)│ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +│ Communication: REAL-TIME (daemon routes, auto-injects via tmux) │ +│ Human Role: OBSERVER (agents communicate autonomously) │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Feature Comparison + +| Feature | ai-maestro | agent-relay | Notes | +|---------|------------|-------------|-------| +| **Message Delivery** | File-based, human relays | Auto-inject via tmux | agent-relay is truly autonomous | +| **Web Dashboard** | ✅ Rich (Next.js) | ⚠️ Basic (Ink) | ai-maestro more mature | +| **Mobile App** | ❌ None | 📋 Planned | Opportunity for agent-relay | +| **Distributed** | ✅ Tailscale VPN | 📋 Federation planned | Both support multi-server | +| **Agent Sessions** | tmux (read-only) | tmux (read + inject) | agent-relay has write access | +| **Code Intelligence** | ✅ Code Graphs | 📋 Planned | Learn from ai-maestro | +| **Conversation Memory** | ✅ CozoDB + search | 📋 Trajectories planned | Similar goals | +| **Agent Portability** | ✅ .zip export/import | 📋 Planned | Useful feature to adopt | +| **Health Monitoring** | ✅ Green/red/yellow | ⚠️ Basic | Adopt their status model | +| **Task Distribution** | Manual (human assigns) | Manual → Lead Agent | agent-relay planning automation | +| **Hierarchical Naming** | ✅ project-backend-api | ❌ Flat names | Consider adopting | + +--- + +## Key Differentiators + +### agent-relay's Advantages + +1. **Automatic Message Injection** + ``` + ai-maestro: Agent writes to file → Human reads → Human types to other agent + agent-relay: Agent outputs @relay:Bob → Daemon injects → Bob sees immediately + ``` + This is the core differentiator. Agents can truly collaborate without human relay. + +2. **Real-Time Communication** + - ai-maestro: Polling-based file reads + - agent-relay: Push-based via daemon + WebSocket + +3. 
**Simpler Local Setup** + - ai-maestro: Manager + workers + VPN for distributed + - agent-relay: Single daemon, federation opt-in + +### ai-maestro's Advantages + +1. **Mature Web Dashboard** + - Rich visualizations + - Hierarchical agent tree with color coding + - Code graph browser + - Conversation search + +2. **Code Intelligence** + - Multi-language code graphs (Ruby, TypeScript, Python) + - Delta indexing (~100ms vs 1000ms+ full re-index) + - Relationship visualization + +3. **Agent Portability** + - Export agent as .zip + - Full config + message history + git associations + - Import with conflict detection + +4. **Health Monitoring** + - Visual status indicators (green/red/yellow) + - Per-agent resource tracking + - Stuck detection + +--- + +## What We Should Learn from ai-maestro + +### 1. Hierarchical Agent Naming +``` +Current: Alice, Bob, Carol (flat) +ai-maestro: project-backend-api, project-frontend-ui (hierarchical) +Proposed: federation/network/peer-connection (namespace/role/task) +``` + +### 2. Code Graph Visualization +- Shared codebase understanding across agents +- Delta indexing for performance +- Multi-language support +- Add as `ctrl-010` task + +### 3. Agent Health Status +```typescript +type AgentStatus = 'healthy' | 'degraded' | 'offline'; + +interface AgentHealth { + status: AgentStatus; + lastHeartbeat: number; + cpuUsage?: number; + memoryUsage?: number; + errorRate?: number; + isStuck?: boolean; // No output for N minutes +} +``` + +### 4. Agent Portability +``` +agent-export.zip +├── config.json # Agent configuration +├── messages.jsonl # Message history +├── trajectory.json # Work history (ours) +├── git-associations/ # Linked repos +└── skills.json # Capability registry +``` + +### 5. Conversation Memory +- ai-maestro uses CozoDB for semantic search +- We can leverage trajectories + embeddings +- Cross-agent knowledge sharing + +--- + +## What ai-maestro Could Learn from Us + +### 1. 
Automatic Message Injection +Their file-based messaging requires human relay. Our tmux send-keys approach enables true agent autonomy. + +### 2. End-to-End Delivery Confirmation +We're planning delivery confirmation via capture-pane. They have no delivery guarantees. + +### 3. Peer-to-Peer Federation +Our federation proposal is peer-to-peer mesh. They require central manager + VPN. + +### 4. Trajectories +Our trajectory format captures complete work history with reasoning, decisions, and retrospectives—richer than their conversation memory. + +--- + +## Integration Opportunities + +### Could We Integrate? + +1. **Use ai-maestro's Dashboard** + - Their web UI is more mature + - We provide the messaging backbone + - They display, we deliver + +2. **Share Code Intelligence** + - Their Code Graphs + our Trajectories + - Unified knowledge layer + +3. **Complementary Roles** + ``` + ai-maestro: Human observation & control (read-mostly) + agent-relay: Agent-to-agent messaging (write-inject) + ``` + +### Technical Compatibility + +- Both use tmux for agent sessions ✅ +- Both use WebSocket for real-time ✅ +- Different message formats (file vs protocol) ⚠️ +- Different authentication models ⚠️ + +--- + +## Roadmap Impact + +Based on this analysis, prioritize these additions: + +| Priority | Task | Source | +|----------|------|--------| +| High | Agent health monitoring | ai-maestro | +| High | Code Graph integration | ai-maestro | +| Medium | Hierarchical naming | ai-maestro | +| Medium | Agent portability | ai-maestro | +| Medium | Delta indexing | ai-maestro | +| Low | CozoDB integration | ai-maestro | + +These are captured in `ctrl-010`, `ctrl-011`, `ctrl-012` tasks. + +--- + +## Conclusion + +**ai-maestro** excels at human observation and visualization but lacks autonomous agent communication. + +**agent-relay** excels at automatic agent-to-agent messaging but needs richer dashboards and intelligence features. 
 + +The ideal system combines: +- agent-relay's automatic message injection +- ai-maestro's visualization and code intelligence +- Our proposed federation for distributed deployment +- Our proposed control plane for human oversight + +We're building the messaging backbone; ai-maestro shows what the UI layer should look like. diff --git a/src/cli/index.test.ts b/src/cli/index.test.ts index e81bc169..4be6881b 100644 --- a/src/cli/index.test.ts +++ b/src/cli/index.test.ts @@ -114,6 +114,42 @@ describe('CLI', () => { }); }); + describe('attach', () => { + it('should show help for attach command', async () => { + const { stdout } = await runCli('attach --help'); + expect(stdout).toContain('Attach to a running agent session'); + expect(stdout).toContain('<name>'); + }); + + it('should error when session does not exist', async () => { + const { stderr, code } = await runCli('attach nonexistent-agent'); + expect(code).not.toBe(0); + expect(stderr).toContain('No session found'); + }); + }); + + describe('kill', () => { + it('should show help for kill command', async () => { + const { stdout } = await runCli('kill --help'); + expect(stdout).toContain('Stop a detached agent'); + expect(stdout).toContain('<name>'); + expect(stdout).toContain('--force'); + }); + + it('should handle killing nonexistent agent gracefully', async () => { + const { stdout } = await runCli('kill nonexistent-agent-12345'); + expect(stdout).toContain('No running agent found'); + }); + }); + + describe('detach flag', () => { + it('should show -d/--detach in help', async () => { + const { stdout } = await runCli('--help'); + expect(stdout).toContain('-d, --detach'); + expect(stdout).toContain('background'); + }); + }); + describe('history', () => { it('should show history or empty message', async () => { const { stdout, code } = await runCli('history --limit 5'); diff --git a/src/cli/index.ts b/src/cli/index.ts index b7101176..cd72a6ae 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -49,6 +49,7 @@ program 
program .option('-n, --name <name>', 'Agent name (auto-generated if not set)') .option('-q, --quiet', 'Disable debug output', false) + .option('-d, --detach', 'Start agent in background (detached mode)') .option('--prefix <prefix>', 'Relay prefix pattern (default: ->relay:)') .argument('[command...]', 'Command to wrap (e.g., claude)') .action(async (commandParts, options) => { @@ -58,13 +59,47 @@ program return; } - const { getProjectPaths } = await import('../utils/project-namespace.js'); + const { ensureProjectDir } = await import('../utils/project-namespace.js'); const { findAgentConfig, isClaudeCli, buildClaudeArgs } = await import('../utils/agent-config.js'); - const paths = getProjectPaths(); + const paths = ensureProjectDir(); const [mainCommand, ...commandArgs] = commandParts; const agentName = options.name ?? generateAgentName(); + // Handle detached mode - spawn daemon and exit + const isDaemonProcess = process.argv.includes('--_daemon'); + if (options.detach && !isDaemonProcess) { + const { spawn } = await import('node:child_process'); + + // Build args for the daemon process + const daemonArgs = [ + ...process.argv.slice(2).filter(a => a !== '-d' && a !== '--detach'), + '--_daemon', + '-n', agentName, // Ensure same name is used + ]; + + // Spawn detached process + const child = spawn(process.argv[0], [process.argv[1], ...daemonArgs], { + detached: true, + stdio: 'ignore', + cwd: process.cwd(), + env: process.env, + }); + + // Write PID file for the wrapper + const wrapperPidPath = path.join(paths.dataDir, `wrapper-${agentName}.pid`); + fs.writeFileSync(wrapperPidPath, String(child.pid)); + + child.unref(); + + console.log(`Agent: ${agentName}`); + console.log(`Project: ${paths.projectId}`); + console.log(`Started in detached mode (PID: ${child.pid})`); + console.log(`Attach with: agent-relay attach ${agentName}`); + console.log(`Stop with: agent-relay kill ${agentName}`); + return; + } + console.error(`Agent: ${agentName}`); console.error(`Project: 
${paths.projectId}`); @@ -92,6 +127,7 @@ program } relayPrefix: options.prefix, useInbox: true, inboxDir: paths.dataDir, // Use the project-specific data directory for the inbox + detached: isDaemonProcess, // Don't attach if running as daemon }); process.on('SIGINT', () => { @@ -99,6 +135,11 @@ program process.exit(0); }); + process.on('SIGTERM', () => { + wrapper.stop(); + process.exit(0); + }); + await wrapper.start(); }); @@ -305,6 +346,81 @@ program } }); +// attach - Attach to a running agent's tmux session +program + .command('attach') + .description('Attach to a running agent session') + .argument('<name>', 'Agent name to attach to') + .action(async (name) => { + const sessionName = `relay-${name}`; + + // Check if session exists + try { + await execAsync(`tmux has-session -t ${sessionName} 2>/dev/null`); + } catch { + console.error(`No session found for agent: ${name}`); + console.error(`Run 'agent-relay status' to see available sessions.`); + process.exit(1); + } + + // Attach to the session + const { spawn } = await import('node:child_process'); + const attach = spawn('tmux', ['attach-session', '-t', sessionName], { + stdio: 'inherit', + }); + + attach.on('exit', (code) => { + process.exit(code ??
0); + }); + + attach.on('error', (err) => { + console.error(`Failed to attach: ${err.message}`); + process.exit(1); + }); + }); + +// kill - Stop a detached agent +program + .command('kill') + .description('Stop a detached agent and its tmux session') + .argument('<name>', 'Agent name to kill') + .option('--force', 'Force kill without confirmation') + .action(async (name, options) => { + const { getProjectPaths } = await import('../utils/project-namespace.js'); + const paths = getProjectPaths(); + + const sessionName = `relay-${name}`; + const wrapperPidPath = path.join(paths.dataDir, `wrapper-${name}.pid`); + + let killed = false; + + // Kill the wrapper process if PID file exists + if (fs.existsSync(wrapperPidPath)) { + const pid = Number(fs.readFileSync(wrapperPidPath, 'utf-8').trim()); + try { + process.kill(pid, 'SIGTERM'); + console.log(`Stopped wrapper process (PID: ${pid})`); + killed = true; + } catch { + // Process may already be dead + } + fs.unlinkSync(wrapperPidPath); + } + + // Kill the tmux session + try { + await execAsync(`tmux kill-session -t ${sessionName}`); + console.log(`Killed tmux session: ${sessionName}`); + killed = true; + } catch { + // Session may not exist + } + + if (!killed) { + console.log(`No running agent found: ${name}`); + } + }); + // agents - List connected agents (from registry file) program .command('agents') diff --git a/src/wrapper/tmux-wrapper.ts b/src/wrapper/tmux-wrapper.ts index 4a2f53e4..d269b70c 100644 --- a/src/wrapper/tmux-wrapper.ts +++ b/src/wrapper/tmux-wrapper.ts @@ -76,6 +76,8 @@ export interface TmuxWrapperConfig { mouseMode?: boolean; /** Relay prefix pattern (default: '->relay:') */ relayPrefix?: string; + /** Run in detached mode (don't attach to tmux, run as background daemon) */ + detached?: boolean; /** Callback for spawn commands (@relay:spawn WorkerName cli "task") */ onSpawn?: (name: string, cli: string, task: string) => Promise<void>; /** Callback for release commands (@relay:release WorkerName) */ @@ -385,6
+387,15 @@ export class TmuxWrapper { // Start background polling (silent - no stdout writes) this.startSilentPolling(); + // In detached mode, don't attach - just keep polling in background + if (this.config.detached) { + this.logStderr('Running in detached mode (daemon)', true); + this.logStderr(`Attach with: tmux attach -t ${this.sessionName}`, true); + // Keep the process alive + await new Promise(() => {}); // Never resolves - keeps daemon running + return; + } + // Attach user to tmux session // This takes over stdin/stdout - user sees the real terminal this.attachToSession();