Commit 021e481

docs: document Memex archive, ACON per-category guidelines, RL admission (#2433, #2432, #2416)
1 parent: 28f64b1 · commit: 021e481

5 files changed (+154, -10 lines)


README.md

Lines changed: 2 additions & 2 deletions
@@ -72,8 +72,8 @@ zeph
|---|---|
| **Hybrid inference** | Ollama, Claude, OpenAI, Google Gemini, any OpenAI-compatible API, or fully local via Candle (GGUF). Providers are declared as `[[llm.providers]]` entries in config. Gemini supports SSE streaming, thinking-part surfacing (Gemini 2.5), and streaming `functionCall` parts. Multi-model orchestrator with fallback chains, EMA latency routing, and adaptive Thompson Sampling for exploration/exploitation-balanced model selection. Cascade routing supports `cost_tiers` for explicit cheapest-first provider ordering and `ClassifierMode::Judge` for LLM-scored query routing. **Complexity triage routing** (`LlmRoutingStrategy::Triage`) classifies each request into Simple/Medium/Complex/Expert tiers before inference and dispatches to the tier-matched provider pool, avoiding over-provisioning cheap queries to expensive models. **PILOT LinUCB bandit routing** (`LlmRoutingStrategy::Bandit`) applies a contextual LinUCB bandit to provider selection — features include query complexity, provider latency history, and time-of-day signals; configured via `[llm.router.bandit]`. Claude extended context (`--extended-context` flag or `enable_extended_context = true`) enables the 1M token window with a TUI `[1M CTX]` header badge; cost warning emitted automatically. Built-in pricing includes gpt-5 and gpt-5-mini. [→ Providers](https://bug-ops.github.io/zeph/concepts/providers.html) |
| **Skills-first architecture** | YAML+Markdown skill files with BM25+cosine hybrid retrieval. Bayesian re-ranking, 4-tier trust model, and self-learning evolution — skills improve from real usage. Agent-as-a-Judge feedback detection with adaptive regex/LLM hybrid analysis across 7 languages (English, Russian, Spanish, German, French, Portuguese, Chinese). The `load_skill` tool lets the LLM fetch the full body of any skill outside the active TOP-N set on demand. [→ Skills](https://bug-ops.github.io/zeph/concepts/skills.html) · [→ Self-learning](https://bug-ops.github.io/zeph/advanced/self-learning.html) |
-| **Context engineering** | Semantic skill selection, command-aware output filters, tool-pair summarization with deferred application (pre-computed eagerly, applied lazily to stabilize the Claude API prompt cache prefix), proactive context compression (reactive + proactive strategies), and reactive middle-out compaction keep the window efficient under any load. Three-tier compaction pipeline: deferred summary application at 70% context usage → pruning at 80% → LLM compaction on overflow. **HiAgent subgoal-aware compaction** tracks active and completed subgoals — active subgoal messages are protected from eviction while completed subgoals are candidates for summarization with MIG redundancy scoring. Large tool outputs are stored in SQLite (not on disk) and injected on demand via the native `read_overflow` tool, eliminating absolute-path leakage and enabling automatic cleanup on conversation delete. **Failure-driven compression guidelines** (ACON): after each hard compaction, the agent monitors responses for context-loss signals; confirmed failure pairs train an LLM-generated `<compression-guidelines>` block that is injected into every future compaction prompt. `--debug-dump [PATH]` writes every LLM request, response, and raw tool output to numbered files for context debugging; `--dump-format <json\|raw\|trace>` (or `/dump-format` at runtime) switches the output format — `trace` emits OpenTelemetry-compatible OTLP JSON with a session → iteration → LLM-call/tool-call/memory-search span hierarchy. [→ Context](https://bug-ops.github.io/zeph/advanced/context.html) · [→ Debug Dump](https://bug-ops.github.io/zeph/advanced/debug-dump.html) |
-| **Semantic memory** | SQLite (default) or PostgreSQL + Qdrant with MMR re-ranking, temporal decay, write-time importance scoring, query-aware memory routing (keyword/semantic/hybrid/episodic), cross-session recall, implicit correction detection, and credential scrubbing. **Structured anchored summarization** preserves factual anchors during compaction; **compaction probe validation** verifies quality via probe questions before committing. **Semantic response caching** deduplicates recall queries. Optional **graph memory** adds entity-relationship tracking with typed edges (8 relationship types), FTS5-accelerated entity search, BFS traversal for multi-hop reasoning, bi-temporal edge versioning (`valid_from`/`valid_to`) with point-in-time historical queries (`/graph history <name>`), configurable `temporal_decay_rate` for recency-weighted scoring, and embedding-based entity resolution for semantic deduplication. **SYNAPSE spreading activation** propagates energy through the entity graph with hop-by-hop decay, lateral inhibition, and edge-type filtering (`[memory.graph.spreading_activation]`). **A-MEM dynamic note linking** creates fire-and-forget similarity edges between notes on each graph write (`[memory.graph.note_linking]`). Background LLM extraction runs fire-and-forget on each turn; graph facts are injected into the context window alongside semantic recall. [→ Memory](https://bug-ops.github.io/zeph/concepts/memory.html) · [→ Graph Memory](https://bug-ops.github.io/zeph/concepts/graph-memory.html) |
+| **Context engineering** | Semantic skill selection, command-aware output filters, tool-pair summarization with deferred application (pre-computed eagerly, applied lazily to stabilize the Claude API prompt cache prefix), proactive context compression (reactive + proactive strategies), and reactive middle-out compaction keep the window efficient under any load. Three-tier compaction pipeline: deferred summary application at 70% context usage → pruning at 80% → LLM compaction on overflow. **HiAgent subgoal-aware compaction** tracks active and completed subgoals — active subgoal messages are protected from eviction while completed subgoals are candidates for summarization with MIG redundancy scoring. Large tool outputs are stored in SQLite (not on disk) and injected on demand via the native `read_overflow` tool, eliminating absolute-path leakage and enabling automatic cleanup on conversation delete. **Failure-driven compression guidelines** (ACON): after each hard compaction, the agent monitors responses for context-loss signals; confirmed failure pairs train an LLM-generated `<compression-guidelines>` block that is injected into every future compaction prompt. **ACON per-category guidelines** (`categorized_guidelines = true` in `[memory.compression_guidelines]`) tags each failure pair by category (tool_output / assistant_reasoning / user_context) and maintains separate per-category guideline blocks for finer-grained compression control. **Memex tool-output archive** (`archive_tool_outputs = true` in `[memory.compression]`) saves tool output bodies to SQLite before compaction and injects UUID back-references into summaries, preserving retrievability after the live context is discarded. `--debug-dump [PATH]` writes every LLM request, response, and raw tool output to numbered files for context debugging; `--dump-format <json\|raw\|trace>` (or `/dump-format` at runtime) switches the output format — `trace` emits OpenTelemetry-compatible OTLP JSON with a session → iteration → LLM-call/tool-call/memory-search span hierarchy. [→ Context](https://bug-ops.github.io/zeph/advanced/context.html) · [→ Debug Dump](https://bug-ops.github.io/zeph/advanced/debug-dump.html) |
+| **Semantic memory** | SQLite (default) or PostgreSQL + Qdrant with MMR re-ranking, temporal decay, write-time importance scoring, query-aware memory routing (keyword/semantic/hybrid/episodic), cross-session recall, implicit correction detection, and credential scrubbing. **Structured anchored summarization** preserves factual anchors during compaction; **compaction probe validation** verifies quality via probe questions before committing. **Semantic response caching** deduplicates recall queries. Optional **graph memory** adds entity-relationship tracking with typed edges (8 relationship types), FTS5-accelerated entity search, BFS traversal for multi-hop reasoning, bi-temporal edge versioning (`valid_from`/`valid_to`) with point-in-time historical queries (`/graph history <name>`), configurable `temporal_decay_rate` for recency-weighted scoring, and embedding-based entity resolution for semantic deduplication. **SYNAPSE spreading activation** propagates energy through the entity graph with hop-by-hop decay, lateral inhibition, and edge-type filtering (`[memory.graph.spreading_activation]`). **A-MEM dynamic note linking** creates fire-and-forget similarity edges between notes on each graph write (`[memory.graph.note_linking]`). **RL-based admission control** (`admission_strategy = "rl"`) replaces the static heuristic write-gate with a logistic regression model trained on the `was_recalled` signal; falls back to heuristic until `rl_min_samples` is reached. Background LLM extraction runs fire-and-forget on each turn; graph facts are injected into the context window alongside semantic recall. [→ Memory](https://bug-ops.github.io/zeph/concepts/memory.html) · [→ Graph Memory](https://bug-ops.github.io/zeph/concepts/graph-memory.html) |
| **IDE integration (ACP)** | Stdio, HTTP+SSE, or WebSocket transport. Multi-session isolation with per-session conversation history and SQLite persistence. Session modes, live tool streaming, LSP diagnostics injection, file following, usage reporting. Works in Zed, Helix, VS Code. [→ ACP](https://bug-ops.github.io/zeph/advanced/acp.html) |
| **Multi-channel I/O** | CLI, Telegram, TUI dashboard — all with streaming. Voice and vision input supported. [→ Channels](https://bug-ops.github.io/zeph/advanced/channels.html) |
| **MCP & A2A** | MCP client with full tool exposure to the model. All MCP tool definitions are sanitized at registration time and again on every `tools/list_changed` refresh — 17 injection-detection patterns, Unicode Cf-category strip, and a 1024-byte description cap prevent prompt injection via malicious server metadata. Configure [mcpls](https://github.com/bug-ops/mcpls) as an MCP server for compiler-level code intelligence: hover, definition, references, diagnostics, call hierarchy, and safe rename via rust-analyzer, pyright, gopls, and 30+ other LSP servers. A2A agent-to-agent protocol for multi-agent orchestration. [→ MCP](https://bug-ops.github.io/zeph/guides/mcp.html) · [→ LSP](https://bug-ops.github.io/zeph/guides/lsp.html) · [→ A2A](https://bug-ops.github.io/zeph/advanced/a2a.html) |
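The MCP metadata sanitization described in the row above can be sketched in a few lines. This is an illustrative model only: the two regex patterns shown are stand-ins (zeph ships 17 detection patterns), and the helper name is invented:

```python
import re
import unicodedata

# Sketch of MCP tool-description sanitization: strip Unicode format (Cf)
# characters, flag injection-like phrases, and cap the description at a
# byte budget. Patterns and function name are illustrative assumptions.
INJECTION_PATTERNS = [
    re.compile(r"ignore (all )?previous instructions", re.IGNORECASE),
    re.compile(r"<\s*system\s*>", re.IGNORECASE),
]

def sanitize_description(desc: str, max_bytes: int = 1024):
    # Remove invisible format characters (zero-width joiners, BiDi controls, ...)
    cleaned = "".join(ch for ch in desc if unicodedata.category(ch) != "Cf")
    flagged = any(p.search(cleaned) for p in INJECTION_PATTERNS)
    # Truncate on a byte budget, dropping any split UTF-8 sequence at the cut
    encoded = cleaned.encode("utf-8")[:max_bytes]
    return encoded.decode("utf-8", errors="ignore"), flagged
```

Running the sanitizer at registration time and again on every `tools/list_changed` refresh (as the row describes) means a server cannot smuggle new malicious metadata in after its first handshake.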

book/src/concepts/memory.md

Lines changed: 72 additions & 7 deletions
@@ -236,18 +236,40 @@ A-MAC is disabled by default. Enable it in `[memory.admission]`:
```toml
[memory.admission]
enabled = true
-threshold = 0.30 # Cosine similarity ceiling; messages more similar than this to recent entries are rejected (default: 0.30)
-fast_path_margin = 0.10 # If the nearest-neighbor score is below (threshold - margin), skip the full check and admit immediately (default: 0.10)
+threshold = 0.40 # Composite score threshold; messages below this are rejected (default: 0.40)
+fast_path_margin = 0.15 # Skip full check and admit immediately when score >= threshold + margin (default: 0.15)
admission_provider = "fast" # Provider name for LLM-assisted admission decisions (optional)

[memory.admission.weights]
-recency = 0.4 # Weight for how recently similar content was stored
-importance = 0.3 # Weight for message importance score (requires importance_enabled = true)
-similarity = 0.3 # Weight for raw embedding similarity
+future_utility = 0.30 # LLM-estimated future reuse probability (heuristic mode only)
+factual_confidence = 0.15 # Inverse of hedging markers (e.g. "I think", "maybe")
+semantic_novelty = 0.30 # 1 - max similarity to existing memories
+temporal_recency = 0.10 # Always 1.0 at write time
+content_type_prior = 0.15 # Role-based prior (user messages score higher)
```

The `fast_path_margin` short-circuits the admission check for clearly novel messages, reducing embedding lookups on low-similarity content. When `admission_provider` is set, borderline cases (similarity near `threshold`) are escalated to an LLM for a binary admit/reject decision; without it, the threshold comparison is the sole gate.
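The composite decision can be sketched as a weighted sum over the five factors. This is an illustrative model only, using the default weights and thresholds from the config above; the factor inputs and the `admit` helper (including its return values) are hypothetical, not zeph's API:

```python
# Illustrative A-MAC admission scoring sketch. Weight names and default
# values come from the config above; everything else is an assumption.
WEIGHTS = {
    "future_utility": 0.30,
    "factual_confidence": 0.15,
    "semantic_novelty": 0.30,
    "temporal_recency": 0.10,
    "content_type_prior": 0.15,
}

def admission_score(factors):
    """Weighted sum of per-factor scores, each assumed to lie in [0, 1]."""
    return sum(WEIGHTS[name] * factors[name] for name in WEIGHTS)

def admit(factors, threshold=0.40, fast_path_margin=0.15):
    score = admission_score(factors)
    if score >= threshold + fast_path_margin:
        return "admit-fast"  # clearly above threshold: skip the full check
    return "admit" if score >= threshold else "reject"
```

With the defaults, a message scoring 0.55 or higher takes the fast path, one between 0.40 and 0.55 goes through the full check (where the optional LLM escalation would apply), and one below 0.40 is rejected.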

### RL-Based Admission Strategy

The default `heuristic` strategy uses static weights and an optional LLM call for the `future_utility` factor. The `rl` strategy replaces the `future_utility` LLM call with a trained logistic regression model that learns from actual recall outcomes.

The RL model collects `(query, content, was_recalled)` triples from every admitted and rejected message over time. When the training corpus reaches `rl_min_samples`, the model is trained and deployed. Below that threshold the system automatically falls back to `heuristic`.

```toml
[memory.admission]
enabled = true
admission_strategy = "rl" # "heuristic" (default) or "rl"
rl_min_samples = 500 # Training samples required before RL activates (default: 500)
rl_retrain_interval_secs = 3600 # Background retraining interval in seconds (default: 3600)
```

> [!WARNING]
> `admission_strategy = "rl"` is currently a preview feature. The model infrastructure is wired and sample collection is active, but the trained model is not yet connected to the admission path — the system will emit a startup warning and fall back to `heuristic`. Full RL-gated admission is tracked in [#2416](https://github.com/bug-ops/zeph/issues/2416).

> [!NOTE]
> Migration 055 adds the tables required for RL sample storage. Run `zeph --migrate-config` when upgrading an existing installation.
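A minimal sketch of the gate described above, assuming a hand-rolled logistic model over message features and the documented fall-back-below-`rl_min_samples` behavior. The class, method names, and feature representation are invented for illustration and do not reflect zeph's internals:

```python
import math

# Sketch of an RL admission gate: a logistic model trained from
# (features, was_recalled) samples, falling back to the heuristic
# score until rl_min_samples have been collected and a model exists.
def sigmoid(z):
    return 1.0 / (1.0 + math.exp(-z))

class RlAdmission:
    def __init__(self, rl_min_samples=500):
        self.rl_min_samples = rl_min_samples
        self.samples = []    # (features, was_recalled) training triples
        self.weights = None  # set once a model has been trained

    def record(self, features, was_recalled):
        self.samples.append((features, was_recalled))

    def ready(self):
        return len(self.samples) >= self.rl_min_samples and self.weights is not None

    def predict(self, features):
        z = sum(w * x for w, x in zip(self.weights, features))
        return sigmoid(z)

    def admit(self, features, heuristic_score, threshold=0.40):
        if not self.ready():  # documented fallback to the heuristic gate
            return heuristic_score >= threshold
        return self.predict(features) >= threshold
```

The `was_recalled` signal makes this a delayed-reward problem: a write only proves its value when a later query recalls it, which is why the model trains in a background loop rather than online per message.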
## MemScene Consolidation
MemScene groups semantically related messages into *scenes* — short-lived narrative units covering a coherent sub-topic within a session. Scenes are detected automatically in the background and consolidated into a single embedding before the individual messages are demoted in the recall index. This compresses the vector space without discarding information: a scene embedding captures the collective meaning of its member messages, and scene summaries are searchable in future sessions.
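The consolidation step can be illustrated by pooling member embeddings into one scene vector. Mean pooling is an assumption here (zeph may weight members differently), and the function name is invented:

```python
# Illustrative MemScene consolidation: collapse the embeddings of a
# scene's member messages into a single scene embedding via mean pooling.
def consolidate_scene(member_embeddings):
    n = len(member_embeddings)
    dim = len(member_embeddings[0])
    return [sum(vec[i] for vec in member_embeddings) / n for i in range(dim)]

scene = consolidate_scene([[1.0, 3.0], [3.0, 5.0]])  # → [2.0, 4.0]
```

One vector now stands in for the whole scene in the recall index, which is how the vector space shrinks without discarding the demoted member messages.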
@@ -289,7 +311,24 @@ Proactive and reactive compression are mutually exclusive per turn: if proactive
Proactive compression emits two metrics: `compression_events` (count) and `compression_tokens_saved` (cumulative tokens freed).

-> **Note:** Validation rejects `threshold_tokens < 1000` and `max_summary_tokens < 128` at startup.
+> [!NOTE]
+> Validation rejects `threshold_tokens < 1000` and `max_summary_tokens < 128` at startup.

### Tool Output Archive (Memex)

When `archive_tool_outputs = true`, Zeph saves the full body of every tool output in the compaction range to SQLite before summarization begins. The archived entries are stored in the `tool_overflow` table with `archive_type = 'archive'` and are excluded from the normal overflow cleanup pass.

During compaction the LLM sees placeholder messages instead of the full outputs, keeping the summarization prompt small. After the LLM produces its summary, Zeph appends UUID reference lines (one per archived output) to the summary text. This gives you a complete audit trail of tool outputs that survived context compaction.

This feature is disabled by default because it increases SQLite storage usage. Enable it when you need durable tool output history across long sessions:

```toml
[memory.compression]
archive_tool_outputs = true
```

> [!TIP]
> Tool output archives are written by database migration 054. Run `zeph --migrate-config` if you are upgrading an existing installation.
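The archive-then-reference flow can be sketched against an in-memory SQLite database. The `tool_overflow` table name and `archive_type = 'archive'` value come from the text above; the column layout and helper functions are otherwise illustrative:

```python
import sqlite3
import uuid

# Sketch of the Memex flow: tool output bodies are saved to SQLite
# before compaction, then UUID back-reference lines are appended to the
# compaction summary. Schema and helpers are assumptions, not zeph code.
db = sqlite3.connect(":memory:")
db.execute(
    "CREATE TABLE tool_overflow (id TEXT PRIMARY KEY, body TEXT, archive_type TEXT)"
)

def archive_outputs(outputs):
    """Store each tool output body under a fresh UUID; return the UUIDs."""
    refs = []
    for body in outputs:
        ref = str(uuid.uuid4())
        db.execute("INSERT INTO tool_overflow VALUES (?, ?, 'archive')", (ref, body))
        refs.append(ref)
    return refs

def append_references(summary, refs):
    """Append one back-reference line per archived output to the summary."""
    return "\n".join([summary] + [f"[archived tool output: {r}]" for r in refs])

refs = archive_outputs(["$ cargo test\n...", "HTTP 200 OK\n..."])
summary = append_references("Compacted 2 tool calls.", refs)
```

Because the references survive inside the summary text, a later turn can resolve a UUID back to its full body even though the original messages left the live context.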
## Failure-Driven Compression Guidelines

@@ -310,7 +349,33 @@ update_interval_secs = 300 # Seconds between background updater checks (de
max_stored_pairs = 100 # Maximum unused failure pairs retained (default: 100)
```

-> **Note:** Guidelines are injected only when `enabled = true` and at least one guidelines version exists in SQLite. The guidelines document grows incrementally as the agent accumulates failure experience.
+> [!NOTE]
+> Guidelines are injected only when `enabled = true` and at least one guidelines version exists in SQLite. The guidelines document grows incrementally as the agent accumulates failure experience.

### Per-Category Compression Guidelines

By default a single global guidelines document is maintained for the entire conversation. When `categorized_guidelines = true`, the updater maintains **four independent documents** — one per content category — and injects only the relevant document during compaction:

| Category | Content covered |
|----------|----------------|
| `tool_output` | Tool call results, shell output, file reads |
| `assistant_reasoning` | Agent reasoning steps and explanations |
| `user_context` | User instructions, preferences, and goals |
| `unknown` | Messages that do not match a category |

Each category runs its own update cycle: a category is updated only when its unprocessed failure pair count reaches `update_threshold`, avoiding unnecessary LLM calls for categories that have few failures.

Enable per-category guidelines alongside the base feature:

```toml
[memory.compression_guidelines]
enabled = true
categorized_guidelines = true # Maintain separate guidelines per content category (default: false)
update_threshold = 5
```

> [!TIP]
> Per-category guidelines reduce the chance that tool-output compression rules interfere with how assistant reasoning is compressed, and vice versa. Enable this when you have long sessions mixing heavy tool use with extended reasoning chains.
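The per-category update cycle can be sketched as follows. The four category names come from the table above; the `GuidelineUpdater` class is invented for illustration, and its `regenerate` stub stands in for the LLM call that rewrites a category's guidelines document:

```python
from collections import defaultdict

# Sketch of the per-category ACON update cycle: failure pairs accumulate
# per category, and a category's guidelines document is regenerated only
# once its unprocessed pair count reaches update_threshold.
CATEGORIES = {"tool_output", "assistant_reasoning", "user_context", "unknown"}

class GuidelineUpdater:
    def __init__(self, update_threshold=5):
        self.update_threshold = update_threshold
        self.pending = defaultdict(list)  # category -> unprocessed failure pairs
        self.documents = {}               # category -> guidelines text

    def add_failure_pair(self, category, pair):
        if category not in CATEGORIES:
            category = "unknown"
        self.pending[category].append(pair)

    def run_update_cycle(self):
        """Regenerate only the categories that crossed the threshold."""
        updated = []
        for category, pairs in self.pending.items():
            if len(pairs) >= self.update_threshold:
                self.documents[category] = self.regenerate(category, pairs)
                pairs.clear()
                updated.append(category)
        return updated

    def regenerate(self, category, pairs):
        # Stand-in for the LLM call that produces the guidelines block.
        return f"<compression-guidelines category={category} pairs={len(pairs)}>"
```

Gating each regeneration on its own counter is what keeps a chatty `tool_output` category from forcing LLM rewrites of the rarely-failing `user_context` document.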
## Graph Memory

book/src/reference/configuration.md

Lines changed: 18 additions & 0 deletions
@@ -287,12 +287,29 @@ importance_weight = 0.15 # Blend weight for importance in ranking,
[memory.routing]
strategy = "heuristic" # Routing strategy for memory backend selection (default: "heuristic")

# [memory.admission]
# enabled = false # Enable A-MAC adaptive memory admission control (default: false)
# threshold = 0.40 # Composite score threshold; messages below this are rejected (default: 0.40)
# fast_path_margin = 0.15 # Admit immediately when score >= threshold + margin (default: 0.15)
# admission_provider = "fast" # Provider for LLM-assisted admission decisions (optional, default: "")
# admission_strategy = "heuristic" # "heuristic" (default) or "rl" (preview — falls back to heuristic)
# rl_min_samples = 500 # Training samples required before RL model activates (default: 500)
# rl_retrain_interval_secs = 3600 # Background RL retraining interval in seconds (default: 3600)
#
# [memory.admission.weights]
# future_utility = 0.30 # LLM-estimated future reuse probability (heuristic mode only)
# factual_confidence = 0.15 # Inverse of hedging markers
# semantic_novelty = 0.30 # 1 - max similarity to existing memories
# temporal_recency = 0.10 # Always 1.0 at write time
# content_type_prior = 0.15 # Role-based prior

[memory.compression]
strategy = "reactive" # "reactive" (default) or "proactive"
# Proactive strategy fields (required when strategy = "proactive"):
# threshold_tokens = 80000 # Fire compression when context exceeds this token count (>= 1000)
# max_summary_tokens = 4000 # Cap for the compressed summary (>= 128)
# model = "" # Reserved — currently unused
# archive_tool_outputs = false # Archive tool output bodies to SQLite before compaction (default: false)

[memory.compression.probe]
# enabled = false # Enable compaction probe validation (default: false)
@@ -310,6 +327,7 @@ enabled = false # Enable failure-driven compression guidelines (d
# detection_window_turns = 10 # Turns after hard compaction to watch for context loss (default: 10)
# update_interval_secs = 300 # Interval in seconds between background updater checks (default: 300)
# max_stored_pairs = 100 # Maximum unused failure pairs retained before cleanup (default: 100)
# categorized_guidelines = false # Maintain separate guideline documents per content category (default: false)

[memory.graph]
enabled = false # Enable graph memory (default: false, requires graph-memory feature)
