feat: add context recovery workflows and diagnostics (br-dzv)

dmoliveira · dmoliveira · commit e3190c9f7c2d · 2026-02-13T18:59:18.000+11:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -33,6 +33,7 @@ All notable changes to this project are documented in this file.
 - Added `instructions/rules_team_pack_examples.md` with practical team rule-pack layout and sample rule files.
 - Added `instructions/context_resilience_policy_schema.md` defining truncation modes, protected artifacts, and resilience notification levels for Epic 11 Task 11.1.
 - Added `scripts/context_resilience.py` implementing resilience policy resolution and deterministic context pruning primitives.
+- Added recovery workflow planning in `scripts/context_resilience.py` with resume hints, safe fallback steps, and diagnostics payloads.
 
 ### Changes
 - Documented extension evaluation outcomes and when each tool is the better fit.
@@ -65,6 +66,7 @@ All notable changes to this project are documented in this file.
 - Added `/doctor` rules subsystem integration and expanded install/selftest coverage for rules command workflows.
 - Expanded rules verification to cover always-apply behavior, equal-priority lexical ordering, and richer discovery scenarios.
 - Expanded selftest coverage for context resilience policy validation and pruning behavior (dedupe, superseded writes, stale error purge, protected evidence retention).
+- Expanded selftest coverage for context recovery outcomes, including resume hints and fallback-path diagnostics.
 
 ## v0.2.0 - 2026-02-12
 
diff --git a/IMPLEMENTATION_ROADMAP.md b/IMPLEMENTATION_ROADMAP.md
@@ -47,7 +47,7 @@ Use this map to avoid overlapping implementations.
 | E8 | Keyword-Triggered Execution Modes | done | High | E1, E4 | bd-302, bd-2fb, bd-2zq, bd-3dp | Fast power-mode activation from prompt text |
 | E9 | Conditional Rules Injector | done | High | E1 | bd-1q8, bd-3rj, bd-fo8, bd-2ik | Enforce project conventions with scoped rules |
 | E10 | Auto Slash Command Detector | paused | Medium | E1, E8 | TBD | Resume only if intent precision stays high in prototypes |
-| E11 | Context-Window Resilience Toolkit | in_progress | High | E4 | bd-2tj, bd-n9y | Improve long-session stability and recovery |
+| E11 | Context-Window Resilience Toolkit | in_progress | High | E4 | bd-2tj, bd-n9y, bd-2t0 | Improve long-session stability and recovery |
 | E12 | Provider/Model Fallback Visibility | planned | Medium | E5 | TBD | Explain why model routing decisions happen |
 | E13 | Browser Automation Profile Switching | planned | Medium | E1 | TBD | Toggle Playwright/agent-browser with checks |
 | E14 | Plan-to-Execution Bridge Command | planned | Medium | E2, E3 | TBD | Execute validated plans with progress tracking |
@@ -482,10 +482,11 @@ Every command-oriented epic must ship all of the following:
   - [x] Subtask 11.2.2: Add old-error input purge with turn thresholds
   - [x] Subtask 11.2.3: Preserve critical evidence and command outcomes
   - [x] Notes: Added `scripts/context_resilience.py` with policy resolution plus deterministic pruning (dedupe, superseded writes, stale error purge, budget trim) while preserving protected artifacts and latest command outcomes.
-- [ ] Task 11.3: Recovery workflows
-  - [ ] Subtask 11.3.1: Add automatic resume hints after successful recovery
-  - [ ] Subtask 11.3.2: Add safe fallback when recovery cannot proceed
-  - [ ] Subtask 11.3.3: Add diagnostics for pruning/recovery actions
+- [x] Task 11.3: Recovery workflows
+  - [x] Subtask 11.3.1: Add automatic resume hints after successful recovery
+  - [x] Subtask 11.3.2: Add safe fallback when recovery cannot proceed
+  - [x] Subtask 11.3.3: Add diagnostics for pruning/recovery actions
+  - [x] Notes: Added recovery-plan generation in `scripts/context_resilience.py` with resume hints, safe fallback actions, and structured pruning/recovery diagnostics.
 - [ ] Task 11.4: Validation and docs
   - [ ] Subtask 11.4.1: Add stress tests for long-session behavior
   - [ ] Subtask 11.4.2: Add docs for tuning resilience settings
diff --git a/README.md b/README.md
@@ -550,6 +550,7 @@ Engine behavior currently includes:
 - superseded write pruning (older writes to same target path)
 - stale error purging once newer successful command outcomes exist beyond threshold
 - preservation of protected artifacts and latest command outcomes as critical evidence
+- recovery planning with automatic resume hints, safe fallback steps, and pruning diagnostics
 
 ## Background jobs inside OpenCode 🧵
 
diff --git a/scripts/context_resilience.py b/scripts/context_resilience.py
@@ -241,3 +241,111 @@ def prune_context(
         "kept_count": len(kept_messages),
         "dropped_count": len(dropped),
     }
+
+
+def _recovery_text_field(value: Any) -> str:
+    """Return *value* as stripped text, mapping ``None`` to an empty string."""
+    return "" if value is None else str(value).strip()
+
+
+def build_recovery_plan(
+    original_messages: list[dict[str, Any]],
+    pruned_report: dict[str, Any],
+    policy: dict[str, Any],
+) -> dict[str, Any]:
+    """Build a resume-or-fallback plan from a pruning report.
+
+    Scans the retained messages for the latest successful command (integer
+    ``exit_code`` equal to 0) and the latest message whose ``kind`` is
+    ``"error"``, then picks one of three outcomes:
+
+    * a success anchor exists: resume hint naming that command;
+    * only an error exists: safe fallback steps with ``can_resume`` False;
+    * neither exists: a generic resume hint.
+
+    Args:
+        original_messages: Messages before pruning; only the count is used.
+        pruned_report: Report providing ``messages``, ``drop_counts`` and
+            ``dropped_count``; missing or ``None`` values count as empty/zero.
+        policy: Policy dict supplying ``notification_level`` and
+            ``truncation_mode``; also passed to ``_is_protected``.
+
+    Returns:
+        A dict with ``can_resume``, ``recovery_action``, ``resume_hint`` and
+        ``fallback`` keys plus a ``diagnostics`` dict of counts and policy
+        settings.
+    """
+    # ``or`` fallbacks keep an explicit None in the report from raising.
+    kept_messages = list(pruned_report.get("messages") or [])
+    drop_counts = dict(pruned_report.get("drop_counts") or {})
+
+    latest_success: dict[str, Any] | None = None
+    latest_error: dict[str, Any] | None = None
+    for idx, message in enumerate(kept_messages):
+        exit_code = message.get("exit_code")
+        # bool is an int subclass; False must not pass as exit status 0.
+        if (
+            isinstance(exit_code, int)
+            and not isinstance(exit_code, bool)
+            and exit_code == 0
+        ):
+            latest_success = {
+                "index": idx,
+                "command": _recovery_text_field(message.get("command")),
+                "tool_name": _recovery_text_field(message.get("tool_name")),
+            }
+        if _recovery_text_field(message.get("kind")).lower() == "error":
+            latest_error = {
+                "index": idx,
+                "command": _recovery_text_field(message.get("command")),
+                "content": _recovery_text_field(message.get("content")),
+            }
+
+    protected_retained = sum(
+        1 for message in kept_messages if _is_protected(message, policy)
+    )
+
+    if latest_success is not None:
+        # An empty command falls back to a generic label in the hint.
+        command = latest_success["command"] or "latest successful command"
+        plan = {
+            "can_resume": True,
+            "recovery_action": "resume_hint",
+            "resume_hint": f"Resume from the last successful step: `{command}`.",
+            "fallback": None,
+        }
+    elif latest_error is not None:
+        failed_command = latest_error["command"] or "last failed command"
+        plan = {
+            "can_resume": False,
+            "recovery_action": "safe_fallback",
+            "resume_hint": None,
+            "fallback": {
+                "reason": "no_successful_recovery_anchor",
+                "steps": [
+                    "restore full context snapshot for the current workflow",
+                    f"re-run `{failed_command}` in isolation with explicit logging",
+                    "request operator review before applying any destructive edits",
+                ],
+            },
+        }
+    else:
+        plan = {
+            "can_resume": True,
+            "recovery_action": "resume_hint",
+            "resume_hint": "Resume from the latest retained decision and rerun validation.",
+            "fallback": None,
+        }
+
+    return {
+        **plan,
+        "diagnostics": {
+            "original_count": len(original_messages),
+            "kept_count": len(kept_messages),
+            "dropped_count": int(pruned_report.get("dropped_count") or 0),
+            "drop_counts": drop_counts,
+            "protected_retained_count": protected_retained,
+            "notification_level": str(policy.get("notification_level", "normal")),
+            "truncation_mode": str(policy.get("truncation_mode", "default")),
+        },
+    }
diff --git a/scripts/selftest.py b/scripts/selftest.py
@@ -24,7 +24,11 @@
     validate_schema,
 )
 from keyword_mode_schema import resolve_prompt_modes  # type: ignore
-from context_resilience import prune_context, resolve_policy  # type: ignore
+from context_resilience import (  # type: ignore
+    build_recovery_plan,
+    prune_context,
+    resolve_policy,
+)
 from rules_engine import (  # type: ignore
     discover_rules,
     parse_frontmatter,
@@ -1955,6 +1959,60 @@ def run_bg(*args: str) -> subprocess.CompletedProcess[str]:
             "context pruning should preserve latest command outcomes as critical evidence",
         )
 
+        recovery_plan = build_recovery_plan(
+            context_messages, pruned_context, resilience_policy
+        )
+        expect(
+            recovery_plan.get("can_resume") is True,
+            "recovery plan should allow resume when success anchor exists",
+        )
+        expect(
+            recovery_plan.get("recovery_action") == "resume_hint",
+            "recovery plan should emit resume hints after successful recovery",
+        )
+        expect(
+            "make validate" in str(recovery_plan.get("resume_hint", "")),
+            "resume hint should reference latest successful command",
+        )
+        expect(
+            isinstance(recovery_plan.get("diagnostics", {}).get("drop_counts"), dict),
+            "recovery diagnostics should include pruning reason counts",
+        )
+
+        failed_only_messages = [
+            {
+                "role": "tool",
+                "tool_name": "bash",
+                "kind": "error",
+                "command": "make install-test",
+                "exit_code": 2,
+                "content": "missing dependency",
+                "turn": 1,
+            },
+            {
+                "role": "assistant",
+                "kind": "analysis",
+                "content": "investigate dependency mismatch",
+                "turn": 2,
+            },
+        ]
+        failed_pruned = prune_context(failed_only_messages, resilience_policy)
+        failed_plan = build_recovery_plan(
+            failed_only_messages, failed_pruned, resilience_policy
+        )
+        expect(
+            failed_plan.get("can_resume") is False,
+            "recovery plan should block resume when no success anchor is available",
+        )
+        expect(
+            failed_plan.get("recovery_action") == "safe_fallback",
+            "recovery plan should provide safe fallback path for unrecoverable contexts",
+        )
+        expect(
+            bool((failed_plan.get("fallback") or {}).get("steps")),  # may be None
+            "safe fallback should include actionable recovery steps",
+        )
+
         wizard_state_path = (
             home / ".config" / "opencode" / "my_opencode-install-state.json"
         )