Skip to content

Commit f646c7c

Browse files
authored
Merge pull request #87 from dmoliveira/my_opencode-e17-recovery-engine
Implement E17-T2 recovery engine and resume replay backend
2 parents 1c381af + 38c7100 commit f646c7c

File tree

6 files changed

+568
-7
lines changed

6 files changed

+568
-7
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ All notable changes to this project are documented in this file.
5353
- Added `scripts/todo_command.py` with `/todo status` and `/todo enforce` diagnostics for runtime compliance visibility.
5454
- Added `/todo`, `/todo-status`, and `/todo-enforce` aliases in `opencode.json`.
5555
- Added `instructions/resume_policy_model.md` defining interruption classes, resume eligibility/cool-down rules, attempt limits, escalation semantics, and deterministic reason codes for Epic 17 Task 17.1.
56+
- Added `scripts/recovery_engine.py` implementing checkpoint loading, eligibility evaluation, idempotency gating, and persisted resume decision/transition trail events for Epic 17 Task 17.2.
5657

5758
### Changes
5859
- Documented extension evaluation outcomes and when each tool is the better fit.
@@ -101,6 +102,7 @@ All notable changes to this project are documented in this file.
101102
- Integrated todo compliance checks into `/doctor` summary, installer self-checks, and install-test smoke coverage.
102103
- Expanded selftest coverage for todo transition gating, completion blocking, and bypass audit-event payload validation.
103104
- Marked Epic 17 as in progress and completed Task 17.1 resume-policy definition notes in the roadmap.
105+
- Added `/start-work recover` backend path with explicit interruption class handling and approval-gated replay for non-idempotent pending steps.
104106

105107
## v0.2.0 - 2026-02-12
106108

IMPLEMENTATION_ROADMAP.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -648,10 +648,11 @@ Every command-oriented epic must ship all of the following:
648648
- [x] Subtask 17.1.2: Define resume eligibility and cool-down rules
649649
- [x] Subtask 17.1.3: Define max resume attempts and escalation path
650650
- [x] Notes: Added `instructions/resume_policy_model.md` with interruption classes, deterministic eligibility/cool-down/attempt-limit rules, reason codes, and audit event contract.
651-
- [ ] Task 17.2: Implement recovery engine
652-
- [ ] Subtask 17.2.1: Load last safe checkpoint and reconstruct state
653-
- [ ] Subtask 17.2.2: Re-run only idempotent or explicitly approved steps
654-
- [ ] Subtask 17.2.3: Persist resume trail for audit/debugging
651+
- [x] Task 17.2: Implement recovery engine
652+
- [x] Subtask 17.2.1: Load last safe checkpoint and reconstruct state
653+
- [x] Subtask 17.2.2: Re-run only idempotent or explicitly approved steps
654+
- [x] Subtask 17.2.3: Persist resume trail for audit/debugging
655+
- [x] Notes: Added `scripts/recovery_engine.py` and `/start-work recover` backend path for checkpoint eligibility checks, approval-gated replay, and persisted resume audit trail events.
655656
- [ ] Task 17.3: User control surfaces
656657
- [ ] Subtask 17.3.1: Add `/resume status`, `/resume now`, `/resume disable` commands
657658
- [ ] Subtask 17.3.2: Add clear output explaining why resume did/did not trigger

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,13 @@ Epic 17 Task 17.1 defines the baseline policy contract for safe auto-resume beha
640640
- eligibility gate: checkpoint availability + idempotency + artifact readiness + attempt budget
641641
- safety controls: class-specific cool-down windows and escalation after max attempts
642642

643+
Epic 17 Task 17.2 implements the recovery backend:
644+
645+
- engine module: `scripts/recovery_engine.py`
646+
- backend path: `/start-work recover --interruption-class <tool_failure|timeout|context_reset|process_crash> --json`
647+
- approval gate: non-idempotent pending steps require explicit `--approve-step <ordinal>`
648+
- audit trail: persisted `resume_decision` and `resume_transition` events under runtime `resume.trail`
649+
643650
## Context resilience policy
644651

645652
Epic 11 Task 11.1 defines the baseline policy schema for context-window resilience:

scripts/recovery_engine.py

Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
#!/usr/bin/env python3
2+
3+
from __future__ import annotations
4+
5+
from copy import deepcopy
6+
from datetime import UTC, datetime
7+
from typing import Any
8+
9+
10+
INTERRUPTION_COOLDOWNS = {
11+
"tool_failure": 30,
12+
"timeout": 120,
13+
"context_reset": 10,
14+
"process_crash": 60,
15+
}
16+
MAX_RESUME_ATTEMPTS_DEFAULT = 3
17+
18+
19+
def now_iso() -> str:
20+
return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
21+
22+
23+
def _parse_iso(value: Any) -> datetime | None:
24+
if not isinstance(value, str) or not value.strip():
25+
return None
26+
try:
27+
return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(UTC)
28+
except ValueError:
29+
return None
30+
31+
32+
def _normalize_steps(runtime: dict[str, Any]) -> list[dict[str, Any]]:
33+
raw_steps = runtime.get("steps")
34+
if not isinstance(raw_steps, list):
35+
return []
36+
steps: list[dict[str, Any]] = []
37+
for step in raw_steps:
38+
if isinstance(step, dict):
39+
steps.append(step)
40+
return steps
41+
42+
43+
def load_last_safe_checkpoint(runtime: dict[str, Any]) -> dict[str, Any]:
44+
steps = _normalize_steps(runtime)
45+
if not steps:
46+
return {
47+
"available": False,
48+
"reason_code": "resume_missing_checkpoint",
49+
"checkpoint": None,
50+
}
51+
pending_steps = [
52+
step
53+
for step in steps
54+
if str(step.get("state") or "") in {"pending", "in_progress"}
55+
]
56+
if not pending_steps:
57+
return {
58+
"available": True,
59+
"reason_code": "resume_allowed",
60+
"checkpoint": {
61+
"status": "completed",
62+
"next_step_ordinal": None,
63+
"next_step_idempotent": True,
64+
},
65+
}
66+
next_step = pending_steps[0]
67+
return {
68+
"available": True,
69+
"reason_code": "resume_allowed",
70+
"checkpoint": {
71+
"status": str(runtime.get("status") or "unknown"),
72+
"next_step_ordinal": next_step.get("ordinal"),
73+
"next_step_idempotent": bool(next_step.get("idempotent", True)),
74+
},
75+
}
76+
77+
78+
def evaluate_resume_eligibility(
79+
runtime: dict[str, Any],
80+
interruption_class: str,
81+
*,
82+
approved_steps: set[int] | None = None,
83+
now_ts: str | None = None,
84+
) -> dict[str, Any]:
85+
approved = approved_steps or set()
86+
checkpoint_info = load_last_safe_checkpoint(runtime)
87+
if not checkpoint_info.get("available"):
88+
return {
89+
"eligible": False,
90+
"reason_code": "resume_missing_checkpoint",
91+
"checkpoint": None,
92+
"cooldown_remaining": 0,
93+
}
94+
95+
if interruption_class not in INTERRUPTION_COOLDOWNS:
96+
return {
97+
"eligible": False,
98+
"reason_code": "resume_unknown_interruption_class",
99+
"checkpoint": checkpoint_info.get("checkpoint"),
100+
"cooldown_remaining": 0,
101+
}
102+
103+
status = str(runtime.get("status") or "")
104+
if status not in {"failed", "in_progress", "completed"}:
105+
return {
106+
"eligible": False,
107+
"reason_code": "resume_missing_runtime_artifacts",
108+
"checkpoint": checkpoint_info.get("checkpoint"),
109+
"cooldown_remaining": 0,
110+
}
111+
112+
resume_meta_any = runtime.get("resume")
113+
resume_meta: dict[str, Any] = (
114+
resume_meta_any if isinstance(resume_meta_any, dict) else {}
115+
)
116+
attempt_count = int(resume_meta.get("attempt_count", 0) or 0)
117+
max_attempts = int(
118+
resume_meta.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT)
119+
or MAX_RESUME_ATTEMPTS_DEFAULT
120+
)
121+
if attempt_count >= max_attempts:
122+
return {
123+
"eligible": False,
124+
"reason_code": "resume_attempt_limit_reached",
125+
"checkpoint": checkpoint_info.get("checkpoint"),
126+
"cooldown_remaining": 0,
127+
"attempt_count": attempt_count,
128+
"max_attempts": max_attempts,
129+
}
130+
131+
current_time = _parse_iso(now_ts or now_iso())
132+
last_attempt = _parse_iso(resume_meta.get("last_attempt_at"))
133+
cooldown = INTERRUPTION_COOLDOWNS[interruption_class]
134+
remaining = 0
135+
if current_time is not None and last_attempt is not None:
136+
elapsed = int((current_time - last_attempt).total_seconds())
137+
if elapsed < cooldown:
138+
remaining = cooldown - elapsed
139+
if remaining > 0:
140+
return {
141+
"eligible": False,
142+
"reason_code": "resume_blocked_cooldown",
143+
"checkpoint": checkpoint_info.get("checkpoint"),
144+
"cooldown_remaining": remaining,
145+
"attempt_count": attempt_count,
146+
"max_attempts": max_attempts,
147+
}
148+
149+
checkpoint_raw = checkpoint_info.get("checkpoint")
150+
checkpoint: dict[str, Any] = (
151+
checkpoint_raw if isinstance(checkpoint_raw, dict) else {}
152+
)
153+
next_ordinal = checkpoint.get("next_step_ordinal")
154+
next_idempotent = bool(checkpoint.get("next_step_idempotent", True))
155+
if (
156+
isinstance(next_ordinal, int)
157+
and (not next_idempotent)
158+
and next_ordinal not in approved
159+
):
160+
return {
161+
"eligible": False,
162+
"reason_code": "resume_non_idempotent_step",
163+
"checkpoint": checkpoint,
164+
"cooldown_remaining": 0,
165+
"attempt_count": attempt_count,
166+
"max_attempts": max_attempts,
167+
}
168+
169+
return {
170+
"eligible": True,
171+
"reason_code": "resume_allowed",
172+
"checkpoint": checkpoint,
173+
"cooldown_remaining": 0,
174+
"attempt_count": attempt_count,
175+
"max_attempts": max_attempts,
176+
}
177+
178+
179+
def execute_resume(
180+
runtime: dict[str, Any],
181+
interruption_class: str,
182+
*,
183+
approved_steps: set[int] | None = None,
184+
actor: str = "system",
185+
) -> dict[str, Any]:
186+
approved = approved_steps or set()
187+
next_runtime = deepcopy(runtime)
188+
evaluation = evaluate_resume_eligibility(
189+
next_runtime,
190+
interruption_class,
191+
approved_steps=approved,
192+
)
193+
resume_meta_any = next_runtime.get("resume")
194+
resume_meta: dict[str, Any] = (
195+
resume_meta_any if isinstance(resume_meta_any, dict) else {}
196+
)
197+
trail_raw = resume_meta.get("trail")
198+
trail: list[dict[str, Any]] = trail_raw if isinstance(trail_raw, list) else []
199+
decision_at = now_iso()
200+
decision = {
201+
"event": "resume_decision",
202+
"interruption_class": interruption_class,
203+
"eligible": bool(evaluation.get("eligible")),
204+
"reason_code": evaluation.get("reason_code"),
205+
"cooldown_seconds_remaining": int(evaluation.get("cooldown_remaining", 0) or 0),
206+
"attempt": int(evaluation.get("attempt_count", 0) or 0) + 1,
207+
"max_attempts": int(
208+
evaluation.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT)
209+
or MAX_RESUME_ATTEMPTS_DEFAULT
210+
),
211+
"at": decision_at,
212+
"actor": actor,
213+
}
214+
trail.append(decision)
215+
216+
resume_meta["last_interruption_class"] = interruption_class
217+
resume_meta["last_attempt_at"] = decision_at
218+
resume_meta["attempt_count"] = int(evaluation.get("attempt_count", 0) or 0) + 1
219+
resume_meta["max_attempts"] = int(
220+
evaluation.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT)
221+
or MAX_RESUME_ATTEMPTS_DEFAULT
222+
)
223+
resume_meta["trail"] = trail
224+
next_runtime["resume"] = resume_meta
225+
226+
if not evaluation.get("eligible"):
227+
if evaluation.get("reason_code") == "resume_attempt_limit_reached":
228+
next_runtime["status"] = "resume_escalated"
229+
return {
230+
"result": "FAIL",
231+
"runtime": next_runtime,
232+
"reason_code": evaluation.get("reason_code"),
233+
"checkpoint": evaluation.get("checkpoint"),
234+
"resumed_steps": [],
235+
}
236+
237+
resumed_steps: list[int] = []
238+
for step in _normalize_steps(next_runtime):
239+
state = str(step.get("state") or "")
240+
if state == "done":
241+
continue
242+
ordinal = step.get("ordinal")
243+
if isinstance(ordinal, int):
244+
resumed_steps.append(ordinal)
245+
step["state"] = "in_progress"
246+
trail.append(
247+
{
248+
"event": "resume_transition",
249+
"step_ordinal": ordinal,
250+
"to": "in_progress",
251+
"at": now_iso(),
252+
"actor": actor,
253+
}
254+
)
255+
step["state"] = "done"
256+
trail.append(
257+
{
258+
"event": "resume_transition",
259+
"step_ordinal": ordinal,
260+
"to": "done",
261+
"at": now_iso(),
262+
"actor": actor,
263+
}
264+
)
265+
266+
next_runtime["resume"] = resume_meta
267+
all_done = all(
268+
str(step.get("state") or "") == "done"
269+
for step in _normalize_steps(next_runtime)
270+
)
271+
next_runtime["status"] = "completed" if all_done else "failed"
272+
next_runtime["finished_at"] = now_iso()
273+
return {
274+
"result": "PASS",
275+
"runtime": next_runtime,
276+
"reason_code": "resume_allowed",
277+
"checkpoint": evaluation.get("checkpoint"),
278+
"resumed_steps": resumed_steps,
279+
}

0 commit comments

Comments
 (0)