Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ All notable changes to this project are documented in this file.
- Added `scripts/todo_command.py` with `/todo status` and `/todo enforce` diagnostics for runtime compliance visibility.
- Added `/todo`, `/todo-status`, and `/todo-enforce` aliases in `opencode.json`.
- Added `instructions/resume_policy_model.md` defining interruption classes, resume eligibility/cool-down rules, attempt limits, escalation semantics, and deterministic reason codes for Epic 17 Task 17.1.
- Added `scripts/recovery_engine.py` implementing checkpoint loading, eligibility evaluation, idempotency gating, and persisted resume decision/transition trail events for Epic 17 Task 17.2.

### Changes
- Documented extension evaluation outcomes and when each tool is the better fit.
Expand Down Expand Up @@ -101,6 +102,7 @@ All notable changes to this project are documented in this file.
- Integrated todo compliance checks into `/doctor` summary, installer self-checks, and install-test smoke coverage.
- Expanded selftest coverage for todo transition gating, completion blocking, and bypass audit-event payload validation.
- Marked Epic 17 as in progress and completed Task 17.1 resume-policy definition notes in the roadmap.
- Added `/start-work recover` backend path with explicit interruption class handling and approval-gated replay for non-idempotent pending steps.

## v0.2.0 - 2026-02-12

Expand Down
9 changes: 5 additions & 4 deletions IMPLEMENTATION_ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -648,10 +648,11 @@ Every command-oriented epic must ship all of the following:
- [x] Subtask 17.1.2: Define resume eligibility and cool-down rules
- [x] Subtask 17.1.3: Define max resume attempts and escalation path
- [x] Notes: Added `instructions/resume_policy_model.md` with interruption classes, deterministic eligibility/cool-down/attempt-limit rules, reason codes, and audit event contract.
- [ ] Task 17.2: Implement recovery engine
- [ ] Subtask 17.2.1: Load last safe checkpoint and reconstruct state
- [ ] Subtask 17.2.2: Re-run only idempotent or explicitly approved steps
- [ ] Subtask 17.2.3: Persist resume trail for audit/debugging
- [x] Task 17.2: Implement recovery engine
- [x] Subtask 17.2.1: Load last safe checkpoint and reconstruct state
- [x] Subtask 17.2.2: Re-run only idempotent or explicitly approved steps
- [x] Subtask 17.2.3: Persist resume trail for audit/debugging
- [x] Notes: Added `scripts/recovery_engine.py` and `/start-work recover` backend path for checkpoint eligibility checks, approval-gated replay, and persisted resume audit trail events.
- [ ] Task 17.3: User control surfaces
- [ ] Subtask 17.3.1: Add `/resume status`, `/resume now`, `/resume disable` commands
- [ ] Subtask 17.3.2: Add clear output explaining why resume did/did not trigger
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,13 @@ Epic 17 Task 17.1 defines the baseline policy contract for safe auto-resume beha
- eligibility gate: checkpoint availability + idempotency + artifact readiness + attempt budget
- safety controls: class-specific cool-down windows and escalation after max attempts

Epic 17 Task 17.2 implements the recovery backend:

- engine module: `scripts/recovery_engine.py`
- backend path: `/start-work recover --interruption-class <tool_failure|timeout|context_reset|process_crash> --json`
- approval gate: non-idempotent pending steps require explicit `--approve-step <ordinal>`
- audit trail: persisted `resume_decision` and `resume_transition` events under runtime `resume.trail`

## Context resilience policy

Epic 11 Task 11.1 defines the baseline policy schema for context-window resilience:
Expand Down
279 changes: 279 additions & 0 deletions scripts/recovery_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
#!/usr/bin/env python3

from __future__ import annotations

from copy import deepcopy
from datetime import UTC, datetime
from typing import Any


# Minimum wait, in seconds, between resume attempts for each recognised
# interruption class. Classes missing from this table are rejected as unknown.
INTERRUPTION_COOLDOWNS: dict[str, int] = dict(
    tool_failure=30,
    timeout=120,
    context_reset=10,
    process_crash=60,
)

# Attempt budget used when the runtime's resume metadata does not provide one.
MAX_RESUME_ATTEMPTS_DEFAULT: int = 3


def now_iso() -> str:
return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")


def _parse_iso(value: Any) -> datetime | None:
if not isinstance(value, str) or not value.strip():
return None
try:
return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(UTC)
except ValueError:
return None


def _normalize_steps(runtime: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``runtime["steps"]`` in their stored order.

    A missing or non-list ``steps`` value yields an empty list, and entries
    that are not dicts are skipped. The returned list is new, but its
    elements are the same step objects held by ``runtime``, so mutating a
    step mutates the runtime.
    """
    candidates = runtime.get("steps")
    return (
        [entry for entry in candidates if isinstance(entry, dict)]
        if isinstance(candidates, list)
        else []
    )


def load_last_safe_checkpoint(runtime: dict[str, Any]) -> dict[str, Any]:
    """Derive a resume checkpoint from the steps recorded in ``runtime``.

    Returns:
        A dict with ``available``, ``reason_code`` and ``checkpoint`` keys.

        * No usable steps: ``available`` is ``False``, the reason code is
          ``resume_missing_checkpoint`` and ``checkpoint`` is ``None``.
        * No step in state ``pending`` or ``in_progress``: the checkpoint
          reports status ``completed`` with no next step.
        * Otherwise the checkpoint carries the runtime status (``unknown``
          when unset) plus the ordinal and idempotency flag of the first
          such step. A step without an ``idempotent`` key counts as
          idempotent.
    """
    known_steps = _normalize_steps(runtime)
    if not known_steps:
        return {
            "available": False,
            "reason_code": "resume_missing_checkpoint",
            "checkpoint": None,
        }

    resumable_states = ("pending", "in_progress")
    upcoming = next(
        (
            item
            for item in known_steps
            if str(item.get("state") or "") in resumable_states
        ),
        None,
    )

    if upcoming is None:
        summary: dict[str, Any] = {
            "status": "completed",
            "next_step_ordinal": None,
            "next_step_idempotent": True,
        }
    else:
        summary = {
            "status": str(runtime.get("status") or "unknown"),
            "next_step_ordinal": upcoming.get("ordinal"),
            "next_step_idempotent": bool(upcoming.get("idempotent", True)),
        }

    return {
        "available": True,
        "reason_code": "resume_allowed",
        "checkpoint": summary,
    }


def evaluate_resume_eligibility(
    runtime: dict[str, Any],
    interruption_class: str,
    *,
    approved_steps: set[int] | None = None,
    now_ts: str | None = None,
) -> dict[str, Any]:
    """Decide whether ``runtime`` may be resumed after an interruption.

    The checks run in this order and the first failure determines the result:

    1. a checkpoint can be derived (``resume_missing_checkpoint``)
    2. ``interruption_class`` is a key of ``INTERRUPTION_COOLDOWNS``
       (``resume_unknown_interruption_class``)
    3. the runtime status is ``failed``, ``in_progress`` or ``completed``
       (``resume_missing_runtime_artifacts``)
    4. the attempt budget is not exhausted (``resume_attempt_limit_reached``)
    5. the class cool-down since ``resume.last_attempt_at`` has elapsed
       (``resume_blocked_cooldown``)
    6. the next step is idempotent or explicitly approved
       (``resume_non_idempotent_step``)

    Args:
        runtime: Runtime state; read only, never mutated here.
        interruption_class: Interruption class that triggered the resume.
        approved_steps: Ordinals of non-idempotent steps cleared for replay.
        now_ts: ISO-8601 "current time" override. When omitted or
            unparseable, the real clock is used.

    Returns:
        A dict with ``eligible``, ``reason_code``, ``checkpoint`` and
        ``cooldown_remaining`` (seconds). Results produced by check 4 or
        later also carry ``attempt_count`` and ``max_attempts``.
    """
    approved = approved_steps or set()
    checkpoint_info = load_last_safe_checkpoint(runtime)
    if not checkpoint_info.get("available"):
        return {
            "eligible": False,
            "reason_code": "resume_missing_checkpoint",
            "checkpoint": None,
            "cooldown_remaining": 0,
        }

    if interruption_class not in INTERRUPTION_COOLDOWNS:
        return {
            "eligible": False,
            "reason_code": "resume_unknown_interruption_class",
            "checkpoint": checkpoint_info.get("checkpoint"),
            "cooldown_remaining": 0,
        }

    status = str(runtime.get("status") or "")
    if status not in {"failed", "in_progress", "completed"}:
        return {
            "eligible": False,
            "reason_code": "resume_missing_runtime_artifacts",
            "checkpoint": checkpoint_info.get("checkpoint"),
            "cooldown_remaining": 0,
        }

    resume_meta_any = runtime.get("resume")
    resume_meta: dict[str, Any] = (
        resume_meta_any if isinstance(resume_meta_any, dict) else {}
    )
    attempt_count = int(resume_meta.get("attempt_count", 0) or 0)
    max_attempts = int(
        resume_meta.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT)
        or MAX_RESUME_ATTEMPTS_DEFAULT
    )
    counters = {"attempt_count": attempt_count, "max_attempts": max_attempts}
    if attempt_count >= max_attempts:
        return {
            "eligible": False,
            "reason_code": "resume_attempt_limit_reached",
            "checkpoint": checkpoint_info.get("checkpoint"),
            "cooldown_remaining": 0,
            **counters,
        }

    current_time = _parse_iso(now_ts) if now_ts else None
    if current_time is None:
        # A missing or malformed override must not switch the cool-down off,
        # so fall back to the real clock.
        current_time = _parse_iso(now_iso())
    last_attempt = _parse_iso(resume_meta.get("last_attempt_at"))
    cooldown = INTERRUPTION_COOLDOWNS[interruption_class]
    remaining = 0
    if current_time is not None and last_attempt is not None:
        elapsed = int((current_time - last_attempt).total_seconds())
        if elapsed < cooldown:
            remaining = cooldown - elapsed
    if remaining > 0:
        return {
            "eligible": False,
            "reason_code": "resume_blocked_cooldown",
            "checkpoint": checkpoint_info.get("checkpoint"),
            "cooldown_remaining": remaining,
            **counters,
        }

    checkpoint_raw = checkpoint_info.get("checkpoint")
    checkpoint: dict[str, Any] = (
        checkpoint_raw if isinstance(checkpoint_raw, dict) else {}
    )
    next_ordinal = checkpoint.get("next_step_ordinal")
    next_idempotent = bool(checkpoint.get("next_step_idempotent", True))
    # Fail closed: a non-idempotent step needs an approval that names its
    # integer ordinal. A step whose ordinal is missing or not an int cannot
    # be approved, so it is blocked rather than waved through.
    approved_for_replay = isinstance(next_ordinal, int) and next_ordinal in approved
    if not next_idempotent and not approved_for_replay:
        return {
            "eligible": False,
            "reason_code": "resume_non_idempotent_step",
            "checkpoint": checkpoint,
            "cooldown_remaining": 0,
            **counters,
        }

    return {
        "eligible": True,
        "reason_code": "resume_allowed",
        "checkpoint": checkpoint,
        "cooldown_remaining": 0,
        **counters,
    }


def _coerce_count(value: Any, fallback: int) -> int:
    """Return ``value`` as an int, or ``fallback`` if it is falsy or not numeric."""
    try:
        return int(value or fallback)
    except (TypeError, ValueError):
        return fallback


def execute_resume(
    runtime: dict[str, Any],
    interruption_class: str,
    *,
    approved_steps: set[int] | None = None,
    actor: str = "system",
) -> dict[str, Any]:
    """Evaluate a resume request, record it, and replay the permitted steps.

    ``runtime`` is never mutated; all changes are made to a deep copy that is
    returned under the ``runtime`` key.

    Every call appends a ``resume_decision`` event to ``resume.trail`` and
    updates ``resume.last_interruption_class``, ``resume.last_attempt_at``,
    ``resume.attempt_count`` and ``resume.max_attempts``.

    When the request is eligible, steps that are not ``done`` are replayed in
    stored order, each emitting ``resume_transition`` events for
    ``in_progress`` and ``done``. Replay halts at the first non-idempotent
    step whose integer ordinal is not in ``approved_steps``; that step and
    everything after it are left untouched so a later call can report the
    approval requirement.

    Args:
        runtime: Runtime state to resume.
        interruption_class: Interruption class that triggered the resume.
        approved_steps: Ordinals of non-idempotent steps cleared for replay.
        actor: Identity recorded on every trail event.

    Returns:
        A dict with ``result`` (``"PASS"`` or ``"FAIL"``), the updated
        ``runtime``, ``reason_code``, ``checkpoint`` and ``resumed_steps``
        (integer ordinals replayed by this call). On ``FAIL`` with reason
        ``resume_attempt_limit_reached`` the runtime status becomes
        ``resume_escalated``. On ``PASS`` the status is ``completed`` when
        every step is done and ``failed`` otherwise.
    """
    approved = approved_steps or set()
    next_runtime = deepcopy(runtime)
    evaluation = evaluate_resume_eligibility(
        next_runtime,
        interruption_class,
        approved_steps=approved,
    )
    resume_meta_any = next_runtime.get("resume")
    resume_meta: dict[str, Any] = (
        resume_meta_any if isinstance(resume_meta_any, dict) else {}
    )
    trail_raw = resume_meta.get("trail")
    trail: list[dict[str, Any]] = trail_raw if isinstance(trail_raw, list) else []

    # Early rejections (missing checkpoint, unknown class, unusable status)
    # return no counters. Fall back to the persisted values so such a call
    # cannot reset the attempt budget or a custom attempt limit.
    prior_attempts = evaluation.get("attempt_count")
    if prior_attempts is None:
        prior_attempts = resume_meta.get("attempt_count")
    attempt_number = _coerce_count(prior_attempts, 0) + 1
    max_attempts = _coerce_count(
        evaluation.get("max_attempts") or resume_meta.get("max_attempts"),
        MAX_RESUME_ATTEMPTS_DEFAULT,
    )

    decision_at = now_iso()
    trail.append(
        {
            "event": "resume_decision",
            "interruption_class": interruption_class,
            "eligible": bool(evaluation.get("eligible")),
            "reason_code": evaluation.get("reason_code"),
            "cooldown_seconds_remaining": int(
                evaluation.get("cooldown_remaining", 0) or 0
            ),
            "attempt": attempt_number,
            "max_attempts": max_attempts,
            "at": decision_at,
            "actor": actor,
        }
    )

    # NOTE(review): rejected decisions also consume an attempt and restart
    # the cool-down window. Confirm against the resume policy that this is
    # intended before changing it.
    resume_meta["last_interruption_class"] = interruption_class
    resume_meta["last_attempt_at"] = decision_at
    resume_meta["attempt_count"] = attempt_number
    resume_meta["max_attempts"] = max_attempts
    resume_meta["trail"] = trail
    next_runtime["resume"] = resume_meta

    if not evaluation.get("eligible"):
        if evaluation.get("reason_code") == "resume_attempt_limit_reached":
            next_runtime["status"] = "resume_escalated"
        return {
            "result": "FAIL",
            "runtime": next_runtime,
            "reason_code": evaluation.get("reason_code"),
            "checkpoint": evaluation.get("checkpoint"),
            "resumed_steps": [],
        }

    steps = _normalize_steps(next_runtime)
    resumed_steps: list[int] = []
    for step in steps:
        if str(step.get("state") or "") == "done":
            continue
        ordinal = step.get("ordinal")
        needs_approval = not bool(step.get("idempotent", True))
        if needs_approval and not (isinstance(ordinal, int) and ordinal in approved):
            # The eligibility check only vets the first pending step. Stop
            # here instead of replaying an unapproved non-idempotent step or
            # running later steps out of order.
            break
        if isinstance(ordinal, int):
            resumed_steps.append(ordinal)
        for target_state in ("in_progress", "done"):
            step["state"] = target_state
            trail.append(
                {
                    "event": "resume_transition",
                    "step_ordinal": ordinal,
                    "to": target_state,
                    "at": now_iso(),
                    "actor": actor,
                }
            )

    all_done = all(str(step.get("state") or "") == "done" for step in steps)
    next_runtime["status"] = "completed" if all_done else "failed"
    next_runtime["finished_at"] = now_iso()
    return {
        "result": "PASS",
        "runtime": next_runtime,
        "reason_code": "resume_allowed",
        "checkpoint": evaluation.get("checkpoint"),
        "resumed_steps": resumed_steps,
    }
Loading