|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from copy import deepcopy |
| 6 | +from datetime import UTC, datetime |
| 7 | +from typing import Any |
| 8 | + |
| 9 | + |
| 10 | +INTERRUPTION_COOLDOWNS = { |
| 11 | + "tool_failure": 30, |
| 12 | + "timeout": 120, |
| 13 | + "context_reset": 10, |
| 14 | + "process_crash": 60, |
| 15 | +} |
| 16 | +MAX_RESUME_ATTEMPTS_DEFAULT = 3 |
| 17 | + |
| 18 | + |
| 19 | +def now_iso() -> str: |
| 20 | + return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z") |
| 21 | + |
| 22 | + |
| 23 | +def _parse_iso(value: Any) -> datetime | None: |
| 24 | + if not isinstance(value, str) or not value.strip(): |
| 25 | + return None |
| 26 | + try: |
| 27 | + return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(UTC) |
| 28 | + except ValueError: |
| 29 | + return None |
| 30 | + |
| 31 | + |
| 32 | +def _normalize_steps(runtime: dict[str, Any]) -> list[dict[str, Any]]: |
| 33 | + raw_steps = runtime.get("steps") |
| 34 | + if not isinstance(raw_steps, list): |
| 35 | + return [] |
| 36 | + steps: list[dict[str, Any]] = [] |
| 37 | + for step in raw_steps: |
| 38 | + if isinstance(step, dict): |
| 39 | + steps.append(step) |
| 40 | + return steps |
| 41 | + |
| 42 | + |
| 43 | +def load_last_safe_checkpoint(runtime: dict[str, Any]) -> dict[str, Any]: |
| 44 | + steps = _normalize_steps(runtime) |
| 45 | + if not steps: |
| 46 | + return { |
| 47 | + "available": False, |
| 48 | + "reason_code": "resume_missing_checkpoint", |
| 49 | + "checkpoint": None, |
| 50 | + } |
| 51 | + pending_steps = [ |
| 52 | + step |
| 53 | + for step in steps |
| 54 | + if str(step.get("state") or "") in {"pending", "in_progress"} |
| 55 | + ] |
| 56 | + if not pending_steps: |
| 57 | + return { |
| 58 | + "available": True, |
| 59 | + "reason_code": "resume_allowed", |
| 60 | + "checkpoint": { |
| 61 | + "status": "completed", |
| 62 | + "next_step_ordinal": None, |
| 63 | + "next_step_idempotent": True, |
| 64 | + }, |
| 65 | + } |
| 66 | + next_step = pending_steps[0] |
| 67 | + return { |
| 68 | + "available": True, |
| 69 | + "reason_code": "resume_allowed", |
| 70 | + "checkpoint": { |
| 71 | + "status": str(runtime.get("status") or "unknown"), |
| 72 | + "next_step_ordinal": next_step.get("ordinal"), |
| 73 | + "next_step_idempotent": bool(next_step.get("idempotent", True)), |
| 74 | + }, |
| 75 | + } |
| 76 | + |
| 77 | + |
| 78 | +def evaluate_resume_eligibility( |
| 79 | + runtime: dict[str, Any], |
| 80 | + interruption_class: str, |
| 81 | + *, |
| 82 | + approved_steps: set[int] | None = None, |
| 83 | + now_ts: str | None = None, |
| 84 | +) -> dict[str, Any]: |
| 85 | + approved = approved_steps or set() |
| 86 | + checkpoint_info = load_last_safe_checkpoint(runtime) |
| 87 | + if not checkpoint_info.get("available"): |
| 88 | + return { |
| 89 | + "eligible": False, |
| 90 | + "reason_code": "resume_missing_checkpoint", |
| 91 | + "checkpoint": None, |
| 92 | + "cooldown_remaining": 0, |
| 93 | + } |
| 94 | + |
| 95 | + if interruption_class not in INTERRUPTION_COOLDOWNS: |
| 96 | + return { |
| 97 | + "eligible": False, |
| 98 | + "reason_code": "resume_unknown_interruption_class", |
| 99 | + "checkpoint": checkpoint_info.get("checkpoint"), |
| 100 | + "cooldown_remaining": 0, |
| 101 | + } |
| 102 | + |
| 103 | + status = str(runtime.get("status") or "") |
| 104 | + if status not in {"failed", "in_progress", "completed"}: |
| 105 | + return { |
| 106 | + "eligible": False, |
| 107 | + "reason_code": "resume_missing_runtime_artifacts", |
| 108 | + "checkpoint": checkpoint_info.get("checkpoint"), |
| 109 | + "cooldown_remaining": 0, |
| 110 | + } |
| 111 | + |
| 112 | + resume_meta_any = runtime.get("resume") |
| 113 | + resume_meta: dict[str, Any] = ( |
| 114 | + resume_meta_any if isinstance(resume_meta_any, dict) else {} |
| 115 | + ) |
| 116 | + attempt_count = int(resume_meta.get("attempt_count", 0) or 0) |
| 117 | + max_attempts = int( |
| 118 | + resume_meta.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT) |
| 119 | + or MAX_RESUME_ATTEMPTS_DEFAULT |
| 120 | + ) |
| 121 | + if attempt_count >= max_attempts: |
| 122 | + return { |
| 123 | + "eligible": False, |
| 124 | + "reason_code": "resume_attempt_limit_reached", |
| 125 | + "checkpoint": checkpoint_info.get("checkpoint"), |
| 126 | + "cooldown_remaining": 0, |
| 127 | + "attempt_count": attempt_count, |
| 128 | + "max_attempts": max_attempts, |
| 129 | + } |
| 130 | + |
| 131 | + current_time = _parse_iso(now_ts or now_iso()) |
| 132 | + last_attempt = _parse_iso(resume_meta.get("last_attempt_at")) |
| 133 | + cooldown = INTERRUPTION_COOLDOWNS[interruption_class] |
| 134 | + remaining = 0 |
| 135 | + if current_time is not None and last_attempt is not None: |
| 136 | + elapsed = int((current_time - last_attempt).total_seconds()) |
| 137 | + if elapsed < cooldown: |
| 138 | + remaining = cooldown - elapsed |
| 139 | + if remaining > 0: |
| 140 | + return { |
| 141 | + "eligible": False, |
| 142 | + "reason_code": "resume_blocked_cooldown", |
| 143 | + "checkpoint": checkpoint_info.get("checkpoint"), |
| 144 | + "cooldown_remaining": remaining, |
| 145 | + "attempt_count": attempt_count, |
| 146 | + "max_attempts": max_attempts, |
| 147 | + } |
| 148 | + |
| 149 | + checkpoint_raw = checkpoint_info.get("checkpoint") |
| 150 | + checkpoint: dict[str, Any] = ( |
| 151 | + checkpoint_raw if isinstance(checkpoint_raw, dict) else {} |
| 152 | + ) |
| 153 | + next_ordinal = checkpoint.get("next_step_ordinal") |
| 154 | + next_idempotent = bool(checkpoint.get("next_step_idempotent", True)) |
| 155 | + if ( |
| 156 | + isinstance(next_ordinal, int) |
| 157 | + and (not next_idempotent) |
| 158 | + and next_ordinal not in approved |
| 159 | + ): |
| 160 | + return { |
| 161 | + "eligible": False, |
| 162 | + "reason_code": "resume_non_idempotent_step", |
| 163 | + "checkpoint": checkpoint, |
| 164 | + "cooldown_remaining": 0, |
| 165 | + "attempt_count": attempt_count, |
| 166 | + "max_attempts": max_attempts, |
| 167 | + } |
| 168 | + |
| 169 | + return { |
| 170 | + "eligible": True, |
| 171 | + "reason_code": "resume_allowed", |
| 172 | + "checkpoint": checkpoint, |
| 173 | + "cooldown_remaining": 0, |
| 174 | + "attempt_count": attempt_count, |
| 175 | + "max_attempts": max_attempts, |
| 176 | + } |
| 177 | + |
| 178 | + |
| 179 | +def execute_resume( |
| 180 | + runtime: dict[str, Any], |
| 181 | + interruption_class: str, |
| 182 | + *, |
| 183 | + approved_steps: set[int] | None = None, |
| 184 | + actor: str = "system", |
| 185 | +) -> dict[str, Any]: |
| 186 | + approved = approved_steps or set() |
| 187 | + next_runtime = deepcopy(runtime) |
| 188 | + evaluation = evaluate_resume_eligibility( |
| 189 | + next_runtime, |
| 190 | + interruption_class, |
| 191 | + approved_steps=approved, |
| 192 | + ) |
| 193 | + resume_meta_any = next_runtime.get("resume") |
| 194 | + resume_meta: dict[str, Any] = ( |
| 195 | + resume_meta_any if isinstance(resume_meta_any, dict) else {} |
| 196 | + ) |
| 197 | + trail_raw = resume_meta.get("trail") |
| 198 | + trail: list[dict[str, Any]] = trail_raw if isinstance(trail_raw, list) else [] |
| 199 | + decision_at = now_iso() |
| 200 | + decision = { |
| 201 | + "event": "resume_decision", |
| 202 | + "interruption_class": interruption_class, |
| 203 | + "eligible": bool(evaluation.get("eligible")), |
| 204 | + "reason_code": evaluation.get("reason_code"), |
| 205 | + "cooldown_seconds_remaining": int(evaluation.get("cooldown_remaining", 0) or 0), |
| 206 | + "attempt": int(evaluation.get("attempt_count", 0) or 0) + 1, |
| 207 | + "max_attempts": int( |
| 208 | + evaluation.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT) |
| 209 | + or MAX_RESUME_ATTEMPTS_DEFAULT |
| 210 | + ), |
| 211 | + "at": decision_at, |
| 212 | + "actor": actor, |
| 213 | + } |
| 214 | + trail.append(decision) |
| 215 | + |
| 216 | + resume_meta["last_interruption_class"] = interruption_class |
| 217 | + resume_meta["last_attempt_at"] = decision_at |
| 218 | + resume_meta["attempt_count"] = int(evaluation.get("attempt_count", 0) or 0) + 1 |
| 219 | + resume_meta["max_attempts"] = int( |
| 220 | + evaluation.get("max_attempts", MAX_RESUME_ATTEMPTS_DEFAULT) |
| 221 | + or MAX_RESUME_ATTEMPTS_DEFAULT |
| 222 | + ) |
| 223 | + resume_meta["trail"] = trail |
| 224 | + next_runtime["resume"] = resume_meta |
| 225 | + |
| 226 | + if not evaluation.get("eligible"): |
| 227 | + if evaluation.get("reason_code") == "resume_attempt_limit_reached": |
| 228 | + next_runtime["status"] = "resume_escalated" |
| 229 | + return { |
| 230 | + "result": "FAIL", |
| 231 | + "runtime": next_runtime, |
| 232 | + "reason_code": evaluation.get("reason_code"), |
| 233 | + "checkpoint": evaluation.get("checkpoint"), |
| 234 | + "resumed_steps": [], |
| 235 | + } |
| 236 | + |
| 237 | + resumed_steps: list[int] = [] |
| 238 | + for step in _normalize_steps(next_runtime): |
| 239 | + state = str(step.get("state") or "") |
| 240 | + if state == "done": |
| 241 | + continue |
| 242 | + ordinal = step.get("ordinal") |
| 243 | + if isinstance(ordinal, int): |
| 244 | + resumed_steps.append(ordinal) |
| 245 | + step["state"] = "in_progress" |
| 246 | + trail.append( |
| 247 | + { |
| 248 | + "event": "resume_transition", |
| 249 | + "step_ordinal": ordinal, |
| 250 | + "to": "in_progress", |
| 251 | + "at": now_iso(), |
| 252 | + "actor": actor, |
| 253 | + } |
| 254 | + ) |
| 255 | + step["state"] = "done" |
| 256 | + trail.append( |
| 257 | + { |
| 258 | + "event": "resume_transition", |
| 259 | + "step_ordinal": ordinal, |
| 260 | + "to": "done", |
| 261 | + "at": now_iso(), |
| 262 | + "actor": actor, |
| 263 | + } |
| 264 | + ) |
| 265 | + |
| 266 | + next_runtime["resume"] = resume_meta |
| 267 | + all_done = all( |
| 268 | + str(step.get("state") or "") == "done" |
| 269 | + for step in _normalize_steps(next_runtime) |
| 270 | + ) |
| 271 | + next_runtime["status"] = "completed" if all_done else "failed" |
| 272 | + next_runtime["finished_at"] = now_iso() |
| 273 | + return { |
| 274 | + "result": "PASS", |
| 275 | + "runtime": next_runtime, |
| 276 | + "reason_code": "resume_allowed", |
| 277 | + "checkpoint": evaluation.get("checkpoint"), |
| 278 | + "resumed_steps": resumed_steps, |
| 279 | + } |
0 commit comments