
Commit 08f10b4

daniel-thom and claude authored
Add checks for invalid resource definitions (#243)
* Change default execution mode to direct

  The Slurm execution mode does not work in all HPC environments.
  - In one case the sacct command submitted after job completion always failed.
  - In another case the HPC admin stated a strong preference to use direct mode in order to
    reduce the load on the Slurm servers.

* Validate job runtime does not exceed slurm scheduler walltime (#238)

  Add validation during workflow creation that checks resource_requirements runtime against
  slurm scheduler walltime. For jobs with an explicit scheduler, runtime must not exceed that
  scheduler's walltime. For jobs without an explicit scheduler, at least one scheduler must have
  a sufficient walltime, since any scheduler can pick up unassigned jobs.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Expand scheduler resource validation to memory and GPUs with interactive prompts

  Replaces the hard-error runtime-vs-walltime check with a broader validate_scheduler_resources
  that checks runtime, memory, and GPUs against slurm scheduler allocations. Resource warnings
  are now shown interactively with a y/N prompt in CLI commands (create, run, submit).
  Non-interactive contexts (MCP server, piped stdin) hard-fail with a message suggesting
  --skip-checks. Scheduler fields that are not set (mem, gres) are skipped for that dimension.
  Unassigned jobs must have at least one scheduler suitable across all dimensions simultaneously.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix Slurm compute nodes created with is_active=NULL causing false orphan detection

  The Slurm job runner's create_compute_node() was not setting is_active=true, unlike the local
  job runner in run_jobs_cmd.rs. This caused the watcher's orphan detection to miss active Slurm
  compute nodes (SQL NULL != 1), falsely marking running jobs as orphaned with return code -128.
  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix direct mode OOM-killing jobs with default resource requirements

  In srun mode, jobs with the "default" resource requirement (no explicit resource_requirements)
  skip --mem to avoid artificially constraining them. Direct mode was not doing the same: it
  enforced the default RR's 1 MiB memory limit, causing the resource monitor to SIGKILL any job
  that allocates real memory. Skip memory limit enforcement for default RRs in direct mode to
  match srun behavior. Also re-applies the job_parallelism test to mode: direct now that this
  works.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add grace period before orphan detection in torc watch

  Require 3 consecutive polls with no valid Slurm allocation before running orphan cleanup. This
  prevents a race where a Slurm job is just starting up and squeue hasn't reflected the new job
  yet, causing the watcher to falsely mark running jobs as orphaned with return code -128. With
  a 10-second poll interval, this gives ~30 seconds for new allocations to become visible.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix example walltime mismatch and remove bogus srun_termination_signal warning

  - slurm_staged_pipeline: increase work_scheduler walltime from 04:00:00 to 08:00:00 to match
    work_resources runtime PT8H (YAML, JSON5, KDL)
  - Remove false warning comparing srun_termination_signal time to sigkill_headroom_seconds;
    these are independent (the signal fires relative to the step's --time, not the allocation
    end)

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add interactive recovery

  Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
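The grace-period change described in the commit message amounts to a small consecutive-miss counter. A minimal sketch, assuming a detector polled on the watcher's interval; the type and method names are illustrative, not torc's actual code:

```rust
/// Polls with no valid Slurm allocation required before orphan cleanup runs.
/// At a 10-second poll interval this gives ~30 seconds for squeue to catch up.
const ORPHAN_GRACE_POLLS: u32 = 3;

struct OrphanDetector {
    misses: u32,
}

impl OrphanDetector {
    fn new() -> Self {
        Self { misses: 0 }
    }

    /// Returns true only after ORPHAN_GRACE_POLLS consecutive polls saw no
    /// valid allocation; any sighting of the allocation resets the counter.
    fn poll(&mut self, allocation_visible: bool) -> bool {
        if allocation_visible {
            self.misses = 0;
            return false;
        }
        self.misses += 1;
        self.misses >= ORPHAN_GRACE_POLLS
    }
}

fn main() {
    let mut d = OrphanDetector::new();
    // Job just submitted; squeue lags for two polls, then shows it.
    assert!(!d.poll(false));
    assert!(!d.poll(false));
    assert!(!d.poll(true)); // allocation appears, counter resets
    // Allocation genuinely gone: three straight misses trigger cleanup.
    assert!(!d.poll(false));
    assert!(!d.poll(false));
    assert!(d.poll(false));
    println!("orphan grace-period sketch ok");
}
```

The reset on any sighting is the important part: a single slow `squeue` response cannot accumulate toward a false orphan verdict.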
1 parent ca06651 commit 08f10b4

27 files changed: +1635 −399 lines

docs/src/core/reference/cli-cheatsheet.md (2 additions, 1 deletion)

@@ -58,7 +58,8 @@
 | `torc status <id>` | Workflow status and job summary |
 | `torc workflows check-resources <id>` | Check memory/CPU/time usage |
 | `torc results list <id> --include-logs` | Job results with log paths |
-| `torc recover <id>` | One-shot recovery (diagnose + fix + resubmit) |
+| `torc recover <id>` | Interactive recovery wizard (default) |
+| `torc recover <id> --no-prompts` | Automatic recovery (no prompts, for scripting) |
 | `torc watch <id> --recover --auto-schedule` | Full production recovery mode |
 | `torc workflows sync-status <id>` | Fix orphaned jobs (stuck in "running") |
 | `torc workflows correct-resources <id>` | Upscale violated + downsize over-allocated RRs |

docs/src/core/reference/cli.md (10 additions, 1 deletion)

@@ -279,6 +279,11 @@ Diagnoses job failures (OOM, timeout), adjusts resource requirements, and resubm
 a workflow has completed with failures. For continuous monitoring, use `torc watch --recover`
 instead.
 
+By default, runs an interactive wizard that displays failed jobs, lets you choose per-category
+actions (retry with adjusted resources or skip), select a Slurm scheduler, and confirm before
+executing. Use `--no-prompts` to skip the wizard and apply heuristics automatically. When stdin is
+not a terminal (e.g., piped or scripted), non-interactive mode is used automatically.
+
 **Usage:** `torc recover [OPTIONS] <WORKFLOW_ID>`
 
 ### Arguments
@@ -294,6 +299,7 @@ instead.
 - `--retry-unknown` — Also retry jobs with unknown failure causes
 - `--recovery-hook <RECOVERY_HOOK>` — Custom recovery script for unknown failures
 - `--dry-run` — Show what would be done without making any changes
+- `--no-prompts` — Skip interactive wizard and apply heuristics automatically
 
 ### When to Use
 
@@ -312,12 +318,15 @@ Use `torc watch --recover` instead for:
 ### Examples
 
 ```bash
-# Basic recovery
+# Interactive recovery (default)
 torc recover 123
 
 # Dry run to preview changes without modifying anything
 torc recover 123 --dry-run
 
+# Skip interactive prompts (for scripting)
+torc recover 123 --no-prompts
+
 # Custom resource multipliers
 torc recover 123 --memory-multiplier 2.0 --runtime-multiplier 1.5
 
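The TTY-sensitive behavior documented above (wizard in a terminal, automatic mode when piped) can be sketched with Rust's standard `IsTerminal` trait. The function and flag names here are assumptions for illustration, not torc's actual code:

```rust
use std::io::{stdin, IsTerminal};

/// Run the wizard only when the user did not pass --no-prompts AND stdin
/// is attached to a terminal; under a pipe, fall back to automatic mode.
fn interactive_mode(no_prompts_flag: bool) -> bool {
    !no_prompts_flag && stdin().is_terminal()
}

fn main() {
    // --no-prompts always forces non-interactive mode, TTY or not.
    assert!(!interactive_mode(true));
    // Without the flag, the answer depends on how stdin is attached:
    // true in a terminal session, false when piped or scripted.
    println!("interactive: {}", interactive_mode(false));
}
```

`IsTerminal` landed in Rust 1.70; the same gate is commonly written with `isatty(3)` in C tooling.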

docs/src/specialized/fault-tolerance/automatic-recovery.md (102 additions, 42 deletions)

@@ -92,14 +92,92 @@ This data is analyzed to determine failure causes:
 For one-shot recovery when a workflow has failed:
 
 ```bash
-# Preview what would be done (recommended first step)
+# Interactive recovery (default when running in a terminal)
+torc recover 42
+
+# Preview what would be done without making changes
 torc recover 42 --dry-run
 
-# Execute the recovery
-torc recover 42
+# Skip interactive prompts (for scripting)
+torc recover 42 --no-prompts
+```
+
+### Interactive Recovery Wizard
+
+By default, `torc recover` runs an interactive wizard that guides you through the recovery process
+step by step:
+
+1. **Diagnose failures** — Categorizes failed jobs into OOM, timeout, and unknown failures and
+   displays a summary table
+2. **Per-category decisions** — For each failure category, choose to retry with adjusted resources,
+   customize the multiplier, or skip
+3. **Scheduler selection** — Choose to auto-generate new Slurm schedulers or reuse an existing one
+   (with optional walltime override and allocation count)
+4. **Review and confirm** — Shows the full recovery plan and asks for confirmation before executing
+
+The wizard runs automatically when stdin is a terminal. When piped or scripted (non-TTY), the
+command falls back to automatic mode. Use `--no-prompts` to explicitly skip the wizard.
+
+#### Example Session
+
 ```
+=== Recovery Wizard ===
+
+Diagnosing failures for workflow 42...
+
+OOM Failures (3 jobs):
+ID   Name            RC   Memory  Peak Memory  Reason
+---  ----            ---  ------  -----------  ------
+107  train_model_7   137  8g      10.2 GB      sigkill_137
+112  train_model_12  137  8g      9.8 GB       memory_exceeded
+123  train_model_23  137  8g      11.1 GB      sigkill_137
+
+Timeout Failures (1 job):
+ID   Name         RC   Runtime  Exec (min)  Reason
+---  ----         ---  -------  ----------  ------
+145  postprocess  152  PT30M    29.8        sigxcpu_152
+
+OOM failures (3 jobs): [R]etry with 1.5x memory / [A]djust multiplier / [S]kip (default: R): r
+Timeout failures (1 job): [R]etry with 1.4x runtime / [A]djust multiplier / [S]kip (default: R): a
+Enter runtime multiplier [default: 1.4]: 2.0
+
+--- Recovery Plan ---
+
+Memory: 8g -> 12g (1.5x) for 3 jobs: train_model_7, train_model_12, train_model_23
+Runtime: PT30M -> PT1H (2x) for 1 job: postprocess
+
+Total: 4 jobs to retry
+
+--- Slurm Scheduler ---
 
-This command:
+Existing schedulers for this workflow:
+
+ID  Name           Account    Partition  Walltime  Nodes
+--- ----           -------    ---------  --------  -----
+5   gpu_scheduler  myproject  gpu        04:00:00  1
+
+Scheduler: [A]uto-generate new / [E]xisting (enter ID) (default: A): e
+Enter scheduler ID: 5
+Walltime [default: 04:00:00] (press Enter to keep): 06:00:00
+Creating new scheduler with walltime 06:00:00...
+Created scheduler 'gpu_scheduler_recovery' (ID 8) with walltime 06:00:00
+Number of allocations [default: 1]: 2
+
+Scheduler: gpu_scheduler_recovery (ID 8), 2 allocation(s)
+
+Proceed with recovery? (y/N): y
+```
+
+### Non-Interactive Mode
+
+Use `--no-prompts` to skip the wizard and apply recovery heuristics automatically. This is useful
+for scripting or when you want the default behavior without interaction:
+
+```bash
+torc recover 42 --no-prompts
+```
+
+In non-interactive mode, the command:
 
 1. Detects and cleans up orphaned jobs from terminated Slurm allocations
 2. Checks that the workflow is complete and no workers are active
@@ -109,9 +187,8 @@
 6. Resets failed jobs and regenerates Slurm schedulers
 7. Submits new allocations
 
-> **Note:** Step 1 (orphan cleanup) handles the case where Slurm terminated an allocation
-> unexpectedly, leaving jobs stuck in "running" status. This is done automatically before checking
-> preconditions.
+> **Note:** Orphan cleanup handles the case where Slurm terminated an allocation unexpectedly,
+> leaving jobs stuck in "running" status. This is done automatically before checking preconditions.
 
 ### Options
 
@@ -121,25 +198,8 @@ torc recover <workflow_id> \
   --runtime-multiplier 1.4 \       # Runtime increase factor for timeout (default: 1.4)
   --retry-unknown \                # Also retry jobs with unknown failure causes
   --recovery-hook "bash fix.sh" \  # Custom script for unknown failures
-  --dry-run                        # Preview without making changes
-```
-
-### Example Output
-
-```
-Diagnosing failures...
-Applying recovery heuristics...
-Job 107 (train_model): OOM detected, increasing memory 8g -> 12g
-Applied fixes: 1 OOM, 0 timeout
-Resetting 1 job(s) for retry...
-Reset 1 job(s)
-Reinitializing workflow...
-Regenerating Slurm schedulers...
-Submitted Slurm allocation with 1 job
-
-Recovery complete for workflow 42
-- 1 job(s) had memory increased
-Reset 1 job(s). Slurm schedulers regenerated and submitted.
+  --dry-run \                      # Preview without making changes
+  --no-prompts                     # Skip interactive wizard
 ```
 
 ## The `torc watch --recover` Command
@@ -263,13 +323,13 @@ With default settings:
 
 ## Choosing the Right Command
 
-| Use Case                          | Command                  |
-| --------------------------------- | ------------------------ |
-| One-shot recovery after failure   | `torc recover`           |
-| Continuous monitoring             | `torc watch -r`          |
-| Preview what recovery would do    | `torc recover --dry-run` |
-| Production long-running workflows | `torc watch -r`          |
-| Manual investigation, then retry  | `torc recover`           |
+| Use Case                           | Command                     |
+| ---------------------------------- | --------------------------- |
+| Interactive recovery after failure | `torc recover`              |
+| Automatic recovery (scripting)     | `torc recover --no-prompts` |
+| Continuous monitoring              | `torc watch -r`             |
+| Preview what recovery would do     | `torc recover --dry-run`    |
+| Production long-running workflows  | `torc watch -r`             |
 
 ## Complete Workflow Example
 
@@ -530,16 +590,16 @@ If jobs are requesting more resources than partitions allow:
 2. Use smaller multipliers
 3. Consider splitting jobs into smaller pieces
 
-## Comparison: Automatic vs Manual Recovery
+## Comparison: Interactive vs Automatic vs AI-Assisted Recovery
 
-| Feature                | Automatic            | Manual/AI-Assisted      |
-| ---------------------- | -------------------- | ----------------------- |
-| Human involvement      | None                 | Interactive             |
-| Speed                  | Fast                 | Depends on human        |
-| Handles OOM/timeout    | Yes                  | Yes                     |
-| Handles unknown errors | Retry only           | Full investigation      |
-| Cost optimization      | Basic                | Can be sophisticated    |
-| Use case               | Production workflows | Debugging, optimization |
+| Feature                | Interactive (`torc recover`) | Automatic (`--no-prompts`) | AI-Assisted   |
+| ---------------------- | ---------------------------- | -------------------------- | ------------- |
+| Human involvement      | Guided wizard                | None                       | AI + human    |
+| Speed                  | Minutes                      | Fast                       | Varies        |
+| Handles OOM/timeout    | Yes                          | Yes                        | Yes           |
+| Handles unknown errors | User chooses                 | Retry only                 | Investigation |
+| Scheduler control      | Choose or auto-generate      | Auto-generate              | Manual        |
+| Use case               | Most recovery scenarios      | Scripting, `torc watch`    | Complex bugs  |
 
 ## Implementation Details
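The `8g -> 12g (1.5x)` plan lines in the wizard session above come from applying a multiplier to the failed jobs' memory requests. A minimal sketch of that arithmetic, assuming the `<n>g` memory format used in the examples; this is illustrative, not torc's code:

```rust
/// Scale a memory string like "8g" by a multiplier, rounding up to whole
/// gigabytes so the retried job never gets less than the scaled amount.
fn scale_mem_gb(mem: &str, multiplier: f64) -> Option<String> {
    // Accept only the "<n>g" form used in the docs above.
    let gb: f64 = mem.strip_suffix('g')?.parse().ok()?;
    Some(format!("{}g", (gb * multiplier).ceil() as u64))
}

fn main() {
    // Matches the wizard's "Memory: 8g -> 12g (1.5x)" plan line.
    assert_eq!(scale_mem_gb("8g", 1.5).as_deref(), Some("12g"));
    // Fractional results round up: 8 * 1.4 = 11.2 -> 12g.
    assert_eq!(scale_mem_gb("8g", 1.4).as_deref(), Some("12g"));
    // Unrecognized formats yield None rather than a bogus value.
    assert_eq!(scale_mem_gb("bad", 1.5), None);
    println!("multiplier sketch ok");
}
```

Rounding up rather than to nearest is the safer default for OOM recovery: a truncated value could leave the retry just under the observed peak.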

examples/json/slurm_staged_pipeline.json5 (1 addition, 1 deletion)

@@ -336,7 +336,7 @@
 {
   "name": "work_scheduler",
   "account": "my_account",
-  "walltime": "04:00:00",
+  "walltime": "08:00:00",
   "nodes": 1
 },
 {

examples/kdl/slurm_staged_pipeline.kdl (1 addition, 1 deletion)

@@ -14,7 +14,7 @@ slurm_scheduler "setup_scheduler" {
 
 slurm_scheduler "work_scheduler" {
   account "my_account"
-  walltime "04:00:00"
+  walltime "08:00:00"
   nodes 1
 }
 
examples/yaml/slurm_staged_pipeline.yaml (1 addition, 1 deletion)

@@ -16,7 +16,7 @@ slurm_schedulers:
   nodes: 1
 - name: "work_scheduler"
   account: "my_account"
-  walltime: "04:00:00"
+  walltime: "08:00:00"
   nodes: 1
 - name: "postprocess_scheduler"
   account: "my_account"
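The walltime bump in these example files exists because the new validation compares each job's runtime (ISO-8601, e.g. `PT8H`) against the scheduler's `HH:MM:SS` walltime. A minimal sketch of such a check, with hand-rolled parsing that covers only the simple forms shown here; it is not torc's actual implementation:

```rust
/// Parse "HH:MM:SS" into seconds.
fn walltime_secs(w: &str) -> Option<u64> {
    let parts: Vec<u64> = w.split(':').map(|p| p.parse().ok()).collect::<Option<_>>()?;
    match parts.as_slice() {
        [h, m, s] => Some(h * 3600 + m * 60 + s),
        _ => None,
    }
}

/// Parse a simple ISO-8601 duration like "PT8H" or "PT30M" into seconds.
fn runtime_secs(r: &str) -> Option<u64> {
    let body = r.strip_prefix("PT")?;
    if let Some(h) = body.strip_suffix('H') {
        h.parse::<u64>().ok().map(|h| h * 3600)
    } else if let Some(m) = body.strip_suffix('M') {
        m.parse::<u64>().ok().map(|m| m * 60)
    } else {
        None
    }
}

/// A job fits a scheduler when its runtime does not exceed the walltime;
/// unparseable inputs conservatively fail the check.
fn fits(runtime: &str, walltime: &str) -> bool {
    match (runtime_secs(runtime), walltime_secs(walltime)) {
        (Some(r), Some(w)) => r <= w,
        _ => false,
    }
}

fn main() {
    // PT8H did not fit the old 04:00:00 walltime, but fits 08:00:00.
    assert!(!fits("PT8H", "04:00:00"));
    assert!(fits("PT8H", "08:00:00"));
    println!("walltime check sketch ok");
}
```

A production version would use a real duration parser (combined `PT1H30M` forms, Slurm's `days-hours` walltime syntax), but the comparison itself reduces to the same seconds arithmetic.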

slurm-tests/workflows/cancel_workflow.yaml (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ name: cancel_workflow
 description: Workflow cancellation test — cancel while jobs are running
 project: slurm-tests
 execution_config:
-  mode: slurm
+  mode: direct
 
 resource_requirements:
 - name: sleep_resources

slurm-tests/workflows/failure_recovery.yaml (1 addition, 1 deletion)

@@ -10,7 +10,7 @@ description: Test workflow for Slurm job retry with failure handlers
 project: slurm-tests
 metadata: '{"test_type": "failure_recovery", "stages": 3}'
 execution_config:
-  mode: slurm
+  mode: direct
 
 failure_handlers:
 - name: retry_on_exit_42

slurm-tests/workflows/job_parallelism.yaml (4 additions, 4 deletions)

@@ -1,8 +1,8 @@
 # Test: Job-Based Parallelism
 #
-# 1-node allocation with NO resource_requirements on jobs.
-# Jobs get the auto-assigned "default" RR, so srun skips resource limit flags
-# and each job can use the full allocation's resources.
+# 1-node allocation with NO resource_requirements on jobs (direct mode).
+# Jobs get the auto-assigned "default" RR and run directly (no srun wrapper),
+# so each job can use the full allocation's resources.
 #
 # Concurrency is controlled by --max-parallel-jobs (passed via torc submit).
 # The test submits with --max-parallel-jobs 2, so 2 of the 4 jobs run at a time.
@@ -15,7 +15,7 @@ name: job_parallelism
 description: Job-based parallelism — no resource requirements, controlled by --max-parallel-jobs
 project: slurm-tests
 execution_config:
-  mode: slurm
+  mode: direct
 
 resource_monitor:
   enabled: true

slurm-tests/workflows/resource_monitoring.yaml (1 addition, 1 deletion)

@@ -8,7 +8,7 @@ name: resource_monitoring
 description: Resource monitoring validation — CPU and memory usage captured
 project: slurm-tests
 execution_config:
-  mode: slurm
+  mode: direct
 
 resource_monitor:
   enabled: true
