Commit cd2d1bb

daniel-thom and claude authored
Enhance the watcher (#222)
* Support walltime and partition inputs to the watcher

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a2dcd1a commit cd2d1bb

File tree: 10 files changed, +744 −120 lines

docs/src/core/reference/cli.md

Lines changed: 20 additions & 4 deletions
````diff
@@ -180,7 +180,7 @@ resource requirements, and resubmits jobs.
 ```
 
 Automatically diagnoses OOM/timeout failures, adjusts resources, and retries. Runs until all jobs
-complete or max retries exceeded.
+complete. Use `--max-retries` to limit recovery attempts.
 
 3. **With auto-scheduling** (`--auto-schedule`):
 
@@ -207,7 +207,7 @@ resource requirements, and resubmits jobs.
 **Recovery:**
 
 - `-r`, `--recover` — Enable automatic failure recovery
-- `-m`, `--max-retries <MAX_RETRIES>` — Maximum number of recovery attempts. Default: `3`
+- `-m`, `--max-retries <MAX_RETRIES>` — Maximum number of recovery attempts. Default: unlimited
 - `--memory-multiplier <MEMORY_MULTIPLIER>` — Memory multiplier for OOM failures. Default: `1.5`
 - `--runtime-multiplier <RUNTIME_MULTIPLIER>` — Runtime multiplier for timeout failures. Default:
 `1.5`
@@ -225,6 +225,14 @@ resource requirements, and resubmits jobs.
 - `--auto-schedule-stranded-timeout <SECONDS>` — Schedule stranded jobs after this timeout even if
 below threshold. Default: `7200` (2 hrs). Set to `0` to disable.
 
+**Scheduler overrides:**
+
+- `--partition <PARTITION>` — Fixed Slurm partition for regenerated schedulers. Bypasses automatic
+partition selection. Node count is still calculated dynamically.
+- `--walltime <WALLTIME>` — Fixed Slurm walltime for regenerated schedulers (format: `HH:MM:SS` or
+`D-HH:MM:SS`). Bypasses automatic walltime calculation. Node count is still calculated
+dynamically.
+
 ### Auto-Scheduling Behavior
 
 When `--auto-schedule` is enabled:
@@ -261,6 +269,10 @@ torc watch 123 --auto-schedule \
 --auto-schedule-threshold 10 \
 --auto-schedule-cooldown 3600 \
 --auto-schedule-stranded-timeout 14400
+
+# Fixed partition and walltime (dynamic node count only)
+# Useful for long-running checkpointable jobs
+torc watch 123 --auto-schedule --partition standard --walltime 04:00:00
 ```
 
 ### See Also
@@ -1701,6 +1713,10 @@ regenerate schedulers to submit new allocations.
 
 - `--account <ACCOUNT>` — Slurm account to use (defaults to account from existing schedulers)
 - `--profile <PROFILE>` — HPC profile to use (if not specified, tries to detect current system)
+- `--partition <PARTITION>` — Fixed Slurm partition (bypasses automatic partition selection). Node
+count is still calculated dynamically.
+- `--walltime <WALLTIME>` — Fixed Slurm walltime (format: `HH:MM:SS` or `D-HH:MM:SS`). Bypasses
+automatic walltime calculation. Node count is still calculated dynamically.
 - `--single-allocation` — Bundle all nodes into a single Slurm allocation per scheduler
 - `--submit` — Submit the generated allocations immediately
 - `-o`, `--output-dir <OUTPUT_DIR>` — Output directory for job output files (used when submitting).
@@ -1710,9 +1726,9 @@ regenerate schedulers to submit new allocations.
 - `--group-by <GROUP_BY>` — Strategy for grouping jobs into schedulers. Possible values:
 `resource-requirements` (default), `partition`
 - `--walltime-strategy <STRATEGY>` — Strategy for determining Slurm job walltime. Possible values:
-`max-job-runtime` (default), `max-partition-time`
+`max-job-runtime` (default), `max-partition-time`. Ignored when `--walltime` is set.
 - `--walltime-multiplier <MULTIPLIER>` — Multiplier for job runtime when using
-`--walltime-strategy=max-job-runtime`. Default: `1.5`
+`--walltime-strategy=max-job-runtime`. Default: `1.5`. Ignored when `--walltime` is set.
 - `--dry-run` — Show what would be created without making changes
 - `--include-job-ids <JOB_IDS>` — Include specific job IDs in planning regardless of their status
 (useful for recovery dry-run to include failed jobs)
````

docs/src/specialized/fault-tolerance/automatic-recovery.md

Lines changed: 17 additions & 12 deletions
````diff
@@ -157,14 +157,14 @@ This will:
 3. Adjust resource requirements based on heuristics
 4. Reset failed jobs and submit new Slurm allocations
 5. Resume monitoring
-6. Repeat until success or max retries exceeded
+6. Repeat until success (or max retries exceeded, if `--max-retries` is set)
 
 ### Options
 
 ```bash
 torc watch <workflow_id> \
 -r \ # Enable automatic recovery (--recover)
--m 3 \ # Maximum recovery attempts (--max-retries)
+-m 5 \ # Optional: limit recovery attempts (--max-retries)
 --memory-multiplier 1.5 \ # Memory increase factor for OOM
 --runtime-multiplier 1.5 \ # Runtime increase factor for timeout
 --retry-unknown \ # Also retry jobs with unknown failures
@@ -175,7 +175,9 @@ torc watch <workflow_id> \
 --auto-schedule \ # Automatically schedule nodes for stranded jobs
 --auto-schedule-threshold 5 \ # Min retry jobs before scheduling (default: 5)
 --auto-schedule-cooldown 1800 \ # Seconds between auto-schedule attempts (default: 1800)
---auto-schedule-stranded-timeout 7200 # Schedule stranded jobs after this time (default: 7200)
+--auto-schedule-stranded-timeout 7200 \ # Schedule stranded jobs after this time (default: 7200)
+--partition standard \ # Fixed Slurm partition (bypass auto-detection)
+--walltime 04:00:00 # Fixed walltime (bypass auto-calculation)
 ```
 
 ### Custom Recovery Hooks
@@ -287,7 +289,7 @@ Submitted to Slurm with 10 allocations
 ### 2. Start Watching with Auto-Recovery
 
 ```bash
-torc watch 42 --recover --max-retries 3 --show-job-counts
+torc watch 42 --recover --show-job-counts
 ```
 
 > **Note:** The `--show-job-counts` flag is optional. Without it, the command polls silently until
@@ -296,7 +298,7 @@ torc watch 42 --recover --max-retries 3 --show-job-counts
 Output:
 
 ```
-Watching workflow 42 (poll interval: 60s, recover enabled, max retries: 3, job counts enabled)
+Watching workflow 42 (poll interval: 60s, recover enabled, unlimited retries, job counts enabled)
 completed=0, running=10, pending=0, failed=0, blocked=90
 completed=25, running=10, pending=0, failed=0, blocked=65
 ...
@@ -309,7 +311,7 @@ Workflow completed with failures:
 - Terminated: 0
 - Completed: 95
 
-Attempting automatic recovery (attempt 1/3)
+Attempting automatic recovery (attempt 1)
 
 Diagnosing failures...
 Applying recovery heuristics...
@@ -325,7 +327,7 @@ Regenerating Slurm schedulers and submitting...
 
 Recovery initiated. Resuming monitoring...
 
-Watching workflow 42 (poll interval: 60s, recover enabled, max retries: 3, job counts enabled)
+Watching workflow 42 (poll interval: 60s, recover enabled, unlimited retries, job counts enabled)
 completed=95, running=5, pending=0, failed=0, blocked=0
 ...
 Workflow 42 is complete
@@ -350,7 +352,7 @@ This prevents wasting allocation time on jobs that likely have script or data bugs.
 
 ### 4. If Max Retries Exceeded
 
-If failures persist after max retries:
+If `--max-retries` is set and failures persist after that many attempts:
 
 ```
 Max retries (3) exceeded. Manual intervention required.
@@ -417,13 +419,16 @@ Set initial resource requests lower and let auto-recovery increase them:
 - Only failing jobs get increased resources
 - Avoids wasting HPC resources on over-provisioned jobs
 
-### 2. Set Reasonable Max Retries
+### 2. Set Max Retries When Appropriate
+
+By default, `torc watch` retries indefinitely until the workflow succeeds. Use `--max-retries` to
+limit recovery attempts if needed:
 
 ```bash
---max-retries 3 # Good for most workflows
+--max-retries 5 # Limit to 5 recovery attempts
 ```
 
-Too many retries can waste allocation time on jobs that will never succeed.
+This can prevent wasting allocation time on jobs that will never succeed.
 
 ### 3. Use Appropriate Multipliers
 
@@ -551,7 +556,7 @@ If jobs are requesting more resources than partitions allow:
 - Run `torc slurm regenerate --submit`
 - Increment retry counter
 - Resume polling
-5. Exit 0 on success, exit 1 on max retries exceeded
+5. Exit 0 on success, exit 1 on max retries exceeded (if `--max-retries` is set)
 
 ### The Regenerate Command Flow
 
````
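The recovery options documented above describe a simple heuristic: an OOM failure scales the job's memory request by `--memory-multiplier`, and a timeout scales its runtime by `--runtime-multiplier`. A minimal standalone sketch of that adjustment (the type and function names here are hypothetical, not torc's actual internals):

```rust
// Hypothetical sketch of the recovery heuristic described in the docs.
// `FailureKind`, `Resources`, and `adjust` are illustrative names only.
#[derive(Debug, PartialEq)]
enum FailureKind {
    OutOfMemory,
    Timeout,
}

struct Resources {
    memory_gb: f64,
    runtime_secs: f64,
}

fn adjust(res: &Resources, failure: FailureKind, mem_mult: f64, rt_mult: f64) -> Resources {
    match failure {
        // OOM: request more memory on the next attempt; runtime unchanged.
        FailureKind::OutOfMemory => Resources {
            memory_gb: res.memory_gb * mem_mult,
            runtime_secs: res.runtime_secs,
        },
        // Timeout: request more runtime on the next attempt; memory unchanged.
        FailureKind::Timeout => Resources {
            memory_gb: res.memory_gb,
            runtime_secs: res.runtime_secs * rt_mult,
        },
    }
}

fn main() {
    let r = Resources { memory_gb: 8.0, runtime_secs: 3600.0 };
    let oom = adjust(&r, FailureKind::OutOfMemory, 1.5, 1.5);
    println!("after OOM: {} GB, {} s", oom.memory_gb, oom.runtime_secs);
}
```

With the default multipliers of `1.5`, each recovery pass grows the failing dimension by 50% while leaving the other untouched, which matches the "only failing jobs get increased resources" best practice above.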

src/cli.rs

Lines changed: 20 additions & 3 deletions
````diff
@@ -400,9 +400,9 @@ SEE ALSO:
     #[arg(short, long)]
     recover: bool,
 
-    /// Maximum number of recovery attempts
-    #[arg(short, long, default_value = "3")]
-    max_retries: u32,
+    /// Maximum number of recovery attempts (unlimited if not set)
+    #[arg(short, long)]
+    max_retries: Option<u32>,
 
     /// Memory multiplier for OOM failures (default: 1.5 = 50% increase)
     #[arg(long, default_value = "1.5")]
@@ -511,6 +511,23 @@ SEE ALSO:
     /// claude - Claude Code CLI (default)
     #[arg(long, default_value = "claude", verbatim_doc_comment)]
     ai_agent: String,
+
+    /// Fixed Slurm partition for regenerated schedulers
+    ///
+    /// When set, all regenerated schedulers (from --auto-schedule or --recover)
+    /// use this partition instead of auto-detecting the best partition from job
+    /// resource requirements. The number of compute nodes is still calculated
+    /// dynamically based on pending jobs.
+    #[arg(long)]
+    partition: Option<String>,
+
+    /// Fixed Slurm walltime for regenerated schedulers (format: HH:MM:SS or D-HH:MM:SS)
+    ///
+    /// When set, all regenerated schedulers (from --auto-schedule or --recover)
+    /// use this walltime instead of calculating it from job runtimes. The number
+    /// of compute nodes is still calculated dynamically.
+    #[arg(long)]
+    walltime: Option<String>,
 },
 /// Recover a Slurm workflow from failures
 ///
````

src/client/commands/recover.rs

Lines changed: 24 additions & 10 deletions
````diff
@@ -369,7 +369,7 @@ pub fn recover_workflow(
 
     // Step 7: Regenerate Slurm schedulers and submit
     info!("Schedulers regenerating workflow_id={}", args.workflow_id);
-    regenerate_and_submit(args.workflow_id, &args.output_dir)?;
+    regenerate_and_submit(args.workflow_id, &args.output_dir, None, None)?;
 
     Ok(result)
 }
@@ -942,16 +942,30 @@ pub fn run_recovery_hook(workflow_id: i64, hook_command: &str) -> Result<(), String> {
 }
 
 /// Regenerate Slurm schedulers and submit allocations
-pub fn regenerate_and_submit(workflow_id: i64, output_dir: &Path) -> Result<(), String> {
+pub fn regenerate_and_submit(
+    workflow_id: i64,
+    output_dir: &Path,
+    partition: Option<&str>,
+    walltime: Option<&str>,
+) -> Result<(), String> {
+    let mut args = vec![
+        "slurm".to_string(),
+        "regenerate".to_string(),
+        workflow_id.to_string(),
+        "--submit".to_string(),
+        "-o".to_string(),
+        output_dir.to_str().unwrap_or("torc_output").to_string(),
+    ];
+    if let Some(p) = partition {
+        args.push("--partition".to_string());
+        args.push(p.to_string());
+    }
+    if let Some(w) = walltime {
+        args.push("--walltime".to_string());
+        args.push(w.to_string());
+    }
     let output = Command::new("torc")
-        .args([
-            "slurm",
-            "regenerate",
-            &workflow_id.to_string(),
-            "--submit",
-            "-o",
-            output_dir.to_str().unwrap_or("torc_output"),
-        ])
+        .args(&args)
         .output()
         .map_err(|e| format!("Failed to run slurm regenerate: {}", e))?;
````
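The updated `regenerate_and_submit` appends `--partition` and `--walltime` flags only when the corresponding `Option` is `Some`. Extracted as a pure function, the assembly logic can be sanity-checked in isolation (a sketch mirroring the diff; `build_regen_args` is an illustrative name, not part of torc):

```rust
// Sketch of regenerate_and_submit's argument assembly, pulled out as a
// pure function so the flag ordering can be checked without running torc.
fn build_regen_args(
    workflow_id: i64,
    output_dir: &str,
    partition: Option<&str>,
    walltime: Option<&str>,
) -> Vec<String> {
    let mut args = vec![
        "slurm".to_string(),
        "regenerate".to_string(),
        workflow_id.to_string(),
        "--submit".to_string(),
        "-o".to_string(),
        output_dir.to_string(),
    ];
    // Optional overrides are appended as flag/value pairs only when present.
    if let Some(p) = partition {
        args.push("--partition".to_string());
        args.push(p.to_string());
    }
    if let Some(w) = walltime {
        args.push("--walltime".to_string());
        args.push(w.to_string());
    }
    args
}

fn main() {
    let a = build_regen_args(42, "out", Some("standard"), Some("04:00:00"));
    println!("{:?}", a);
}
```

Passing `None, None` (as the existing `recover_workflow` call site does) reproduces the original fixed argument list, so current callers keep their old behavior.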

src/client/commands/slurm.rs

Lines changed: 43 additions & 2 deletions
````diff
@@ -607,6 +607,22 @@ EXAMPLES:
     #[arg(long)]
     profile: Option<String>,
 
+    /// Fixed Slurm partition (bypasses automatic partition selection)
+    ///
+    /// When set, all regenerated schedulers use this partition instead of
+    /// auto-detecting the best partition from job resource requirements.
+    /// Node count is still calculated dynamically.
+    #[arg(long)]
+    partition: Option<String>,
+
+    /// Fixed Slurm walltime (bypasses automatic walltime calculation)
+    ///
+    /// When set, all regenerated schedulers use this walltime instead of
+    /// calculating it from job runtimes. Format: HH:MM:SS or D-HH:MM:SS.
+    /// Node count is still calculated dynamically.
+    #[arg(long)]
+    walltime: Option<String>,
+
     /// Bundle all nodes into a single Slurm allocation per scheduler
     #[arg(long)]
     single_allocation: bool,
@@ -627,6 +643,8 @@ EXAMPLES:
     /// Longer walltime allows more sequential jobs per allocation, reducing
     /// the total number of allocations. However, longer walltime requests
     /// may receive lower queue priority from the scheduler.
+    ///
+    /// Ignored when --walltime is set.
     #[arg(long, value_enum, default_value_t = WalltimeStrategy::MaxJobRuntime)]
     walltime_strategy: WalltimeStrategy,
 
@@ -635,6 +653,8 @@ EXAMPLES:
     /// The maximum job runtime is multiplied by this value to provide a safety
     /// margin. For example, 1.5 means requesting 50% more time than the longest
     /// job estimate.
+    ///
+    /// Ignored when --walltime is set.
    #[arg(long, default_value = "1.5")]
     walltime_multiplier: f64,
 
@@ -732,7 +752,9 @@ pub fn generate_schedulers_for_workflow(
         spec.actions = None;
     }
 
-    use crate::client::scheduler_plan::{apply_plan_to_spec, generate_scheduler_plan};
+    use crate::client::scheduler_plan::{
+        SchedulerOverrides, apply_plan_to_spec, generate_scheduler_plan,
+    };
 
     // Save original jobs and files before expansion so we can restore them later
     let original_jobs = spec.jobs.clone();
@@ -782,6 +804,7 @@ pub fn generate_schedulers_for_workflow(
         add_actions,
         None, // No suffix for regular generation (uses "_scheduler")
         false, // Not a recovery scenario
+        &SchedulerOverrides::default(),
     );
 
     // Combine warnings
@@ -1382,6 +1405,8 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
     workflow_id,
     account,
     profile: profile_name,
+    partition,
+    walltime,
     single_allocation,
     group_by,
     walltime_strategy,
@@ -1402,6 +1427,8 @@ pub fn handle_slurm_commands(config: &Configuration, command: &SlurmCommands, fo
     *workflow_id,
     account.as_deref(),
     profile_name.as_deref(),
+    partition.as_deref(),
+    walltime.as_deref(),
     *single_allocation,
     *group_by,
     *walltime_strategy,
@@ -3376,6 +3403,8 @@ fn handle_regenerate(
     workflow_id: i64,
     account: Option<&str>,
     profile_name: Option<&str>,
+    partition: Option<&str>,
+    walltime: Option<&str>,
     single_allocation: bool,
     group_by: GroupByStrategy,
     walltime_strategy: WalltimeStrategy,
@@ -3585,7 +3614,18 @@ fn handle_regenerate(
         std::process::exit(1);
     });
 
-    use crate::client::scheduler_plan::generate_scheduler_plan;
+    use crate::client::scheduler_plan::{SchedulerOverrides, generate_scheduler_plan};
+
+    // Build overrides from partition/walltime arguments
+    let overrides = SchedulerOverrides {
+        partition: partition.map(|s| s.to_string()),
+        walltime_secs: walltime.map(|w| {
+            parse_walltime_secs(w).unwrap_or_else(|e| {
+                eprintln!("Error: invalid --walltime '{}': {}", w, e);
+                std::process::exit(1);
+            })
+        }),
+    };
 
     // Build WorkflowGraph from pending jobs for proper dependency-aware grouping
     // This aligns with create-slurm's behavior of separating jobs by (rr, has_dependencies)
@@ -3628,6 +3668,7 @@ fn handle_regenerate(
         true, // add_actions (we'll create them as recovery actions)
         Some(&format!("regen_{}", timestamp)),
         true, // is_recovery
+        &overrides,
     );
 
     // Combine warnings from planning
````
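`handle_regenerate` validates `--walltime` with a `parse_walltime_secs` helper whose definition is not shown in this commit. A plausible sketch, assuming only the documented `HH:MM:SS` and `D-HH:MM:SS` formats (the real helper in torc may well differ):

```rust
// Hypothetical walltime parser for HH:MM:SS and D-HH:MM:SS strings.
// Not torc's actual parse_walltime_secs; a sketch of the documented contract.
fn parse_walltime_secs(s: &str) -> Result<u64, String> {
    // Optional leading "D-" day component.
    let (days, rest) = match s.split_once('-') {
        Some((d, rest)) => (
            d.parse::<u64>().map_err(|e| format!("bad day count: {}", e))?,
            rest,
        ),
        None => (0, s),
    };
    let parts: Vec<&str> = rest.split(':').collect();
    if parts.len() != 3 {
        return Err(format!("expected HH:MM:SS, got '{}'", rest));
    }
    let nums: Vec<u64> = parts
        .iter()
        .map(|p| p.parse::<u64>())
        .collect::<Result<_, _>>()
        .map_err(|e| format!("non-numeric component: {}", e))?;
    let (h, m, sec) = (nums[0], nums[1], nums[2]);
    if m > 59 || sec > 59 {
        return Err("minutes and seconds must be 0-59".to_string());
    }
    Ok(days * 86_400 + h * 3_600 + m * 60 + sec)
}

fn main() {
    println!("{:?}", parse_walltime_secs("1-12:30:00"));
}
```

Failing fast on a malformed value (as the diff does via `unwrap_or_else` + `exit`) keeps a bad `--walltime` from reaching `sbatch`, where the error would surface much later.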
