Skip to content

Commit 9da8bc8

Browse files
committed
Fix memory settings for Kestrel HPC
The original values were scraped from the website and confused base-2 (MiB/GiB) with base-10 (MB/GB) memory units. This commit applies the exact per-partition memory values reported by `sinfo`.
1 parent 041edd2 commit 9da8bc8

File tree

2 files changed

+25
-25
lines changed

2 files changed

+25
-25
lines changed

src/client/hpc/kestrel.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! NLR Kestrel HPC profile
22
//!
33
//! Kestrel is NLR's flagship HPC system featuring:
4-
//! - 2,240 standard CPU nodes (104 cores, 240GB RAM each)
4+
//! - 2,240 standard CPU nodes (104 cores, ~240 GiB RAM each)
55
//! - 156 GPU nodes with 4x NVIDIA H100 GPUs (80GB each)
66
//! - Various specialized partitions for different workload types
77
//!
@@ -42,7 +42,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
4242
name: "debug".to_string(),
4343
description: "Nodes dedicated to developing and troubleshooting jobs".to_string(),
4444
cpus_per_node: 104,
45-
memory_mb: 240_000,
45+
memory_mb: 246_064,
4646
max_walltime_secs: 3600, // 1 hour
4747
max_nodes: Some(2),
4848
max_nodes_per_user: Some(2),
@@ -61,7 +61,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
6161
name: "short".to_string(),
6262
description: "Nodes that prefer jobs with walltimes <= 4 hours".to_string(),
6363
cpus_per_node: 104,
64-
memory_mb: 240_000, // ~240G usable (984256M total but we use practical limit)
64+
memory_mb: 246_064,
6565
max_walltime_secs: 4 * 3600, // 4 hours
6666
max_nodes: Some(2240),
6767
max_nodes_per_user: None,
@@ -80,7 +80,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
8080
name: "standard".to_string(),
8181
description: "Nodes that prefer jobs with walltimes <= 2 days".to_string(),
8282
cpus_per_node: 104,
83-
memory_mb: 240_000,
83+
memory_mb: 246_064,
8484
max_walltime_secs: 2 * 24 * 3600, // 2 days
8585
max_nodes: Some(2240),
8686
max_nodes_per_user: Some(1050),
@@ -99,7 +99,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
9999
name: "long".to_string(),
100100
description: "Nodes that prefer jobs with walltimes > 2 days (max 10 days)".to_string(),
101101
cpus_per_node: 104,
102-
memory_mb: 240_000,
102+
memory_mb: 246_064,
103103
max_walltime_secs: 10 * 24 * 3600, // 10 days
104104
max_nodes: Some(430),
105105
max_nodes_per_user: Some(215),
@@ -118,7 +118,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
118118
name: "medmem".to_string(),
119119
description: "Nodes with 1TB of RAM".to_string(),
120120
cpus_per_node: 104,
121-
memory_mb: 1_000_000, // ~1TB
121+
memory_mb: 984_256,
122122
max_walltime_secs: 10 * 24 * 3600, // 10 days
123123
max_nodes: Some(64),
124124
max_nodes_per_user: Some(32),
@@ -176,7 +176,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
176176
description: "CPU nodes with dual network interface cards for multi-node jobs"
177177
.to_string(),
178178
cpus_per_node: 104,
179-
memory_mb: 240_000,
179+
memory_mb: 246_064,
180180
max_walltime_secs: 2 * 24 * 3600, // 2 days
181181
max_nodes: Some(512),
182182
max_nodes_per_user: Some(256),
@@ -195,7 +195,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
195195
name: "hbwl".to_string(),
196196
description: "HBW nodes for jobs > 2 days (max 10 days)".to_string(),
197197
cpus_per_node: 104,
198-
memory_mb: 240_000,
198+
memory_mb: 246_064,
199199
max_walltime_secs: 10 * 24 * 3600, // 10 days
200200
max_nodes: Some(128),
201201
max_nodes_per_user: Some(64),
@@ -214,7 +214,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
214214
name: "nvme".to_string(),
215215
description: "CPU nodes with 1.7TB NVMe local drives".to_string(),
216216
cpus_per_node: 104,
217-
memory_mb: 240_000,
217+
memory_mb: 246_064,
218218
max_walltime_secs: 2 * 24 * 3600, // 2 days
219219
max_nodes: Some(256),
220220
max_nodes_per_user: Some(128),
@@ -233,7 +233,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
233233
name: "shared".to_string(),
234234
description: "Nodes that can be shared by multiple users and jobs".to_string(),
235235
cpus_per_node: 104,
236-
memory_mb: 240_000,
236+
memory_mb: 246_064,
237237
max_walltime_secs: 2 * 24 * 3600, // 2 days
238238
max_nodes: Some(128),
239239
max_nodes_per_user: Some(64),
@@ -252,7 +252,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
252252
name: "sharedl".to_string(),
253253
description: "Shared nodes for jobs > 2 days".to_string(),
254254
cpus_per_node: 104,
255-
memory_mb: 240_000,
255+
memory_mb: 246_064,
256256
max_walltime_secs: 10 * 24 * 3600, // Docs say 2 days but listing says 10 days pattern
257257
max_nodes: Some(32),
258258
max_nodes_per_user: Some(16),

tests/test_hpc.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ fn test_kestrel_standard_partition() {
215215
.expect("Standard partition not found");
216216

217217
assert_eq!(standard.cpus_per_node, 104);
218-
assert_eq!(standard.memory_mb, 240_000);
218+
assert_eq!(standard.memory_mb, 246_064);
219219
assert_eq!(standard.max_walltime_secs, 172800); // 48 hours
220220
assert!(standard.gpus_per_node.is_none());
221221
}
@@ -1055,8 +1055,8 @@ fn test_generate_schedulers_sets_memory() {
10551055
let scheduler = &spec.slurm_schedulers.as_ref().unwrap()[0];
10561056
// Memory should be set to the partition's max memory, not the job's requirement.
10571057
// This allows jobs to use more memory than their estimates.
1058-
// Kestrel standard partition has 240,000 MB = 234g.
1059-
assert_eq!(scheduler.mem.as_deref(), Some("234g"));
1058+
// Kestrel standard partition has 246,064 MB = 240g.
1059+
assert_eq!(scheduler.mem.as_deref(), Some("240g"));
10601060
}
10611061

10621062
#[rstest]
@@ -1380,10 +1380,10 @@ fn test_generate_schedulers_stage_aware_for_dependent_jobs() {
13801380
#[rstest]
13811381
fn test_generate_schedulers_memory_constrained_allocation() {
13821382
// Create 10 jobs that are memory-heavy: 8 CPUs, 120GB each
1383-
// On Kestrel standard nodes (104 CPUs, 240GB):
1383+
// On Kestrel standard nodes (104 CPUs, 246,064MB):
13841384
// - CPU-based: 104/8 = 13 jobs per node
1385-
// - Memory-based: 240,000MB / 122,880MB = ~1.95 = 1 job per node
1386-
// Memory should be the limiting factor, so we need 10 nodes for 10 jobs
1385+
// - Memory-based: 246,064MB / 122,880MB = 2 jobs per node
1386+
// Memory should be the limiting factor
13871387
let jobs: Vec<JobSpec> = (0..10)
13881388
.map(|i| JobSpec {
13891389
name: format!("memory_job_{}", i),
@@ -1431,14 +1431,14 @@ fn test_generate_schedulers_memory_constrained_allocation() {
14311431

14321432
let action = &actions[0];
14331433
// 10 jobs, 120GB memory each, 1 hour runtime
1434-
// Concurrent by memory: 240GB / 120GB = 2 (but actually 240000MB / 122880MB = 1.95, so 1)
1434+
// Concurrent by memory: 246,064MB / 122,880MB = 2 jobs per node
14351435
// Time slots: 4h walltime / 1h runtime = 4 sequential batches
1436-
// Jobs per allocation: 1 concurrent × 4 time slots = 4 jobs
1437-
// Allocations needed: ceil(10 / 4) = 3
1436+
// Jobs per allocation: 2 concurrent × 4 time slots = 8 jobs
1437+
// Allocations needed: ceil(10 / 8) = 2
14381438
assert_eq!(
14391439
action.num_allocations,
1440-
Some(3),
1441-
"Should allocate 3 nodes for 10 memory-heavy jobs (1 concurrent × 4 time slots = 4 jobs per allocation)"
1440+
Some(2),
1441+
"Should allocate 2 nodes for 10 memory-heavy jobs (2 concurrent × 4 time slots = 8 jobs per allocation)"
14421442
);
14431443
}
14441444

@@ -1450,7 +1450,7 @@ fn test_generate_schedulers_cpu_vs_memory_constraint() {
14501450
description: Some("Test CPU vs memory constraints".to_string()),
14511451
jobs: vec![
14521452
// 4 CPU-limited jobs: 52 CPUs, 60GB each
1453-
// On 104 CPU / 240GB node: 104/52=2 by CPU, 240000/61440=3.9 by memory -> CPU wins (2 per node)
1453+
// On 104 CPU / 246,064MB node: 104/52=2 by CPU, 246064/61440=4 by memory -> CPU wins (2 per node)
14541454
// 4 jobs / 2 per node = 2 allocations
14551455
JobSpec {
14561456
name: "cpu_job_1".to_string(),
@@ -1507,8 +1507,8 @@ fn test_generate_schedulers_cpu_vs_memory_constraint() {
15071507

15081508
// 4 jobs, 52 CPUs each, 60GB memory, 1 hour runtime
15091509
// Concurrent by CPU: 104/52 = 2 jobs per node
1510-
// Concurrent by memory: 240000/61440 = 3.9 = 3 jobs per node
1511-
// Concurrent = min(2, 3) = 2 jobs per node (CPU-limited)
1510+
// Concurrent by memory: 246064/61440 = 4 jobs per node
1511+
// Concurrent = min(2, 4) = 2 jobs per node (CPU-limited)
15121512
// Time slots: 4h walltime / 1h runtime = 4 sequential batches
15131513
// Jobs per allocation: 2 concurrent × 4 time slots = 8 jobs
15141514
// Allocations needed: ceil(4 / 8) = 1

0 commit comments

Comments
 (0)