Skip to content

Commit 9da8bc8

Browse files
committed
Fix memory settings for Kestrel HPC
The original values were scraped from the website and confused base-2 (MiB/GiB) with base-10 (MB/GB) memory units. This commit applies the exact per-partition memory values reported by `sinfo`.
1 parent 041edd2 commit 9da8bc8

File tree

2 files changed

+25
-25
lines changed

2 files changed

+25
-25
lines changed

src/client/hpc/kestrel.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! NLR Kestrel HPC profile
22
//!
33
//! Kestrel is NLR's flagship HPC system featuring:
4-
//! - 2,240 standard CPU nodes (104 cores, 240GB RAM each)
4+
//! - 2,240 standard CPU nodes (104 cores, ~240 GiB RAM each)
55
//! - 156 GPU nodes with 4x NVIDIA H100 GPUs (80GB each)
66
//! - Various specialized partitions for different workload types
77
//!
@@ -42,7 +42,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
4242
name: "debug".to_string(),
4343
description: "Nodes dedicated to developing and troubleshooting jobs".to_string(),
4444
cpus_per_node: 104,
45-
memory_mb: 240_000,
45+
memory_mb: 246_064,
4646
max_walltime_secs: 3600, // 1 hour
4747
max_nodes: Some(2),
4848
max_nodes_per_user: Some(2),
@@ -61,7 +61,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
6161
name: "short".to_string(),
6262
description: "Nodes that prefer jobs with walltimes <= 4 hours".to_string(),
6363
cpus_per_node: 104,
64-
memory_mb: 240_000, // ~240G usable (984256M total but we use practical limit)
64+
memory_mb: 246_064,
6565
max_walltime_secs: 4 * 3600, // 4 hours
6666
max_nodes: Some(2240),
6767
max_nodes_per_user: None,
@@ -80,7 +80,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
8080
name: "standard".to_string(),
8181
description: "Nodes that prefer jobs with walltimes <= 2 days".to_string(),
8282
cpus_per_node: 104,
83-
memory_mb: 240_000,
83+
memory_mb: 246_064,
8484
max_walltime_secs: 2 * 24 * 3600, // 2 days
8585
max_nodes: Some(2240),
8686
max_nodes_per_user: Some(1050),
@@ -99,7 +99,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
9999
name: "long".to_string(),
100100
description: "Nodes that prefer jobs with walltimes > 2 days (max 10 days)".to_string(),
101101
cpus_per_node: 104,
102-
memory_mb: 240_000,
102+
memory_mb: 246_064,
103103
max_walltime_secs: 10 * 24 * 3600, // 10 days
104104
max_nodes: Some(430),
105105
max_nodes_per_user: Some(215),
@@ -118,7 +118,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
118118
name: "medmem".to_string(),
119119
description: "Nodes with 1TB of RAM".to_string(),
120120
cpus_per_node: 104,
121-
memory_mb: 1_000_000, // ~1TB
121+
memory_mb: 984_256,
122122
max_walltime_secs: 10 * 24 * 3600, // 10 days
123123
max_nodes: Some(64),
124124
max_nodes_per_user: Some(32),
@@ -176,7 +176,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
176176
description: "CPU nodes with dual network interface cards for multi-node jobs"
177177
.to_string(),
178178
cpus_per_node: 104,
179-
memory_mb: 240_000,
179+
memory_mb: 246_064,
180180
max_walltime_secs: 2 * 24 * 3600, // 2 days
181181
max_nodes: Some(512),
182182
max_nodes_per_user: Some(256),
@@ -195,7 +195,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
195195
name: "hbwl".to_string(),
196196
description: "HBW nodes for jobs > 2 days (max 10 days)".to_string(),
197197
cpus_per_node: 104,
198-
memory_mb: 240_000,
198+
memory_mb: 246_064,
199199
max_walltime_secs: 10 * 24 * 3600, // 10 days
200200
max_nodes: Some(128),
201201
max_nodes_per_user: Some(64),
@@ -214,7 +214,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
214214
name: "nvme".to_string(),
215215
description: "CPU nodes with 1.7TB NVMe local drives".to_string(),
216216
cpus_per_node: 104,
217-
memory_mb: 240_000,
217+
memory_mb: 246_064,
218218
max_walltime_secs: 2 * 24 * 3600, // 2 days
219219
max_nodes: Some(256),
220220
max_nodes_per_user: Some(128),
@@ -233,7 +233,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
233233
name: "shared".to_string(),
234234
description: "Nodes that can be shared by multiple users and jobs".to_string(),
235235
cpus_per_node: 104,
236-
memory_mb: 240_000,
236+
memory_mb: 246_064,
237237
max_walltime_secs: 2 * 24 * 3600, // 2 days
238238
max_nodes: Some(128),
239239
max_nodes_per_user: Some(64),
@@ -252,7 +252,7 @@ fn kestrel_partitions() -> Vec<HpcPartition> {
252252
name: "sharedl".to_string(),
253253
description: "Shared nodes for jobs > 2 days".to_string(),
254254
cpus_per_node: 104,
255-
memory_mb: 240_000,
255+
memory_mb: 246_064,
256256
max_walltime_secs: 10 * 24 * 3600, // Docs say 2 days but listing says 10 days pattern
257257
max_nodes: Some(32),
258258
max_nodes_per_user: Some(16),

tests/test_hpc.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ fn test_kestrel_standard_partition() {
215215
.expect("Standard partition not found");
216216

217217
assert_eq!(standard.cpus_per_node, 104);
218-
assert_eq!(standard.memory_mb, 240_000);
218+
assert_eq!(standard.memory_mb, 246_064);
219219
assert_eq!(standard.max_walltime_secs, 172800); // 48 hours
220220
assert!(standard.gpus_per_node.is_none());
221221
}
@@ -1055,8 +1055,8 @@ fn test_generate_schedulers_sets_memory() {
10551055
let scheduler = &spec.slurm_schedulers.as_ref().unwrap()[0];
10561056
// Memory should be set to the partition's max memory, not the job's requirement.
10571057
// This allows jobs to use more memory than their estimates.
1058-
// Kestrel standard partition has 240,000 MB = 234g.
1059-
assert_eq!(scheduler.mem.as_deref(), Some("234g"));
1058+
// Kestrel standard partition has 246,064 MB = 240g.
1059+
assert_eq!(scheduler.mem.as_deref(), Some("240g"));
10601060
}
10611061

10621062
#[rstest]
@@ -1380,10 +1380,10 @@ fn test_generate_schedulers_stage_aware_for_dependent_jobs() {
13801380
#[rstest]
13811381
fn test_generate_schedulers_memory_constrained_allocation() {
13821382
// Create 10 jobs that are memory-heavy: 8 CPUs, 120GB each
1383-
// On Kestrel standard nodes (104 CPUs, 240GB):
1383+
// On Kestrel standard nodes (104 CPUs, 246,064MB):
13841384
// - CPU-based: 104/8 = 13 jobs per node
1385-
// - Memory-based: 240,000MB / 122,880MB = ~1.95 = 1 job per node
1386-
// Memory should be the limiting factor, so we need 10 nodes for 10 jobs
1385+
// - Memory-based: 246,064MB / 122,880MB = 2 jobs per node
1386+
// Memory should be the limiting factor
13871387
let jobs: Vec<JobSpec> = (0..10)
13881388
.map(|i| JobSpec {
13891389
name: format!("memory_job_{}", i),
@@ -1431,14 +1431,14 @@ fn test_generate_schedulers_memory_constrained_allocation() {
14311431

14321432
let action = &actions[0];
14331433
// 10 jobs, 120GB memory each, 1 hour runtime
1434-
// Concurrent by memory: 240GB / 120GB = 2 (but actually 240000MB / 122880MB = 1.95, so 1)
1434+
// Concurrent by memory: 246,064MB / 122,880MB = 2 jobs per node
14351435
// Time slots: 4h walltime / 1h runtime = 4 sequential batches
1436-
// Jobs per allocation: 1 concurrent × 4 time slots = 4 jobs
1437-
// Allocations needed: ceil(10 / 4) = 3
1436+
// Jobs per allocation: 2 concurrent × 4 time slots = 8 jobs
1437+
// Allocations needed: ceil(10 / 8) = 2
14381438
assert_eq!(
14391439
action.num_allocations,
1440-
Some(3),
1441-
"Should allocate 3 nodes for 10 memory-heavy jobs (1 concurrent × 4 time slots = 4 jobs per allocation)"
1440+
Some(2),
1441+
"Should allocate 2 nodes for 10 memory-heavy jobs (2 concurrent × 4 time slots = 8 jobs per allocation)"
14421442
);
14431443
}
14441444

@@ -1450,7 +1450,7 @@ fn test_generate_schedulers_cpu_vs_memory_constraint() {
14501450
description: Some("Test CPU vs memory constraints".to_string()),
14511451
jobs: vec![
14521452
// 4 CPU-limited jobs: 52 CPUs, 60GB each
1453-
// On 104 CPU / 240GB node: 104/52=2 by CPU, 240000/61440=3.9 by memory -> CPU wins (2 per node)
1453+
// On 104 CPU / 246,064MB node: 104/52=2 by CPU, 246064/61440=4 by memory -> CPU wins (2 per node)
14541454
// 4 jobs / 2 per node = 2 allocations
14551455
JobSpec {
14561456
name: "cpu_job_1".to_string(),
@@ -1507,8 +1507,8 @@ fn test_generate_schedulers_cpu_vs_memory_constraint() {
15071507

15081508
// 4 jobs, 52 CPUs each, 60GB memory, 1 hour runtime
15091509
// Concurrent by CPU: 104/52 = 2 jobs per node
1510-
// Concurrent by memory: 240000/61440 = 3.9 = 3 jobs per node
1511-
// Concurrent = min(2, 3) = 2 jobs per node (CPU-limited)
1510+
// Concurrent by memory: 246064/61440 = 4 jobs per node
1511+
// Concurrent = min(2, 4) = 2 jobs per node (CPU-limited)
15121512
// Time slots: 4h walltime / 1h runtime = 4 sequential batches
15131513
// Jobs per allocation: 2 concurrent × 4 time slots = 8 jobs
15141514
// Allocations needed: ceil(4 / 8) = 1

0 commit comments

Comments
 (0)