Skip to content

Commit c663fac

Browse files
committed
Fix GPU order in gres.conf
1 parent 6b80f29 commit c663fac

File tree

1 file changed

+27
-26
lines changed
  • soperator/modules/available_resources

1 file changed

+27
-26
lines changed

soperator/modules/available_resources/gres.tf

Lines changed: 27 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ locals {
77
(local.platforms.gpu-b300-sxm) = "nvidia_b300_sxm6_ac"
88
})
99

10+
# Each platform's GPU list must be ordered by the position of -1 (the GPU's own slot) in the Links field, so that the list order corresponds to the GPU order in nvidia-smi
1011
gres_config_by_platforms = tomap({
1112
(local.platforms.cpu-e2) = [
1213
"AutoDetect=off"
@@ -15,54 +16,54 @@ locals {
1516
"AutoDetect=off"
1617
]
1718
(local.platforms.gpu-h100-sxm) = [
18-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia0 Cores=32-63 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
19-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia1 Cores=32-63 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
20-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia2 Cores=32-63 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
21-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia3 Cores=32-63 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
2219
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia4 Cores=0-31 Links=-1,1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
2320
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia5 Cores=0-31 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
2421
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia6 Cores=0-31 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
2522
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia7 Cores=0-31 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
23+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia0 Cores=32-63 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
24+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia1 Cores=32-63 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
25+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia2 Cores=32-63 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
26+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h100-sxm]} File=/dev/nvidia3 Cores=32-63 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
2627
]
2728
(local.platforms.gpu-h200-sxm) = [
28-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia0 Cores=32-63 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
29-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia1 Cores=32-63 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
30-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia2 Cores=32-63 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
31-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia3 Cores=32-63 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
3229
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia4 Cores=0-31 Links=-1,1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
3330
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia5 Cores=0-31 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
3431
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia6 Cores=0-31 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
3532
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia7 Cores=0-31 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
33+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia0 Cores=32-63 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
34+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia1 Cores=32-63 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
35+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia2 Cores=32-63 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
36+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-h200-sxm]} File=/dev/nvidia3 Cores=32-63 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
3637
]
3738
(local.platforms.gpu-b200-sxm) = [
38-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia0 Cores=40-79 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
39-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia1 Cores=40-79 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
40-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia2 Cores=40-79 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
41-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia3 Cores=40-79 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
4239
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia4 Cores=0-39 Links=-1,1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
4340
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia5 Cores=0-39 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
4441
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia6 Cores=0-39 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
4542
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia7 Cores=0-39 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
43+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia0 Cores=40-79 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
44+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia1 Cores=40-79 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
45+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia2 Cores=40-79 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
46+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm]} File=/dev/nvidia3 Cores=40-79 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
4647
]
4748
(local.platforms.gpu-b200-sxm-a) = [
48-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia0 Cores=40-79 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
49-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia1 Cores=40-79 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
50-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia2 Cores=40-79 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
51-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia3 Cores=40-79 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
52-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia4 Cores=0-39 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
53-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia5 Cores=0-39 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
54-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia6 Cores=0-39 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
5549
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia7 Cores=0-39 Links=-1,1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
50+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia6 Cores=0-39 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
51+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia5 Cores=0-39 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
52+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia4 Cores=0-39 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
53+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia3 Cores=40-79 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
54+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia2 Cores=40-79 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
55+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia1 Cores=40-79 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
56+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b200-sxm-a]} File=/dev/nvidia0 Cores=40-79 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
5657
]
5758
(local.platforms.gpu-b300-sxm) = [
58-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia0 Cores=48-95 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
59-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia1 Cores=48-95 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
60-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia2 Cores=48-95 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
61-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia3 Cores=48-95 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
62-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia4 Cores=0-47 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
63-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia5 Cores=0-47 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
64-
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia6 Cores=0-47 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
6559
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia7 Cores=0-47 Links=-1,1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
60+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia6 Cores=0-47 Links=1,-1,1,1,1,1,1,1 Flags=nvidia_gpu_env",
61+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia5 Cores=0-47 Links=1,1,-1,1,1,1,1,1 Flags=nvidia_gpu_env",
62+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia4 Cores=0-47 Links=1,1,1,-1,1,1,1,1 Flags=nvidia_gpu_env",
63+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia3 Cores=48-95 Links=1,1,1,1,-1,1,1,1 Flags=nvidia_gpu_env",
64+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia2 Cores=48-95 Links=1,1,1,1,1,-1,1,1 Flags=nvidia_gpu_env",
65+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia1 Cores=48-95 Links=1,1,1,1,1,1,-1,1 Flags=nvidia_gpu_env",
66+
"AutoDetect=off Name=gpu Type=${local.gres_by_platforms[local.platforms.gpu-b300-sxm]} File=/dev/nvidia0 Cores=48-95 Links=1,1,1,1,1,1,1,-1 Flags=nvidia_gpu_env",
6667
]
6768
})
6869
}

0 commit comments

Comments
 (0)