File tree Expand file tree Collapse file tree 7 files changed +29
-13
lines changed
cookbooks/aws-parallelcluster-platform Expand file tree Collapse file tree 7 files changed +29
-13
lines changed Original file line number Diff line number Diff line change 1616
1717# NVidia
1818default [ 'cluster' ] [ 'nvidia' ] [ 'enabled' ] = 'no'
19- default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '550.127.08 '
20- default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '3.3.6 '
19+ default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '570.86.15 '
20+ default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '3.3.9 '
2121
2222# DCV
2323default [ 'cluster' ] [ 'dcv' ] [ 'authenticator' ] [ 'user' ] = "dcvextauth"
Original file line number Diff line number Diff line change 1919
2020# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121# Cuda installer naming: cuda_11.8.0_520.61.05_linux
22- cuda_version = '12.4 '
23- cuda_patch = '1 '
22+ cuda_version = '12.8 '
23+ cuda_patch = '0 '
2424cuda_complete_version = "#{ cuda_version } .#{ cuda_patch } "
25- cuda_version_suffix = '550.54.15 '
25+ cuda_version_suffix = '570.86.10 '
2626cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
27- cuda_url = "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/ cuda/cuda_#{ cuda_complete_version } _#{ cuda_version_suffix } _#{ cuda_arch } .run"
28- cuda_samples_version = '12.4 '
29- cuda_samples_url = "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/cuda/ samples/v#{ cuda_samples_version } .tar.gz"
27+ cuda_url = "https://developer.download.nvidia.com/compute/ cuda/12.8.0/local_installers /cuda_#{ cuda_complete_version } _#{ cuda_version_suffix } _#{ cuda_arch } .run"
28+ cuda_samples_version = '12.8 '
29+ cuda_samples_url = "https://github.com/NVIDIA/cuda- samples/archive/refs/tags /v#{ cuda_samples_version } .tar.gz"
3030tmp_cuda_run = '/tmp/cuda.run'
3131tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
3232
Original file line number Diff line number Diff line change @@ -38,6 +38,10 @@ def arch_suffix
3838 arm_instance? ? 'arm64' : 'amd64'
3939end
4040
41+ def arch_name
42+ arm_instance? ? 'sbsa' : 'x86_64'
43+ end
44+
4145def fabric_manager_url
42- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_fabric/ #{ platform } /#{ fabric_manager_package } _#{ fabric_manager_version } -1_#{ arch_suffix } .deb"
46+ "https://developer.download.nvidia.com/compute/cuda/repos/ #{ platform } / #{ arch_name } /#{ fabric_manager_package } _#{ fabric_manager_version } -1_#{ arch_suffix } .deb"
4347end
Original file line number Diff line number Diff line change @@ -39,6 +39,10 @@ def arch_suffix
3939 arm_instance? ? 'aarch64' : 'x86_64'
4040end
4141
42+ def arch_name
43+ arm_instance? ? 'sbsa' : 'x86_64'
44+ end
45+
4246def fabric_manager_url
43- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_fabric/ #{ platform } /#{ fabric_manager_package } -#{ fabric_manager_version } -1.#{ arch_suffix } .rpm"
47+ "https://developer.download.nvidia.com/compute/cuda/repos/ #{ platform } / #{ arch_name } /#{ fabric_manager_package } -#{ fabric_manager_version } -1.#{ arch_suffix } .rpm"
4448end
Original file line number Diff line number Diff line change 3333 end
3434end
3535
36+ def arch_name
37+ arm_instance? ? 'sbsa' : 'x86_64'
38+ end
39+
3640def dcgm_url
37- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/ #{ platform } /#{ dcgm_package } _#{ package_version } _#{ arch_suffix } .deb"
41+ "https://developer.download.nvidia.com/compute/cuda/repos/ #{ platform } / #{ arch_name } /#{ dcgm_package } _#{ package_version } _#{ arch_suffix } .deb"
3842end
3943
4044def dcgm_package
Original file line number Diff line number Diff line change 3333 end
3434end
3535
36+ def arch_name
37+ arm_instance? ? 'sbsa' : 'x86_64'
38+ end
39+
3640def dcgm_url
37- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/ #{ platform } /#{ dcgm_package } -#{ package_version } -1-#{ arch_suffix } .rpm"
41+ "https://developer.download.nvidia.com/compute/cuda/repos/ #{ platform } / #{ arch_name } /#{ dcgm_package } -#{ package_version } -1-#{ arch_suffix } .rpm"
3842end
3943
4044def dcgm_package
Original file line number Diff line number Diff line change @@ -96,7 +96,7 @@ def _nvidia_driver_version
9696end
9797
9898def nvidia_driver_url
99- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_driver /NVIDIA-Linux-#{ nvidia_arch } -#{ _nvidia_driver_version } .run"
99+ "https://us.download.nvidia.com/tesla/ #{ _nvidia_driver_version } /NVIDIA-Linux-#{ nvidia_arch } -#{ _nvidia_driver_version } .run"
100100end
101101
102102def nvidia_driver_enabled?
You can’t perform that action at this time.
0 commit comments