Skip to content

Commit e5a3c1a

Browse files
committed
test upgrade nvidia driver version
1 parent ddb2e4c commit e5a3c1a

File tree

7 files changed

+29
-13
lines changed

7 files changed

+29
-13
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
# NVidia
1818
default['cluster']['nvidia']['enabled'] = 'no'
19-
default['cluster']['nvidia']['driver_version'] = '550.127.08'
20-
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
19+
default['cluster']['nvidia']['driver_version'] = '570.86.15'
20+
default['cluster']['nvidia']['dcgm_version'] = '3.3.9'
2121

2222
# DCV
2323
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"

cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@
1919

2020
# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
22-
cuda_version = '12.4'
23-
cuda_patch = '1'
22+
cuda_version = '12.8'
23+
cuda_patch = '0'
2424
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
25-
cuda_version_suffix = '550.54.15'
25+
cuda_version_suffix = '570.86.10'
2626
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
27-
cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
28-
cuda_samples_version = '12.4'
29-
cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz"
27+
cuda_url = "https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
28+
cuda_samples_version = '12.8'
29+
cuda_samples_url = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{cuda_samples_version}.tar.gz"
3030
tmp_cuda_run = '/tmp/cuda.run'
3131
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
3232

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_debian.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ def arch_suffix
3838
arm_instance? ? 'arm64' : 'amd64'
3939
end
4040

41+
def arch_name
42+
arm_instance? ? 'sbsa' : 'x86_64'
43+
end
44+
4145
def fabric_manager_url
42-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
46+
"https://developer.download.nvidia.com/compute/cuda/repos/#{platform}/#{arch_name}/#{fabric_manager_package}_#{fabric_manager_version}-1_#{arch_suffix}.deb"
4347
end

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_install_rhel.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def arch_suffix
3939
arm_instance? ? 'aarch64' : 'x86_64'
4040
end
4141

42+
def arch_name
43+
arm_instance? ? 'sbsa' : 'x86_64'
44+
end
45+
4246
def fabric_manager_url
43-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
47+
"https://developer.download.nvidia.com/compute/cuda/repos/#{platform}/#{arch_name}/#{fabric_manager_package}-#{fabric_manager_version}-1.#{arch_suffix}.rpm"
4448
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@
3333
end
3434
end
3535

36+
def arch_name
37+
arm_instance? ? 'sbsa' : 'x86_64'
38+
end
39+
3640
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
41+
"https://developer.download.nvidia.com/compute/cuda/repos/#{platform}/#{arch_name}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
3842
end
3943

4044
def dcgm_package

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@
3333
end
3434
end
3535

36+
def arch_name
37+
arm_instance? ? 'sbsa' : 'x86_64'
38+
end
39+
3640
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm"
41+
"https://developer.download.nvidia.com/compute/cuda/repos/#{platform}/#{arch_name}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm"
3842
end
3943

4044
def dcgm_package

cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ def _nvidia_driver_version
9696
end
9797

9898
def nvidia_driver_url
99-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{_nvidia_driver_version}.run"
99+
"https://us.download.nvidia.com/tesla/#{_nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{_nvidia_driver_version}.run"
100100
end
101101

102102
def nvidia_driver_enabled?

0 commit comments

Comments
 (0)