Skip to content

Commit 9f70919

Browse files
authored
Upgrade NVIDIA driver and cuda version (aws#2887)
* Upgrade NVIDIA driver and cuda version
* Update CHANGELOG
* Fix unit tests
1 parent 89e4c46 commit 9f70919

File tree

6 files changed

+23
-11
lines changed

6 files changed

+23
-11
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1010
- Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security.
1111

1212
**CHANGES**
13+
- Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2.
14+
- Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2.
1315
- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.
1416
- Upgrade `aws-cfn-bootstrap` to version 2.0-32.
1517
- Upgrade amazon-efs-utils to version 2.1.0.

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@
1616

1717
# NVidia
1818
default['cluster']['nvidia']['enabled'] = 'no'
19-
default['cluster']['nvidia']['driver_version'] = '550.127.08'
19+
default['cluster']['nvidia']['driver_version'] = '570.86.15'
2020
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
21+
if platform?('amazon') && node['platform_version'] == "2"
22+
default['cluster']['nvidia']['driver_version'] = '550.127.08'
23+
end
2124

2225
# DCV
2326
default['cluster']['dcv']['authenticator']['user'] = "dcvextauth"

cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,20 @@
1919

2020
# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
22-
cuda_version = '12.4'
23-
cuda_patch = '1'
22+
cuda_version = '12.8'
23+
cuda_patch = '0'
2424
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
25-
cuda_version_suffix = '550.54.15'
25+
cuda_version_suffix = '570.86.10'
26+
cuda_samples_version = '12.8'
27+
if platform?('amazon') && node['platform_version'] == "2"
28+
cuda_version = '12.4'
29+
cuda_patch = '1'
30+
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
31+
cuda_version_suffix = '550.54.15'
32+
cuda_samples_version = '12.4'
33+
end
2634
cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
2735
cuda_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run"
28-
cuda_samples_version = '12.4'
2936
cuda_samples_url = "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz"
3037
tmp_cuda_run = '/tmp/cuda.run'
3138
tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_ubuntu20+.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use 'partial/_fabric_manager_install_debian.rb'
2121

2222
def fabric_manager_package
23-
'nvidia-fabricmanager-550'
23+
'nvidia-fabricmanager-570'
2424
end
2525

2626
def fabric_manager_version

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
require 'spec_helper'
22

33
describe 'aws-parallelcluster-platform::cuda' do
4-
cached(:cuda_version) { '12.4' }
5-
cached(:cuda_patch) { '1' }
4+
cached(:cuda_version) { '12.8' }
5+
cached(:cuda_patch) { '0' }
66
cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" }
7-
cached(:cuda_version_suffix) { '550.54.15' }
7+
cached(:cuda_version_suffix) { '570.86.10' }
88

99
context 'when nvidia not enabled' do
1010
cached(:chef_run) do
@@ -20,7 +20,7 @@
2020
context 'when on arm' do
2121
cached(:cuda_arch) { 'linux_sbsa' }
2222
cached(:cuda_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/cuda_#{cuda_complete_version}_#{cuda_version_suffix}_#{cuda_arch}.run" }
23-
cached(:cuda_samples_version) { '12.4' }
23+
cached(:cuda_samples_version) { '12.8' }
2424
cached(:cuda_samples_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/cuda/samples/v#{cuda_samples_version}.tar.gz" }
2525

2626
cached(:chef_run) do

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ def self.configure(chef_run)
168168

169169
for_all_oses do |platform, version|
170170
context "on #{platform}#{version}" do
171-
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-550' : 'nvidia-fabric-manager' }
171+
cached(:fabric_manager_package) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570' : 'nvidia-fabric-manager' }
172172
cached(:fabric_manager_version) { platform == 'ubuntu' ? "#{nvidia_driver_version}" : nvidia_driver_version }
173173

174174
context 'when fabric manager is to install' do

0 commit comments

Comments
 (0)