File tree Expand file tree Collapse file tree 6 files changed +23
-11
lines changed
cookbooks/aws-parallelcluster-platform Expand file tree Collapse file tree 6 files changed +23
-11
lines changed Original file line number Diff line number Diff line change @@ -10,6 +10,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1010- Disable unused services such as cups and wpa_supplicant on official ParallelCluster AMIs to improve security.
1111
1212** CHANGES**
13+ - Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2.
14+ - Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2.
1315- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.
1416- Upgrade ` aws-cfn-bootstrap ` to version 2.0-32.
1517- Upgrade amazon-efs-utils to version 2.1.0.
Original file line number Diff line number Diff line change 1616
1717# NVIDIA
1818default [ 'cluster' ] [ 'nvidia' ] [ 'enabled' ] = 'no'
19- default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '550.127.08 '
19+ default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '570.86.15 '
2020default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '3.3.6'
21+ if platform? ( 'amazon' ) && node [ 'platform_version' ] == "2"
22+ default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '550.127.08'
23+ end
2124
2225# DCV
2326default [ 'cluster' ] [ 'dcv' ] [ 'authenticator' ] [ 'user' ] = "dcvextauth"
Original file line number Diff line number Diff line change 1919
2020# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121# Cuda installer naming: cuda_11.8.0_520.61.05_linux
22- cuda_version = '12.4 '
23- cuda_patch = '1 '
22+ cuda_version = '12.8 '
23+ cuda_patch = '0 '
2424cuda_complete_version = "#{ cuda_version } .#{ cuda_patch } "
25- cuda_version_suffix = '550.54.15'
25+ cuda_version_suffix = '570.86.10'
26+ cuda_samples_version = '12.8'
27+ if platform? ( 'amazon' ) && node [ 'platform_version' ] == "2"
28+ cuda_version = '12.4'
29+ cuda_patch = '1'
30+ cuda_complete_version = "#{ cuda_version } .#{ cuda_patch } "
31+ cuda_version_suffix = '550.54.15'
32+ cuda_samples_version = '12.4'
33+ end
2634cuda_arch = arm_instance? ? 'linux_sbsa' : 'linux'
2735cuda_url = "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/cuda/cuda_#{ cuda_complete_version } _#{ cuda_version_suffix } _#{ cuda_arch } .run"
28- cuda_samples_version = '12.4'
2936cuda_samples_url = "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/cuda/samples/v#{ cuda_samples_version } .tar.gz"
3037tmp_cuda_run = '/tmp/cuda.run'
3138tmp_cuda_sample_archive = '/tmp/cuda-sample.tar.gz'
Original file line number Diff line number Diff line change 2020use 'partial/_fabric_manager_install_debian.rb'
2121
2222def fabric_manager_package
23- 'nvidia-fabricmanager-550 '
23+ 'nvidia-fabricmanager-570 '
2424end
2525
2626def fabric_manager_version
Original file line number Diff line number Diff line change 11require 'spec_helper'
22
33describe 'aws-parallelcluster-platform::cuda' do
4- cached ( :cuda_version ) { '12.4 ' }
5- cached ( :cuda_patch ) { '1 ' }
4+ cached ( :cuda_version ) { '12.8 ' }
5+ cached ( :cuda_patch ) { '0 ' }
66 cached ( :cuda_complete_version ) { "#{ cuda_version } .#{ cuda_patch } " }
7- cached ( :cuda_version_suffix ) { '550.54.15 ' }
7+ cached ( :cuda_version_suffix ) { '570.86.10 ' }
88
99 context 'when nvidia not enabled' do
1010 cached ( :chef_run ) do
2020 context 'when on arm' do
2121 cached ( :cuda_arch ) { 'linux_sbsa' }
2222 cached ( :cuda_url ) { "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/cuda/cuda_#{ cuda_complete_version } _#{ cuda_version_suffix } _#{ cuda_arch } .run" }
23- cached ( :cuda_samples_version ) { '12.4 ' }
23+ cached ( :cuda_samples_version ) { '12.8 ' }
2424 cached ( :cuda_samples_url ) { "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/cuda/samples/v#{ cuda_samples_version } .tar.gz" }
2525
2626 cached ( :chef_run ) do
Original file line number Diff line number Diff line change @@ -168,7 +168,7 @@ def self.configure(chef_run)
168168
169169 for_all_oses do |platform , version |
170170 context "on #{ platform } #{ version } " do
171- cached ( :fabric_manager_package ) { platform == 'ubuntu' ? 'nvidia-fabricmanager-550 ' : 'nvidia-fabric-manager' }
171+ cached ( :fabric_manager_package ) { platform == 'ubuntu' ? 'nvidia-fabricmanager-570 ' : 'nvidia-fabric-manager' }
172172 cached ( :fabric_manager_version ) { platform == 'ubuntu' ? "#{ nvidia_driver_version } " : nvidia_driver_version }
173173
174174 context 'when fabric manager is to install' do
You can’t perform that action at this time.
0 commit comments