Skip to content

Commit a7abe09

Browse files
DCGM upgrade
Signed-off-by: Hanwen <[email protected]>
1 parent faad16d commit a7abe09

File tree

5 files changed

+74
-42
lines changed

5 files changed

+74
-42
lines changed

CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,15 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1111

1212
**CHANGES**
1313
- Ubuntu 20.04 is no longer supported.
14-
- Upgrade Slurm to version 24.11.5.
14+
- Upgrade Slurm to version 24.11.6 (from 24.05.8).
15+
- Upgrade EFA installer to 1.42.0 (from 1.41.0).
16+
- Efa-driver: efa-2.15.3-1
17+
- Efa-config: efa-config-1.18-1
18+
- Efa-profile: efa-profile-1.7-1
19+
- Libfabric-aws: libfabric-aws-2.1.0-3
20+
- Rdma-core: rdma-core-57.0-1
21+
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11
22+
- Upgrade Cinc Client to version 18.4.12 (from 18.2.7).
1523
- Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
1624
- Upgrade DCV to version 2024.0-19030.
1725
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717
# NVidia
1818
default['cluster']['nvidia']['enabled'] = 'no'
1919
default['cluster']['nvidia']['driver_version'] = '570.172.08'
20-
default['cluster']['nvidia']['dcgm_version'] = '4-cuda12-4.2.3-2.'
20+
default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2'
2121
if platform?('amazon') && node['platform_version'] == "2"
2222
default['cluster']['nvidia']['driver_version'] = '550.127.08'
23-
default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1-'
23+
default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1'
2424
end
2525

2626
# DCV

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,34 +13,45 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
17-
source "#{dcgm_url}"
18-
mode '0644'
19-
retries 3
20-
retry_delay 5
21-
action :create_if_missing
16+
if package_version.start_with?("3.")
17+
packages_urls_list = [dcgm_package]
18+
else
19+
packages_urls_list = [dcgm4_core_package, dcgm4_package]
2220
end
21+
for package in packages_urls_list
22+
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do
23+
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb"
24+
mode '0644'
25+
retries 3
26+
retry_delay 5
27+
action :create_if_missing
28+
end
2329

24-
bash "Install #{dcgm_package}" do
25-
user 'root'
26-
cwd node['cluster']['sources_dir']
27-
code <<-DCGM_INSTALL
28-
set -e
29-
dpkg -i #{dcgm_package}-#{package_version}.deb
30-
DCGM_INSTALL
31-
retries 3
32-
retry_delay 5
30+
bash "Install #{package}" do
31+
user 'root'
32+
cwd node['cluster']['sources_dir']
33+
code <<-DCGM_INSTALL
34+
set -e
35+
dpkg -i #{package}-#{package_version}.deb
36+
DCGM_INSTALL
37+
retries 3
38+
retry_delay 5
39+
end
3340
end
3441
end
3542

36-
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
38-
end
39-
4043
def dcgm_package
4144
'datacenter-gpu-manager'
4245
end
4346

47+
def dcgm4_package
48+
"#{dcgm_package}-4-cuda12"
49+
end
50+
51+
def dcgm4_core_package
52+
"#{dcgm_package}-4-core"
53+
end
54+
4455
def arch_suffix
4556
arm_instance? ? 'arm64' : 'amd64'
4657
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,34 +13,47 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do
17-
source "#{dcgm_url}"
18-
mode '0644'
19-
retries 3
20-
retry_delay 5
21-
action :create_if_missing
16+
if package_version.start_with?("3.")
17+
packages_urls_list = [dcgm_package]
18+
package_url_separator = "-"
19+
else
20+
packages_urls_list = [dcgm4_core_package, dcgm4_package]
21+
package_url_separator = "."
2222
end
23+
for package in packages_urls_list
24+
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do
25+
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}#{package_url_separator}#{arch_suffix}.rpm"
26+
mode '0644'
27+
retries 3
28+
retry_delay 5
29+
action :create_if_missing
30+
end
2331

24-
bash "Install #{dcgm_package}" do
25-
user 'root'
26-
cwd node['cluster']['sources_dir']
27-
code <<-DCGM_INSTALL
28-
set -e
29-
yum install -y #{dcgm_package}-#{package_version}.rpm
30-
DCGM_INSTALL
31-
retries 3
32-
retry_delay 5
32+
bash "Install #{package}" do
33+
user 'root'
34+
cwd node['cluster']['sources_dir']
35+
code <<-DCGM_INSTALL
36+
set -e
37+
yum install -y #{package}-#{package_version}.rpm
38+
DCGM_INSTALL
39+
retries 3
40+
retry_delay 5
41+
end
3342
end
3443
end
3544

36-
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}#{arch_suffix}.rpm"
38-
end
39-
4045
def dcgm_package
4146
'datacenter-gpu-manager'
4247
end
4348

49+
def dcgm4_package
50+
"#{dcgm_package}-4-cuda12"
51+
end
52+
53+
def dcgm4_core_package
54+
"#{dcgm_package}-4-core"
55+
end
56+
4457
def arch_suffix
4558
arm_instance? ? 'aarch64' : 'x86_64'
4659
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
(!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?))
1616
end
1717

18-
describe package('datacenter-gpu-manager') do
18+
describe package('datacenter-gpu-manager-4-cuda12') do
1919
it { should be_installed }
2020
end
2121
end

0 commit comments

Comments
 (0)