Skip to content

Commit aa1a782

Browse files
authored
Merge branch 'develop' into develop
2 parents 71a6b2a + aecff90 commit aa1a782

File tree

17 files changed

+115
-71
lines changed

17 files changed

+115
-71
lines changed

CHANGELOG.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,20 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1111

1212
**CHANGES**
1313
- Ubuntu 20.04 is no longer supported.
14-
- Upgrade Slurm to version 24.11.5.
14+
- Upgrade Slurm to version 24.11.6 (from 24.05.8).
15+
- Upgrade EFA installer to 1.42.0 (from 1.41.0).
16+
- Efa-driver: efa-2.15.3-1
17+
- Efa-config: efa-config-1.18-1
18+
- Efa-profile: efa-profile-1.7-1
19+
- Libfabric-aws: libfabric-aws-2.1.0-3
20+
- Rdma-core: rdma-core-57.0-1
21+
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11
22+
- Upgrade Cinc Client to version 18.4.12 (from 18.2.7).
23+
- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2.
24+
- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2.
25+
- Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2.
26+
- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2.
27+
- Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1).
1528
- Address the known cluster ID mismatch issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
1629
- Upgrade DCV to version 2024.0-19030.
1730
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.

cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
pyenv_dir = "#{base_dir}/pyenv"
1414

1515
control 'tag:install_awsbatch_virtualenv_created' do
16-
python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8'
16+
python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11'
1717
title "awsbatch virtualenv should be created on #{python_version}"
1818
only_if { !os_properties.redhat? }
1919

cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ suites:
3131
attributes:
3232
cluster:
3333
custom_node_package: https://github.com/aws/aws-parallelcluster-node/archive/develop.tar.gz
34-
python-version: 3.12.8
34+
python-version: 3.12.11
3535
node_virtualenv_path: /opt/parallelcluster/pyenv/versions/node_virtualenv
3636
- name: fleet_status
3737
run_list:

cookbooks/aws-parallelcluster-environment/attributes/environment.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@
7070

7171
default['cluster']['head_node_private_ip'] = nil
7272

73-
default['cluster']['efa']['version'] = '1.41.0'
74-
default['cluster']['efa']['sha256'] = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f'
73+
default['cluster']['efa']['version'] = '1.42.0'
74+
default['cluster']['efa']['sha256'] = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008'
7575

7676
default['cluster']['efs']['version'] = '2.3.1'
7777
default['cluster']['efs']['sha256'] = 'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15'

cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
# parallelcluster default source dir defined in attributes
44
source_dir = '/opt/parallelcluster/sources'
5-
efa_version = '1.41.0'
6-
efa_checksum = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f'
5+
efa_version = '1.42.0'
6+
efa_checksum = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008'
77

88
class ConvergeEfa
99
def self.setup(chef_run, efa_version: nil, efa_checksum: nil)

cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
pyenv_dir = "#{base_dir}/pyenv"
1414

1515
control 'tag:install_cfnbootstrap_virtualenv_created' do
16-
cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8'
16+
cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11'
1717
title "cfnbootstrap virtualenv should be created on #{cfn_python_version}"
1818
only_if { !os_properties.redhat_on_docker? }
1919

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@
1717
# NVIDIA
1818
default['cluster']['nvidia']['enabled'] = 'no'
1919
default['cluster']['nvidia']['driver_version'] = '570.172.08'
20-
default['cluster']['nvidia']['dcgm_version'] = '3.3.6'
20+
default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2'
2121
if platform?('amazon') && node['platform_version'] == "2"
2222
default['cluster']['nvidia']['driver_version'] = '550.127.08'
23+
default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1'
2324
end
2425

2526
# nvidia-imex

cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
# Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive
2121
# Cuda installer naming: cuda_11.8.0_520.61.05_linux
2222
cuda_version = '12.8'
23-
cuda_patch = '0'
23+
cuda_patch = '1'
2424
cuda_complete_version = "#{cuda_version}.#{cuda_patch}"
25-
cuda_version_suffix = '570.86.10'
25+
cuda_version_suffix = '570.124.06'
2626
cuda_samples_version = '12.8'
2727
if platform?('amazon') && node['platform_version'] == "2"
2828
cuda_version = '12.4'

cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# limitations under the License.
1717

1818
intelmpi_supported = !arm_instance?
19-
intelmpi_version = '2021.13'
19+
intelmpi_version = '2021.16'
2020

2121
node.default['conditions']['intel_mpi_supported'] = intelmpi_supported
2222
node.default['cluster']['intelmpi']['version'] = intelmpi_version
@@ -25,9 +25,9 @@
2525

2626
return unless intelmpi_supported
2727

28-
intelmpi_full_version = "#{intelmpi_version}.1.769"
28+
intelmpi_full_version = "#{intelmpi_version}.0.443"
2929
intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}"
30-
intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh"
30+
intelmpi_installer = "intel-mpi-#{intelmpi_full_version}_offline.sh"
3131
intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}"
3232
intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}"
3333
intelmpi_qt_version = '6.5.3'

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,34 +13,45 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
17-
source "#{dcgm_url}"
18-
mode '0644'
19-
retries 3
20-
retry_delay 5
21-
action :create_if_missing
22-
end
16+
packages_urls_list = if package_version.start_with?("3.")
17+
[dcgm_package]
18+
else
19+
[dcgm4_core_package, dcgm4_package]
20+
end
21+
packages_urls_list.each do |package|
22+
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do
23+
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb"
24+
mode '0644'
25+
retries 3
26+
retry_delay 5
27+
action :create_if_missing
28+
end
2329

24-
bash "Install #{dcgm_package}" do
25-
user 'root'
26-
cwd node['cluster']['sources_dir']
27-
code <<-DCGM_INSTALL
28-
set -e
29-
dpkg -i #{dcgm_package}-#{package_version}.deb
30-
DCGM_INSTALL
31-
retries 3
32-
retry_delay 5
30+
bash "Install #{package}" do
31+
user 'root'
32+
cwd node['cluster']['sources_dir']
33+
code <<-DCGM_INSTALL
34+
set -e
35+
dpkg -i #{package}-#{package_version}.deb
36+
DCGM_INSTALL
37+
retries 3
38+
retry_delay 5
39+
end
3340
end
3441
end
3542

36-
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
38-
end
39-
4043
def dcgm_package
4144
'datacenter-gpu-manager'
4245
end
4346

47+
def dcgm4_package
48+
"#{dcgm_package}-4-cuda12"
49+
end
50+
51+
def dcgm4_core_package
52+
"#{dcgm_package}-4-core"
53+
end
54+
4455
def arch_suffix
4556
arm_instance? ? 'arm64' : 'amd64'
4657
end

0 commit comments

Comments
 (0)