From 946fdc74c56b72af699759e02b25b8a1d421a258 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Wed, 30 Jul 2025 08:15:38 -0700 Subject: [PATCH 1/2] Upgrade dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Upgrade Slurm to version 24.11.6 (from 24.05.8). - Upgrade EFA installer to 1.42.0 (from 1.41.0). - Efa-driver: efa-2.15.3-1 - Efa-config: efa-config-1.18-1 - Efa-profile: efa-profile-1.7-1 - Libfabric-aws: libfabric-aws-2.1.0-3 - Rdma-core: rdma-core-57.0-1 - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11 - Upgrade Cinc Client to version 18.4.12 (from 18.2.7). - Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. - Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2. - Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2. - Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2. - Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1). Among the above upgrades, DCGM is a major version upgrade (from version 3 to version 4). This is a new change in DCGM 4: ``` Installation assets are no longer shipped in a single monolithic package. Instead, installation assets have been split among several packages, allowing clients to opt-out of the installation of assets not applicable to their use case.   
Component packages are as follows:       datacenter-gpu-manager-4-core               Provides nv-hostengine binary and other CUDA-agnostic installation assets available through the DCGM open source product       datacenter-gpu-manager-4-cuda11               Provides the CUDA11-specific binaries available through the DCGM open source product       datacenter-gpu-manager-4-cuda12               Provides the CUDA12-specific binaries available through the DCGM open source product       datacenter-gpu-manager-4-proprietary               Provides CUDA-agnostic installation assets not distributed as part of the DCGM open source product       datacenter-gpu-manager-4-proprietary-cuda11               Provides CUDA11 binaries not distributed as part of the DCGM open source product       datacenter-gpu-manager-4-proprietary-cuda12               Provides CUDA12 binaries not distributed as part of the DCGM open source product       datacenter-gpu-manager-4-development               Provides files necessary for the development of downstream software dependent on the DCGM library ``` https://docs.nvidia.com/datacenter/dcgm/latest/release-notes/changelog.html Signed-off-by: Hanwen Signed-off-by: Hanwen --- CHANGELOG.md | 15 +++- .../test/controls/awsbatch_virtualenv_spec.rb | 2 +- .../kitchen.computefleet-config.yml | 2 +- .../Berksfile.lock | 84 +++++++++++++++++++ .../Berksfile.lock | 28 +++++++ .../attributes/environment.rb | 4 +- .../spec/unit/resources/efa_spec.rb | 4 +- .../test/controls/cfn_bootstrap_spec.rb | 2 +- .../Berksfile.lock | 23 +++++ .../attributes/platform.rb | 3 +- .../recipes/install/cuda.rb | 4 +- .../recipes/install/intel_mpi.rb | 6 +- .../partial/_nvidia_dcgm_debian.rb | 51 ++++++----- .../nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb | 51 ++++++----- .../spec/unit/recipes/cuda_spec.rb | 4 +- .../spec/unit/recipes/intel_mpi_spec.rb | 18 ++-- .../spec/unit/resources/nvidia_dcgm_spec.rb | 3 +- .../test/controls/nvidia_dcgm_spec.rb | 11 ++- 
.../aws-parallelcluster-shared/Berksfile.lock | 15 ++++ .../attributes/versions.rb | 2 +- .../aws-parallelcluster-slurm/Berksfile.lock | 57 +++++++++++++ .../attributes/versions.rb | 4 +- 22 files changed, 322 insertions(+), 71 deletions(-) create mode 100644 cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock create mode 100644 cookbooks/aws-parallelcluster-environment/Berksfile.lock create mode 100644 cookbooks/aws-parallelcluster-platform/Berksfile.lock create mode 100644 cookbooks/aws-parallelcluster-shared/Berksfile.lock create mode 100644 cookbooks/aws-parallelcluster-slurm/Berksfile.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index 384e2e4789..18287a6318 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,20 @@ This file is used to list changes made in each version of the AWS ParallelCluste **CHANGES** - Ubuntu 20.04 is no longer supported. -- Upgrade Slurm to version 24.11.5. +- Upgrade Slurm to version 24.11.6 (from 24.05.8). +- Upgrade EFA installer to 1.42.0 (from 1.41.0). + - Efa-driver: efa-2.15.3-1 + - Efa-config: efa-config-1.18-1 + - Efa-profile: efa-profile-1.7-1 + - Libfabric-aws: libfabric-aws-2.1.0-3 + - Rdma-core: rdma-core-57.0-1 + - Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11 +- Upgrade Cinc Client to version to 18.4.12 from 18.2.7. +- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2. +- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2. +- Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2. +- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2. +- Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1). - Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting. - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. 
diff --git a/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb b/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb index 7b877b10f5..5b445b7cd8 100644 --- a/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb +++ b/cookbooks/aws-parallelcluster-awsbatch/test/controls/awsbatch_virtualenv_spec.rb @@ -13,7 +13,7 @@ pyenv_dir = "#{base_dir}/pyenv" control 'tag:install_awsbatch_virtualenv_created' do - python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8' + python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11' title "awsbatch virtualenv should be created on #{python_version}" only_if { !os_properties.redhat? } diff --git a/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml b/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml index 3bcf169b8b..d127d77b6b 100644 --- a/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml +++ b/cookbooks/aws-parallelcluster-computefleet/kitchen.computefleet-config.yml @@ -31,7 +31,7 @@ suites: attributes: cluster: custom_node_package: https://github.com/aws/aws-parallelcluster-node/archive/develop.tar.gz - python-version: 3.12.8 + python-version: 3.12.11 node_virtualenv_path: /opt/parallelcluster/pyenv/versions/node_virtualenv - name: fleet_status run_list: diff --git a/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock b/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock new file mode 100644 index 0000000000..756da92317 --- /dev/null +++ b/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock @@ -0,0 +1,84 @@ +DEPENDENCIES + aws-parallelcluster-awsbatch + path: ../aws-parallelcluster-awsbatch + aws-parallelcluster-computefleet + path: ../aws-parallelcluster-computefleet + aws-parallelcluster-entrypoints + path: . 
+ metadata: true + aws-parallelcluster-environment + path: ../aws-parallelcluster-environment + aws-parallelcluster-platform + path: ../aws-parallelcluster-platform + aws-parallelcluster-shared + path: ../aws-parallelcluster-shared + aws-parallelcluster-slurm + path: ../aws-parallelcluster-slurm + aws-parallelcluster-tests + path: ../aws-parallelcluster-tests + iptables + path: ../third-party/iptables-8.0.0 + line + path: ../third-party/line-4.5.21 + nfs + path: ../third-party/nfs-5.1.5 + openssh + path: ../third-party/openssh-2.11.14 + yum + path: ../third-party/yum-7.4.20 + yum-epel + path: ../third-party/yum-epel-5.0.8 + +GRAPH + aws-parallelcluster-awsbatch (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + iptables (~> 8.0.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + openssh (~> 2.11.14) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + aws-parallelcluster-computefleet (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + aws-parallelcluster-entrypoints (3.13.0) + aws-parallelcluster-awsbatch (~> 3.13.0) + aws-parallelcluster-computefleet (~> 3.13.0) + aws-parallelcluster-environment (~> 3.13.0) + aws-parallelcluster-platform (~> 3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + aws-parallelcluster-slurm (~> 3.13.0) + aws-parallelcluster-environment (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + aws-parallelcluster-platform (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + aws-parallelcluster-shared (3.13.0) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + aws-parallelcluster-slurm (3.13.0) + aws-parallelcluster-computefleet (~> 3.13.0) + aws-parallelcluster-environment (~> 3.13.0) + aws-parallelcluster-platform (~> 3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + iptables (~> 8.0.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + openssh (~> 2.11.14) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + aws-parallelcluster-tests (3.13.0) + aws-parallelcluster-computefleet (~> 3.13.0) + aws-parallelcluster-environment (~> 3.13.0) + 
aws-parallelcluster-platform (~> 3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + aws-parallelcluster-slurm (~> 3.13.0) + iptables (8.0.0) + line (4.5.21) + nfs (5.1.5) + line (>= 0.0.0) + openssh (2.11.14) + iptables (>= 7.0) + yum (7.4.20) + yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-environment/Berksfile.lock b/cookbooks/aws-parallelcluster-environment/Berksfile.lock new file mode 100644 index 0000000000..ee9fbb6d3f --- /dev/null +++ b/cookbooks/aws-parallelcluster-environment/Berksfile.lock @@ -0,0 +1,28 @@ +DEPENDENCIES + aws-parallelcluster-environment + path: . + metadata: true + aws-parallelcluster-shared + path: ../aws-parallelcluster-shared + line + path: ../third-party/line-4.5.21 + nfs + path: ../third-party/nfs-5.1.5 + yum + path: ../third-party/yum-7.4.20 + yum-epel + path: ../third-party/yum-epel-5.0.8 + +GRAPH + aws-parallelcluster-environment (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + aws-parallelcluster-shared (3.13.0) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + line (4.5.21) + nfs (5.1.5) + line (>= 0.0.0) + yum (7.4.20) + yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index 366356c30e..77f8da4d51 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -70,8 +70,8 @@ default['cluster']['head_node_private_ip'] = nil -default['cluster']['efa']['version'] = '1.41.0' -default['cluster']['efa']['sha256'] = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f' +default['cluster']['efa']['version'] = '1.42.0' +default['cluster']['efa']['sha256'] = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008' default['cluster']['efs']['version'] = '2.3.1' default['cluster']['efs']['sha256'] = 
'ced12f82e76f9740476b63f30c49bd76cc00b6375e12a9f5f7ba852635c49e15' diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index 1fa04cdcc9..bccd275b8c 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -2,8 +2,8 @@ # parallelcluster default source dir defined in attributes source_dir = '/opt/parallelcluster/sources' -efa_version = '1.41.0' -efa_checksum = '3506354cdfbe31ff552fe75f5d0d9bb7efd29cf79bd99457347d29c751c38f9f' +efa_version = '1.42.0' +efa_checksum = '4114fe612905ee05083ae5cb391a00a012510f3abfecc642d86c9a5ae4be9008' class ConvergeEfa def self.setup(chef_run, efa_version: nil, efa_checksum: nil) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb index 132c3a94ed..ff4b88c50b 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cfn_bootstrap_spec.rb @@ -13,7 +13,7 @@ pyenv_dir = "#{base_dir}/pyenv" control 'tag:install_cfnbootstrap_virtualenv_created' do - cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.8' + cfn_python_version = os_properties.alinux2? ? '3.9.20' : '3.12.11' title "cfnbootstrap virtualenv should be created on #{cfn_python_version}" only_if { !os_properties.redhat_on_docker? } diff --git a/cookbooks/aws-parallelcluster-platform/Berksfile.lock b/cookbooks/aws-parallelcluster-platform/Berksfile.lock new file mode 100644 index 0000000000..97e06da9a6 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/Berksfile.lock @@ -0,0 +1,23 @@ +DEPENDENCIES + aws-parallelcluster-platform + path: . 
+ metadata: true + aws-parallelcluster-shared + path: ../aws-parallelcluster-shared + line + path: ../third-party/line-4.5.21 + yum + path: ../third-party/yum-7.4.20 + yum-epel + path: ../third-party/yum-epel-5.0.8 + +GRAPH + aws-parallelcluster-platform (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + aws-parallelcluster-shared (3.13.0) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + line (4.5.21) + yum (7.4.20) + yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 8201c2c04e..533cde463e 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -17,9 +17,10 @@ # NVidia default['cluster']['nvidia']['enabled'] = 'no' default['cluster']['nvidia']['driver_version'] = '570.172.08' -default['cluster']['nvidia']['dcgm_version'] = '3.3.6' +default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['nvidia']['driver_version'] = '550.127.08' + default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1' end # nvidia-imex diff --git a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb index 3c7ba588bb..a311ab0ba9 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/cuda.rb @@ -20,9 +20,9 @@ # Cuda installer from https://developer.nvidia.com/cuda-toolkit-archive # Cuda installer naming: cuda_11.8.0_520.61.05_linux cuda_version = '12.8' -cuda_patch = '0' +cuda_patch = '1' cuda_complete_version = "#{cuda_version}.#{cuda_patch}" -cuda_version_suffix = '570.86.10' +cuda_version_suffix = '570.124.06' cuda_samples_version = '12.8' if platform?('amazon') && node['platform_version'] == "2" cuda_version = '12.4' diff --git 
a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb index 85917d3f6b..b868b3ff69 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb +++ b/cookbooks/aws-parallelcluster-platform/recipes/install/intel_mpi.rb @@ -16,7 +16,7 @@ # limitations under the License. intelmpi_supported = !arm_instance? -intelmpi_version = '2021.13' +intelmpi_version = '2021.16' node.default['conditions']['intel_mpi_supported'] = intelmpi_supported node.default['cluster']['intelmpi']['version'] = intelmpi_version @@ -25,9 +25,9 @@ return unless intelmpi_supported -intelmpi_full_version = "#{intelmpi_version}.1.769" +intelmpi_full_version = "#{intelmpi_version}.0.443" intelmpi_installation_path = "/opt/intel/mpi/#{intelmpi_version}" -intelmpi_installer = "l_mpi_oneapi_p_#{intelmpi_full_version}_offline.sh" +intelmpi_installer = "intel-mpi-#{intelmpi_full_version}_offline.sh" intelmpi_installer_path = "#{node['cluster']['sources_dir']}/#{intelmpi_installer}" intelmpi_installer_url = "#{node['cluster']['artifacts_s3_url']}/impi/#{intelmpi_installer}" intelmpi_qt_version = '6.5.3' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb index 5ca316daad..e4882101ad 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb @@ -13,34 +13,45 @@ # See the License for the specific language governing permissions and limitations under the License. 
action :install_package do - remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do - source "#{dcgm_url}" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing - end + packages_urls_list = if package_version.start_with?("3.") + [dcgm_package] + else + [dcgm4_core_package, dcgm4_package] + end + packages_urls_list.each do |package| + remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end - bash "Install #{dcgm_package}" do - user 'root' - cwd node['cluster']['sources_dir'] - code <<-DCGM_INSTALL - set -e - dpkg -i #{dcgm_package}-#{package_version}.deb - DCGM_INSTALL - retries 3 - retry_delay 5 + bash "Install #{package}" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-DCGM_INSTALL + set -e + dpkg -i #{package}-#{package_version}.deb + DCGM_INSTALL + retries 3 + retry_delay 5 + end end end -def dcgm_url - "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb" -end - def dcgm_package 'datacenter-gpu-manager' end +def dcgm4_package + "#{dcgm_package}-4-cuda12" +end + +def dcgm4_core_package + "#{dcgm_package}-4-core" +end + def arch_suffix arm_instance? ? 
'arm64' : 'amd64' end diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb index 997762acd1..c22f791e39 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb @@ -13,34 +13,47 @@ # See the License for the specific language governing permissions and limitations under the License. action :install_package do - remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do - source "#{dcgm_url}" - mode '0644' - retries 3 - retry_delay 5 - action :create_if_missing + if package_version.start_with?("3.") + packages_urls_list = [dcgm_package] + package_url_separator = "-" + else + packages_urls_list = [dcgm4_core_package, dcgm4_package] + package_url_separator = "." end + packages_urls_list.each do |package| + remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do + source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}#{package_url_separator}#{arch_suffix}.rpm" + mode '0644' + retries 3 + retry_delay 5 + action :create_if_missing + end - bash "Install #{dcgm_package}" do - user 'root' - cwd node['cluster']['sources_dir'] - code <<-DCGM_INSTALL - set -e - yum install -y #{dcgm_package}-#{package_version}.rpm - DCGM_INSTALL - retries 3 - retry_delay 5 + bash "Install #{package}" do + user 'root' + cwd node['cluster']['sources_dir'] + code <<-DCGM_INSTALL + set -e + yum install -y #{package}-#{package_version}.rpm + DCGM_INSTALL + retries 3 + retry_delay 5 + end end end -def dcgm_url - "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}-1-#{arch_suffix}.rpm" -end - def dcgm_package 'datacenter-gpu-manager' end +def 
dcgm4_package + "#{dcgm_package}-4-cuda12" +end + +def dcgm4_core_package + "#{dcgm_package}-4-core" +end + def arch_suffix arm_instance? ? 'aarch64' : 'x86_64' end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb index 297d1ae932..27a001ff05 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/cuda_spec.rb @@ -2,9 +2,9 @@ describe 'aws-parallelcluster-platform::cuda' do cached(:cuda_version) { '12.8' } - cached(:cuda_patch) { '0' } + cached(:cuda_patch) { '1' } cached(:cuda_complete_version) { "#{cuda_version}.#{cuda_patch}" } - cached(:cuda_version_suffix) { '570.86.10' } + cached(:cuda_version_suffix) { '570.124.06' } context 'when nvidia not enabled' do cached(:chef_run) do diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb index f74bc6fb08..1023c7ec45 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb @@ -20,8 +20,8 @@ end it 'fetches intel mpi installer script' do - is_expected.to create_remote_file("#{source_dir}/l_mpi_oneapi_p_2021.13.1.769_offline.sh").with( - source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/l_mpi_oneapi_p_2021.13.1.769_offline.sh", + is_expected.to create_remote_file("#{source_dir}/intel-mpi-2021.16.0.443_offline.sh").with( + source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/intel-mpi-2021.16.0.443_offline.sh", mode: '0744', retries: 3, retry_delay: 5 @@ -31,25 +31,25 @@ it 'installs intel mpi' do is_expected.to run_bash('install intel mpi').with( cwd: source_dir, - creates: '/opt/intel/mpi/2021.13' - 
).with_code(%r{chmod +x l_mpi_oneapi_p_2021.13.1.769_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel}) - .with_code(/rm -f l_mpi_oneapi_p_2021.13.1.769_offline.sh/) + creates: '/opt/intel/mpi/2021.16' + ).with_code(%r{chmod +x intel-mpi-2021.16.0.443_offline.sh --remove-extracted-files yes -a --silent --eula accept --install-dir /opt/intel}) + .with_code(/rm -f intel-mpi-2021.16.0.443_offline.sh/) end it 'appends intel module file dir to modules config' do is_expected.to append_to_config_modules('append intel modules file dir to modules conf') - .with_line('/opt/intel/mpi/2021.13/etc/modulefiles/') + .with_line('/opt/intel/mpi/2021.16/etc/modulefiles/') end it 'renames intel mpi module' do is_expected.to run_execute('rename intel mpi modules file name').with( - command: "mv /opt/intel/mpi/2021.13/etc/modulefiles/mpi /opt/intel/mpi/2021.13/etc/modulefiles/intelmpi", - creates: '/opt/intel/mpi/2021.13/etc/modulefiles/intelmpi' + command: "mv /opt/intel/mpi/2021.16/etc/modulefiles/mpi /opt/intel/mpi/2021.16/etc/modulefiles/intelmpi", + creates: '/opt/intel/mpi/2021.16/etc/modulefiles/intelmpi' ) end it 'adds Qt source file' do - is_expected.to create_template("/opt/intel/mpi/2021.13/qt_source_code.txt").with( + is_expected.to create_template("/opt/intel/mpi/2021.16/qt_source_code.txt").with( source: 'intel_mpi/qt_source_code.erb', owner: 'root', group: 'root', diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb index 3cf2779901..08e45803d1 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb @@ -170,7 +170,8 @@ def self.setup(chef_run, nvidia_enabled: nil) end else it 'installs datacenter gpu manager' do - is_expected.to run_bash('Install datacenter-gpu-manager') + is_expected.to 
run_bash('Install datacenter-gpu-manager-4-core') + is_expected.to run_bash('Install datacenter-gpu-manager-4-cuda12') end end end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb index 15ddf1c512..29d8179436 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb @@ -14,8 +14,13 @@ ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && (!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?)) end - - describe package('datacenter-gpu-manager') do - it { should be_installed } + if os_properties.alinux2? + describe package('datacenter-gpu-manager') do + it { should be_installed } + end + else + describe package('datacenter-gpu-manager-4-cuda12') do + it { should be_installed } + end end end diff --git a/cookbooks/aws-parallelcluster-shared/Berksfile.lock b/cookbooks/aws-parallelcluster-shared/Berksfile.lock new file mode 100644 index 0000000000..82b0b1131b --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/Berksfile.lock @@ -0,0 +1,15 @@ +DEPENDENCIES + aws-parallelcluster-shared + path: . 
+ metadata: true + yum + path: ../third-party/yum-7.4.20 + yum-epel + path: ../third-party/yum-epel-5.0.8 + +GRAPH + aws-parallelcluster-shared (3.13.0) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + yum (7.4.20) + yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb index 9af53e5702..03f07ed07d 100644 --- a/cookbooks/aws-parallelcluster-shared/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-shared/attributes/versions.rb @@ -1,5 +1,5 @@ # Python Version -default['cluster']['python-version'] = '3.12.8' +default['cluster']['python-version'] = '3.12.11' default['cluster']['python-major-minor-version'] = '3.12' if platform?('amazon') && node['platform_version'] == "2" default['cluster']['python-version'] = '3.9.20' diff --git a/cookbooks/aws-parallelcluster-slurm/Berksfile.lock b/cookbooks/aws-parallelcluster-slurm/Berksfile.lock new file mode 100644 index 0000000000..4f88206f51 --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/Berksfile.lock @@ -0,0 +1,57 @@ +DEPENDENCIES + aws-parallelcluster-computefleet + path: ../aws-parallelcluster-computefleet + aws-parallelcluster-environment + path: ../aws-parallelcluster-environment + aws-parallelcluster-platform + path: ../aws-parallelcluster-platform + aws-parallelcluster-shared + path: ../aws-parallelcluster-shared + aws-parallelcluster-slurm + path: . 
+ metadata: true + iptables + path: ../third-party/iptables-8.0.0 + line + path: ../third-party/line-4.5.21 + nfs + path: ../third-party/nfs-5.1.5 + openssh + path: ../third-party/openssh-2.11.14 + yum + path: ../third-party/yum-7.4.20 + yum-epel + path: ../third-party/yum-epel-5.0.8 + +GRAPH + aws-parallelcluster-computefleet (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + aws-parallelcluster-environment (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + aws-parallelcluster-platform (3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + line (~> 4.5.21) + aws-parallelcluster-shared (3.13.0) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + aws-parallelcluster-slurm (3.13.0) + aws-parallelcluster-computefleet (~> 3.13.0) + aws-parallelcluster-environment (~> 3.13.0) + aws-parallelcluster-platform (~> 3.13.0) + aws-parallelcluster-shared (~> 3.13.0) + iptables (~> 8.0.0) + line (~> 4.5.21) + nfs (~> 5.1.5) + openssh (~> 2.11.14) + yum (~> 7.4.20) + yum-epel (~> 5.0.8) + iptables (8.0.0) + line (4.5.21) + nfs (5.1.5) + line (>= 0.0.0) + openssh (2.11.14) + iptables (>= 7.0) + yum (7.4.20) + yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 79d0777c59..e14f0ae6f3 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,8 +1,8 @@ # Slurm -default['cluster']['slurm']['version'] = '24-11-5-1' +default['cluster']['slurm']['version'] = '24-11-6-1' default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' -default['cluster']['slurm']['sha256'] = 'e1a5547edd212c38b5e3230a284133f777b32746551f094aaa81cc4af375e332' +default['cluster']['slurm']['sha256'] = '282708483326f381eb001a14852a1a82e65e18f37b62b7a5f4936c0ed443b600' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge 
default['cluster']['munge']['munge_version'] = '0.5.16' From 883caea603876cc259ab4fb4b992afd58b668368 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 7 Aug 2025 15:30:09 -0400 Subject: [PATCH 2/2] [Build] Remove unnecessary Berksfiles. We removed the dependency on Berkshelf in https://github.com/aws/aws-parallelcluster-cookbook/pull/2989 --- .../Berksfile.lock | 84 ------------------- .../Berksfile.lock | 28 ------- .../Berksfile.lock | 23 ----- .../aws-parallelcluster-shared/Berksfile.lock | 15 ---- .../aws-parallelcluster-slurm/Berksfile.lock | 57 ------------- 5 files changed, 207 deletions(-) delete mode 100644 cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock delete mode 100644 cookbooks/aws-parallelcluster-environment/Berksfile.lock delete mode 100644 cookbooks/aws-parallelcluster-platform/Berksfile.lock delete mode 100644 cookbooks/aws-parallelcluster-shared/Berksfile.lock delete mode 100644 cookbooks/aws-parallelcluster-slurm/Berksfile.lock diff --git a/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock b/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock deleted file mode 100644 index 756da92317..0000000000 --- a/cookbooks/aws-parallelcluster-entrypoints/Berksfile.lock +++ /dev/null @@ -1,84 +0,0 @@ -DEPENDENCIES - aws-parallelcluster-awsbatch - path: ../aws-parallelcluster-awsbatch - aws-parallelcluster-computefleet - path: ../aws-parallelcluster-computefleet - aws-parallelcluster-entrypoints - path: . 
- metadata: true - aws-parallelcluster-environment - path: ../aws-parallelcluster-environment - aws-parallelcluster-platform - path: ../aws-parallelcluster-platform - aws-parallelcluster-shared - path: ../aws-parallelcluster-shared - aws-parallelcluster-slurm - path: ../aws-parallelcluster-slurm - aws-parallelcluster-tests - path: ../aws-parallelcluster-tests - iptables - path: ../third-party/iptables-8.0.0 - line - path: ../third-party/line-4.5.21 - nfs - path: ../third-party/nfs-5.1.5 - openssh - path: ../third-party/openssh-2.11.14 - yum - path: ../third-party/yum-7.4.20 - yum-epel - path: ../third-party/yum-epel-5.0.8 - -GRAPH - aws-parallelcluster-awsbatch (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - iptables (~> 8.0.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - openssh (~> 2.11.14) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - aws-parallelcluster-computefleet (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - aws-parallelcluster-entrypoints (3.13.0) - aws-parallelcluster-awsbatch (~> 3.13.0) - aws-parallelcluster-computefleet (~> 3.13.0) - aws-parallelcluster-environment (~> 3.13.0) - aws-parallelcluster-platform (~> 3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - aws-parallelcluster-slurm (~> 3.13.0) - aws-parallelcluster-environment (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - aws-parallelcluster-platform (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - aws-parallelcluster-shared (3.13.0) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - aws-parallelcluster-slurm (3.13.0) - aws-parallelcluster-computefleet (~> 3.13.0) - aws-parallelcluster-environment (~> 3.13.0) - aws-parallelcluster-platform (~> 3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - iptables (~> 8.0.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - openssh (~> 2.11.14) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - aws-parallelcluster-tests (3.13.0) - aws-parallelcluster-computefleet (~> 3.13.0) - aws-parallelcluster-environment (~> 3.13.0) - 
aws-parallelcluster-platform (~> 3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - aws-parallelcluster-slurm (~> 3.13.0) - iptables (8.0.0) - line (4.5.21) - nfs (5.1.5) - line (>= 0.0.0) - openssh (2.11.14) - iptables (>= 7.0) - yum (7.4.20) - yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-environment/Berksfile.lock b/cookbooks/aws-parallelcluster-environment/Berksfile.lock deleted file mode 100644 index ee9fbb6d3f..0000000000 --- a/cookbooks/aws-parallelcluster-environment/Berksfile.lock +++ /dev/null @@ -1,28 +0,0 @@ -DEPENDENCIES - aws-parallelcluster-environment - path: . - metadata: true - aws-parallelcluster-shared - path: ../aws-parallelcluster-shared - line - path: ../third-party/line-4.5.21 - nfs - path: ../third-party/nfs-5.1.5 - yum - path: ../third-party/yum-7.4.20 - yum-epel - path: ../third-party/yum-epel-5.0.8 - -GRAPH - aws-parallelcluster-environment (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - aws-parallelcluster-shared (3.13.0) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - line (4.5.21) - nfs (5.1.5) - line (>= 0.0.0) - yum (7.4.20) - yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-platform/Berksfile.lock b/cookbooks/aws-parallelcluster-platform/Berksfile.lock deleted file mode 100644 index 97e06da9a6..0000000000 --- a/cookbooks/aws-parallelcluster-platform/Berksfile.lock +++ /dev/null @@ -1,23 +0,0 @@ -DEPENDENCIES - aws-parallelcluster-platform - path: . 
- metadata: true - aws-parallelcluster-shared - path: ../aws-parallelcluster-shared - line - path: ../third-party/line-4.5.21 - yum - path: ../third-party/yum-7.4.20 - yum-epel - path: ../third-party/yum-epel-5.0.8 - -GRAPH - aws-parallelcluster-platform (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - aws-parallelcluster-shared (3.13.0) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - line (4.5.21) - yum (7.4.20) - yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-shared/Berksfile.lock b/cookbooks/aws-parallelcluster-shared/Berksfile.lock deleted file mode 100644 index 82b0b1131b..0000000000 --- a/cookbooks/aws-parallelcluster-shared/Berksfile.lock +++ /dev/null @@ -1,15 +0,0 @@ -DEPENDENCIES - aws-parallelcluster-shared - path: . - metadata: true - yum - path: ../third-party/yum-7.4.20 - yum-epel - path: ../third-party/yum-epel-5.0.8 - -GRAPH - aws-parallelcluster-shared (3.13.0) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - yum (7.4.20) - yum-epel (5.0.8) diff --git a/cookbooks/aws-parallelcluster-slurm/Berksfile.lock b/cookbooks/aws-parallelcluster-slurm/Berksfile.lock deleted file mode 100644 index 4f88206f51..0000000000 --- a/cookbooks/aws-parallelcluster-slurm/Berksfile.lock +++ /dev/null @@ -1,57 +0,0 @@ -DEPENDENCIES - aws-parallelcluster-computefleet - path: ../aws-parallelcluster-computefleet - aws-parallelcluster-environment - path: ../aws-parallelcluster-environment - aws-parallelcluster-platform - path: ../aws-parallelcluster-platform - aws-parallelcluster-shared - path: ../aws-parallelcluster-shared - aws-parallelcluster-slurm - path: . 
- metadata: true - iptables - path: ../third-party/iptables-8.0.0 - line - path: ../third-party/line-4.5.21 - nfs - path: ../third-party/nfs-5.1.5 - openssh - path: ../third-party/openssh-2.11.14 - yum - path: ../third-party/yum-7.4.20 - yum-epel - path: ../third-party/yum-epel-5.0.8 - -GRAPH - aws-parallelcluster-computefleet (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - aws-parallelcluster-environment (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - aws-parallelcluster-platform (3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - line (~> 4.5.21) - aws-parallelcluster-shared (3.13.0) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - aws-parallelcluster-slurm (3.13.0) - aws-parallelcluster-computefleet (~> 3.13.0) - aws-parallelcluster-environment (~> 3.13.0) - aws-parallelcluster-platform (~> 3.13.0) - aws-parallelcluster-shared (~> 3.13.0) - iptables (~> 8.0.0) - line (~> 4.5.21) - nfs (~> 5.1.5) - openssh (~> 2.11.14) - yum (~> 7.4.20) - yum-epel (~> 5.0.8) - iptables (8.0.0) - line (4.5.21) - nfs (5.1.5) - line (>= 0.0.0) - openssh (2.11.14) - iptables (>= 7.0) - yum (7.4.20) - yum-epel (5.0.8)