Skip to content

Commit 1f051f4

Browse files
fix
Signed-off-by: Hanwen <[email protected]>
1 parent 4640d56 commit 1f051f4

File tree

6 files changed

+24
-13
lines changed

6 files changed

+24
-13
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,11 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2020
- Rdma-core: rdma-core-57.0-1
2121
- Open MPI: openmpi40-aws-4.1.7-2 and openmpi50-aws-5.0.6-11
2222
- Upgrade Cinc Client to version 18.4.12 from 18.2.7.
23+
- Upgrade NVIDIA driver to version 570.172.08 (from 570.86.15) for all OSs except AL2.
24+
- Upgrade CUDA Toolkit to version 12.8.1 (from 12.8.0) for all OSs except AL2.
25+
- Upgrade DCGM to version 4.2.3 (from 3.3.6) for all OSs except AL2.
26+
- Upgrade Python to 3.12.11 (from 3.12.8) for all OSs except AL2.
27+
- Upgrade Intel MPI Library to 2021.16.0 (from 2021.13.1).
2328
- Addressed cluster id mismatch known issue by deleting the file `/var/spool/slurm.state/clustername` before configuring Slurm accounting.
2429
- Upgrade DCV to version 2024.0-19030.
2530
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,12 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
if package_version.start_with?("3.")
17-
packages_urls_list = [dcgm_package]
18-
else
19-
packages_urls_list = [dcgm4_core_package, dcgm4_package]
20-
end
21-
for package in packages_urls_list
16+
packages_urls_list = if package_version.start_with?("3.")
17+
[dcgm_package]
18+
else
19+
[dcgm4_core_package, dcgm4_package]
20+
end
21+
packages_urls_list.each do |package|
2222
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do
2323
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb"
2424
mode '0644'

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
packages_urls_list = [dcgm4_core_package, dcgm4_package]
2121
package_url_separator = "."
2222
end
23-
for package in packages_urls_list
23+
packages_urls_list.each do |package|
2424
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do
2525
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}#{package_url_separator}#{arch_suffix}.rpm"
2626
mode '0644'

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/intel_mpi_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
end
2121

2222
it 'fetches intel mpi installer script' do
23-
is_expected.to create_remote_file("#{source_dir}/intel-mpi-2021.13.1.769_offline.sh").with(
23+
is_expected.to create_remote_file("#{source_dir}/intel-mpi-2021.16.0.443_offline.sh").with(
2424
source: "https://#{aws_region}-aws-parallelcluster.s3.#{aws_region}.test_aws_domain/archives/impi/intel-mpi-2021.16.0.443_offline.sh",
2525
mode: '0744',
2626
retries: 3,
@@ -43,7 +43,7 @@
4343

4444
it 'renames intel mpi module' do
4545
is_expected.to run_execute('rename intel mpi modules file name').with(
46-
command: "mv /opt/intel/mpi/2021.16/etc/modulefiles/mpi /opt/intel/mpi/2021.13/etc/modulefiles/intelmpi",
46+
command: "mv /opt/intel/mpi/2021.16/etc/modulefiles/mpi /opt/intel/mpi/2021.16/etc/modulefiles/intelmpi",
4747
creates: '/opt/intel/mpi/2021.16/etc/modulefiles/intelmpi'
4848
)
4949
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,8 @@ def self.setup(chef_run, nvidia_enabled: nil)
170170
end
171171
else
172172
it 'installs datacenter gpu manager' do
173-
is_expected.to run_bash('Install datacenter-gpu-manager')
173+
is_expected.to run_bash('Install datacenter-gpu-manager-4-core')
174+
is_expected.to run_bash('Install datacenter-gpu-manager-4-cuda12')
174175
end
175176
end
176177
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,13 @@
1414
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? &&
1515
(!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?))
1616
end
17-
18-
describe package('datacenter-gpu-manager-4-cuda12') do
19-
it { should be_installed }
17+
if os_properties.alinux2?
18+
describe package('datacenter-gpu-manager') do
19+
it { should be_installed }
20+
end
21+
else
22+
describe package('datacenter-gpu-manager-4-cuda12') do
23+
it { should be_installed }
24+
end
2025
end
2126
end

0 commit comments

Comments
 (0)