From 40c2d540db5b8ba54056161324abe812612b85e5 Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 10 Dec 2024 09:12:11 -0500 Subject: [PATCH 1/4] Refactor Changelog updates in priority order (#2848) --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6096716fcf..5b1402dc3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Extend Amazon DCV support to Ubuntu2204 on ARM instances. **CHANGES** -- Upgrade mysql-community-client to version 8.0.39. +- Upgrade NVIDIA driver to version 550.127.08 (from 550.90.07). This addresses [a known issue from Nivdia](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html#known-issues). - Upgrade Amazon DCV to version `2024.0-18131`. - server: `2024.0-18131-1` - xdcv: `2024.0.631-1` @@ -23,8 +23,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Libfabric-aws: `libfabric-aws-1.22.0-1` - Rdma-core: `rdma-core-54.0-1` - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5` -- Upgrade NVIDIA driver to version 550.127.08 (from 550.90.07). This addresses [a known issue from Nivdia](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html#known-issues). - Auto-restart slurmctld on failure. +- Upgrade mysql-community-client to version 8.0.39. **BUG FIXES** - Fix an issue in the way we get region when manage volumes so that it can correctly handle local zone. 
From f3219fb9631e9a161f2e4a3ffc98d055458554ae Mon Sep 17 00:00:00 2001 From: Ryan Anderson Date: Tue, 10 Dec 2024 15:53:58 -0500 Subject: [PATCH 2/4] Update CHANGELOG.md to remove 'we' (#2851) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b1402dc3d..3623a2afce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade mysql-community-client to version 8.0.39. **BUG FIXES** -- Fix an issue in the way we get region when manage volumes so that it can correctly handle local zone. +- Fix retrieval of regions when managing volumes to correctly handle local zones. - Fix an issue where adding EFS filesystems with `AccessPointIds` during an update would fail. 3.11.1 From a0ea069657eea1625beb06f2837a89792e6e3fb7 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 11 Dec 2024 13:40:58 +0100 Subject: [PATCH 3/4] [nvidia] On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel. This is required because the NVIDIA driver must be compiled with the same gcc version that was used to compile the kernel. Otherwise, the NVIDIA driver installation fails a compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, but gcc-11 is installed as the default version by build-essential, making this change necessary. 
Signed-off-by: Giacomo Marciani --- CHANGELOG.md | 1 + ...ubuntu20+.rb => nvidia_driver_ubuntu20.rb} | 2 +- .../nvidia_driver/nvidia_driver_ubuntu22.rb | 46 +++++++++++++++++++ .../spec/unit/resources/nvidia_driver_spec.rb | 29 ++++++++++++ .../libraries/ubuntu/helpers.rb | 26 +++++++++++ .../spec/spec_helper.rb | 3 ++ .../unit/libraries/ubuntu/helpers_spec.rb | 30 ++++++++++++ 7 files changed, 136 insertions(+), 1 deletion(-) rename cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/{nvidia_driver_ubuntu20+.rb => nvidia_driver_ubuntu20.rb} (95%) create mode 100644 cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb create mode 100644 cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb create mode 100644 cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 3623a2afce..5ac5cb7bfa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5` - Auto-restart slurmctld on failure. - Upgrade mysql-community-client to version 8.0.39. +- On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel. **BUG FIXES** - Fix retrieval of regions when managing volumes to correctly handle local zones. 
diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb similarity index 95% rename from cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb rename to cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb index 1043206fcc..9e71464f99 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20+.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu20.rb @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and limitations under the License. provides :nvidia_driver, platform: 'ubuntu' do |node| - node['platform_version'].to_i >= 20 + node['platform_version'].to_i == 20 end use 'partial/_nvidia_driver_common.rb' diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb new file mode 100644 index 0000000000..b075c75f37 --- /dev/null +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/nvidia_driver_ubuntu22.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +provides :nvidia_driver, platform: 'ubuntu' do |node| + node['platform_version'].to_i == 22 +end + +use 'partial/_nvidia_driver_common.rb' + +def rebuild_initramfs? + true +end + +def set_compiler? + true +end + +def compiler_version + 'gcc' +end + +def extra_packages + %w() +end + +def compiler_path + gcc_major_version = gcc_major_version_used_by_kernel + + # If the gcc version used to compile the kernel cannot be detected, + # empty string is returned, meaning that the NVIDIA driver will be compiled + # using the system default compiler. + return "" if gcc_major_version.nil? + + "CC=/usr/bin/gcc-#{gcc_major_version}" +end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index 741fddbcc9..2c77cdda36 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil) cached(:nvidia_driver_version) { 'nvidia_driver_version' } end cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" } + cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" } cached(:chef_run) do stubs_for_resource('nvidia_driver') do |res| allow(res).to receive(:nvidia_driver_enabled?).and_return(true) allow(res).to receive(:nvidia_arch).and_return(nvidia_arch) allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module) + allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version) end stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true) @@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil) .with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau 
--no-cc-version-check -m=#{kernel_module}}) .with_code(%r{rm -f /tmp/nvidia.run}) end + elsif platform == 'ubuntu' && version == '22.04' + it 'installs gcc' do + is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5) + end + + it 'creates dkms/nvidia.conf' do + compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}" + is_expected.to create_template('/etc/dkms/nvidia.conf').with( + source: 'nvidia/amazon/dkms/nvidia.conf.erb', + cookbook: 'aws-parallelcluster-platform', + owner: 'root', + group: 'root', + mode: '0644', + variables: { compiler_path: compiler_path } + ) + end + it 'installs nvidia driver' do + compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}" + is_expected.to run_bash('nvidia.run advanced') + .with( + user: 'root', + group: 'root', + cwd: '/tmp', + creates: '/usr/bin/nvidia-smi' + ) + .with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}}) + end else it "doesn't install gcc10" do is_expected.not_to install_package('gcc10') diff --git a/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb new file mode 100644 index 0000000000..17c5a150ac --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/libraries/ubuntu/helpers.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the +# License. A copy of the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and +# limitations under the License. 
+ +def gcc_major_version_used_by_kernel + # Detects the gcc major version used to compile the kernel, e.g. "12". + # If the version cannot be detected, an empty string is returned. + # (Tested only on Ubuntu) + begin + gcc_major_version = shell_out("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").stdout.strip + rescue => error + Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}") + return "" + end + Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}") + gcc_major_version +end diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb index 849b190097..ff6af2da76 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb @@ -1,6 +1,9 @@ require 'chefspec' require 'chefspec/berkshelf' +# Chef::Mixin::ShellOut is required to mock shellout +include Chef::Mixin::ShellOut + RSpec.configure do |c| c.before(:each) do allow(File).to receive(:exist?).and_call_original diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb new file mode 100644 index 0000000000..6788256adc --- /dev/null +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/libraries/ubuntu/helpers_spec.rb @@ -0,0 +1,30 @@ +require_relative '../../../../libraries/ubuntu/helpers' +require 'spec_helper' + +describe 'gcc_major_version_used_by_kernel' do + let(:cmd) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" } + let(:shellout) { double(run_command: nil, error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') } + + context 'when gcc version can be detected' do + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout) + allow(shellout).to receive(:stdout).and_return("1") + end + + it 'returns the correct gcc major 
version' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("1") + end + end + + context 'when gcc version cannot be detected' do + before do + allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_raise(Mixlib::ShellOut::ShellCommandFailed) + end + + it 'returns an empty string' do + result = gcc_major_version_used_by_kernel + expect(result).to eq("") + end + end +end From 485417c472c3b61b02285a20ff2973c80e4fb336 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 13 Dec 2024 16:10:25 +0100 Subject: [PATCH 4/4] [CI] Use Cinc v23 to execute spec tests. Signed-off-by: Giacomo Marciani --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 994c127a40..6a9b7c216f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -120,6 +120,7 @@ jobs: with: omnitruckUrl: omnitruck.cinc.sh project: cinc-workstation + version: 23 - name: Run ChefSpec on ${{ matrix.cookbook }} run: | cd ${{ matrix.cookbook }}