Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ jobs:
with:
omnitruckUrl: omnitruck.cinc.sh
project: cinc-workstation
version: 23
- name: Run ChefSpec on ${{ matrix.cookbook }}
run: |
cd ${{ matrix.cookbook }}
Expand Down
7 changes: 4 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Extend Amazon DCV support to Ubuntu2204 on ARM instances.

**CHANGES**
- Upgrade mysql-community-client to version 8.0.39.
- Upgrade NVIDIA driver to version 550.127.08 (from 550.90.07). This addresses [a known issue from NVIDIA](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html#known-issues).
- Upgrade Amazon DCV to version `2024.0-18131`.
- server: `2024.0-18131-1`
- xdcv: `2024.0.631-1`
Expand All @@ -23,11 +23,12 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Libfabric-aws: `libfabric-aws-1.22.0-1`
- Rdma-core: `rdma-core-54.0-1`
- Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5`
- Upgrade NVIDIA driver to version 550.127.08 (from 550.90.07). This addresses [a known issue from NVIDIA](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html#known-issues).
- Auto-restart slurmctld on failure.
- Upgrade mysql-community-client to version 8.0.39.
- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.

**BUG FIXES**
- Fix an issue in the way we get region when manage volumes so that it can correctly handle local zone.
- Fix retrieval of regions when managing volumes to correctly handle local zones.
- Fix an issue where adding EFS filesystems with `AccessPointIds` during an update would fail.

3.11.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and limitations under the License.

# Registers this resource as the nvidia_driver implementation for Ubuntu 20 only.
# String#to_i keeps the leading integer of the platform version (e.g. '20.04' -> 20).
provides :nvidia_driver, platform: 'ubuntu' do |node|
  node['platform_version'].to_i == 20
end

use 'partial/_nvidia_driver_common.rb'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# frozen_string_literal: true

# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

# Registers this resource as the nvidia_driver implementation for Ubuntu 22.
# String#to_i keeps the leading integer of the platform version (e.g. '22.04' -> 22).
# Parentheses are required so the brace block binds to `provides`, not to the last argument.
provides(:nvidia_driver, platform: 'ubuntu') { |node| node['platform_version'].to_i == 22 }

use 'partial/_nvidia_driver_common.rb'

# Platform hook: always true for this Ubuntu 22 resource.
# NOTE(review): presumably consulted by partial/_nvidia_driver_common.rb to decide
# whether the initramfs is rebuilt after blacklisting nouveau — confirm in the partial.
#
# @return [Boolean] true
def rebuild_initramfs?
  true
end

# Platform hook: always true for this Ubuntu 22 resource.
# NOTE(review): presumably tells partial/_nvidia_driver_common.rb to install the compiler
# package and pass `compiler_path` to the driver build — confirm in the partial.
#
# @return [Boolean] true
def set_compiler?
  true
end

# Platform hook returning the compiler identifier for this Ubuntu 22 resource.
# NOTE(review): presumably used as the name of the package to install (the ChefSpec
# for Ubuntu 22.04 expects package 'gcc' to be installed) — confirm in the partial.
#
# @return [String] 'gcc'
def compiler_version
  'gcc'
end

# Platform hook: this Ubuntu 22 resource declares no additional packages.
#
# @return [Array<String>] always an empty array
def extra_packages
  []
end

# Builds the `CC=...` override used to compile the NVIDIA driver with the same
# gcc major version that was used to compile the running kernel.
#
# @return [String] "CC=/usr/bin/gcc-<major>" when the kernel compiler version is known,
#   or "" when it is not, meaning the driver is compiled with the system default compiler.
def compiler_path
  # The detection helper reports failure with an empty string (its documentation mentions nil),
  # so both nil and blank values are treated as "not detected". Checking only for nil would
  # produce the invalid path "CC=/usr/bin/gcc-".
  gcc_major_version = gcc_major_version_used_by_kernel.to_s.strip
  return '' if gcc_major_version.empty?

  "CC=/usr/bin/gcc-#{gcc_major_version}"
end
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil)
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
end
cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
cached(:chef_run) do
stubs_for_resource('nvidia_driver') do |res|
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
end

stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
Expand Down Expand Up @@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil)
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
.with_code(%r{rm -f /tmp/nvidia.run})
end
elsif platform == 'ubuntu' && version == '22.04'
# On Ubuntu 22.04 the gcc package must be installed, with the retry settings asserted below.
it 'installs gcc' do
  expect(subject).to install_package('gcc').with_retries(10).with_retry_delay(5)
end

# The DKMS configuration must be rendered with a CC override that points at the
# gcc version reported by the (stubbed) kernel compiler detection.
it 'creates dkms/nvidia.conf' do
  expected_variables = { compiler_path: "CC=/usr/bin/gcc-#{kernel_compiler_version}" }

  is_expected.to create_template('/etc/dkms/nvidia.conf').with(
    source: 'nvidia/amazon/dkms/nvidia.conf.erb',
    cookbook: 'aws-parallelcluster-platform',
    owner: 'root',
    group: 'root',
    mode: '0644',
    variables: expected_variables
  )
end
# The installer must be invoked with the CC override built from the (stubbed)
# kernel compiler version, and must be guarded by the presence of nvidia-smi.
it 'installs nvidia driver' do
  cc_override = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
  expected_properties = {
    user: 'root',
    group: 'root',
    cwd: '/tmp',
    creates: '/usr/bin/nvidia-smi',
  }

  is_expected.to run_bash('nvidia.run advanced')
    .with(expected_properties)
    .with_code(%r{#{cc_override} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
end
else
it "doesn't install gcc10" do
is_expected.not_to install_package('gcc10')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# frozen_string_literal: true

# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Detects the major version of the gcc used to compile the running kernel (e.g. "12")
# by looking for a `gcc-<N>` token in /proc/version. (Tested only on Ubuntu.)
#
# @return [String] the gcc major version, or "" when it cannot be detected, either because
#   the command raised or because /proc/version contains no `gcc-<N>` token.
def gcc_major_version_used_by_kernel
  begin
    output = shell_out("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").stdout
  rescue StandardError => error
    Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}")
    return ''
  end

  # grep -o prints one line per match; only the first one is meaningful, and extra lines
  # would otherwise leak into the compiler path built by callers.
  gcc_major_version = output.to_s.lines.first.to_s.strip

  if gcc_major_version.empty?
    # Not an error: some kernels report the compiler without a "gcc-<N>" token.
    Chef::Log.warn('Cannot detect gcc version used to compile the kernel: no gcc-<N> token found in /proc/version')
    return ''
  end

  Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
  gcc_major_version
end
3 changes: 3 additions & 0 deletions cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
require 'chefspec'
require 'chefspec/berkshelf'

# Chef::Mixin::ShellOut is required to mock shellout
include Chef::Mixin::ShellOut

RSpec.configure do |c|
c.before(:each) do
allow(File).to receive(:exist?).and_call_original
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
require_relative '../../../../libraries/ubuntu/helpers'
require 'spec_helper'

# Unit tests for the kernel compiler detection helper. Mixlib::ShellOut is stubbed,
# so no command is actually executed.
describe 'gcc_major_version_used_by_kernel' do
  # Exact shell pipeline the helper is expected to run.
  let(:detection_command) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" }
  # Stand-in for the Mixlib::ShellOut instance created by shell_out.
  let(:fake_shellout) do
    double(run_command: nil, error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '')
  end

  context 'when gcc version can be detected' do
    before do
      allow(fake_shellout).to receive(:stdout).and_return("1")
      allow(Mixlib::ShellOut).to receive(:new).with(detection_command, any_args).and_return(fake_shellout)
    end

    it 'returns the correct gcc major version' do
      expect(gcc_major_version_used_by_kernel).to eq("1")
    end
  end

  context 'when gcc version cannot be detected' do
    before do
      # Simulate a failure while setting up the command.
      allow(Mixlib::ShellOut).to receive(:new)
        .with(detection_command, any_args)
        .and_raise(Mixlib::ShellOut::ShellCommandFailed)
    end

    it 'returns an empty string' do
      expect(gcc_major_version_used_by_kernel).to eq("")
    end
  end
end
Loading