Skip to content

Commit 1b7fb9f

Browse files
gmarcianidreambeyondorangehimani2411
authored
On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel. (aws#2852)
* Refactor Changelog updates in priority order (aws#2848) * Update CHANGELOG.md to remove 'we' (aws#2851) * [nvidia] On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel. This is required because the NVIDIA driver must be compiled with the same gcc version that was used to compile the kernel. If this is not the case, the NVIDIA driver installation fails a compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, whereas gcc-11 is installed as the default version by build-essential, making this change necessary. Signed-off-by: Giacomo Marciani <[email protected]> * [CI] Use Cinc v23 to execute spec tests. Signed-off-by: Giacomo Marciani <[email protected]> --------- Signed-off-by: Giacomo Marciani <[email protected]> Co-authored-by: Ryan Anderson <[email protected]> Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent e479a86 commit 1b7fb9f

File tree

8 files changed

+142
-1
lines changed

8 files changed

+142
-1
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ jobs:
120120
with:
121121
omnitruckUrl: omnitruck.cinc.sh
122122
project: cinc-workstation
123+
version: 23
123124
- name: Run ChefSpec on ${{ matrix.cookbook }}
124125
run: |
125126
cd ${{ matrix.cookbook }}

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ aws-parallelcluster-cookbook CHANGELOG
22
======================================
33

44
This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
5+
3.13.0
6+
------
7+
8+
**CHANGES**
9+
- On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel.
510

611
3.12.0
712
------
@@ -26,6 +31,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2631
- Auto-restart slurmctld on failure.
2732
- Upgrade mysql-community-client to version 8.0.39.
2833

34+
2935
**BUG FIXES**
3036
- Fix retrieval of regions when managing volumes to correctly handle local zones.
3137
- Fix an issue where adding EFS filesystems with `AccessPointIds` during an update would fail.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
provides :nvidia_driver, platform: 'ubuntu' do |node|
16-
node['platform_version'].to_i >= 20
16+
node['platform_version'].to_i == 20
1717
end
1818

1919
use 'partial/_nvidia_driver_common.rb'
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :nvidia_driver implementation on Ubuntu nodes whose
# platform_version has 22 as its integer part (e.g. '22.04').
provides(:nvidia_driver, platform: 'ubuntu') { |node| node['platform_version'].to_i == 22 }
18+
19+
use 'partial/_nvidia_driver_common.rb'
20+
21+
# Hook for the shared NVIDIA driver partial: on Ubuntu 22 this always answers true.
# NOTE(review): presumably makes the common recipe rebuild the initramfs — confirm in the partial.
#
# @return [Boolean] always true
def rebuild_initramfs?
  true
end
24+
25+
# Hook for the shared NVIDIA driver partial: on Ubuntu 22 this always answers true.
# NOTE(review): presumably tells the common recipe to pass an explicit compiler
# (see compiler_path) to the driver build — confirm in the partial.
#
# @return [Boolean] always true
def set_compiler?
  true
end
28+
29+
# Hook for the shared NVIDIA driver partial: the compiler identifier for Ubuntu 22.
# NOTE(review): presumably used as the name of the package to install — confirm in the partial.
#
# @return [String] always 'gcc'
def compiler_version
  'gcc'
end
32+
33+
# Hook for the shared NVIDIA driver partial: Ubuntu 22 declares no additional packages.
#
# @return [Array<String>] always an empty array
def extra_packages
  []
end
36+
37+
def compiler_path
38+
gcc_major_version = gcc_major_version_used_by_kernel
39+
40+
# If the gcc version used to compile the kernel cannot be detected,
41+
# empty string is returned, meaning that the NVIDIA driver will be compiled
42+
# using the system default compiler.
43+
return "" if gcc_major_version.nil?
44+
45+
"CC=/usr/bin/gcc-#{gcc_major_version}"
46+
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil)
169169
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
170170
end
171171
cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
172+
cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
172173
cached(:chef_run) do
173174
stubs_for_resource('nvidia_driver') do |res|
174175
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
175176
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
176177
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
178+
allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
177179
end
178180

179181
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
@@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil)
245247
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
246248
.with_code(%r{rm -f /tmp/nvidia.run})
247249
end
250+
elsif platform == 'ubuntu' && version == '22.04'
251+
it 'installs gcc' do
252+
is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5)
253+
end
254+
255+
it 'creates dkms/nvidia.conf' do
256+
compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
257+
is_expected.to create_template('/etc/dkms/nvidia.conf').with(
258+
source: 'nvidia/amazon/dkms/nvidia.conf.erb',
259+
cookbook: 'aws-parallelcluster-platform',
260+
owner: 'root',
261+
group: 'root',
262+
mode: '0644',
263+
variables: { compiler_path: compiler_path }
264+
)
265+
end
266+
it 'installs nvidia driver' do
267+
compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
268+
is_expected.to run_bash('nvidia.run advanced')
269+
.with(
270+
user: 'root',
271+
group: 'root',
272+
cwd: '/tmp',
273+
creates: '/usr/bin/nvidia-smi'
274+
)
275+
.with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
276+
end
248277
else
249278
it "doesn't install gcc10" do
250279
is_expected.not_to install_package('gcc10')
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
6+
# License. A copy of the License is located at
7+
#
8+
# http://aws.amazon.com/apache2.0/
9+
#
10+
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
11+
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# Detects the gcc major version used to compile the running kernel, e.g. "12".
#
# The value is taken from /proc/version by extracting "gcc-<N>" tokens; if more
# than one token is present, only the first one is kept.
# (Tested only on Ubuntu)
#
# @return [String] the gcc major version, or an empty string if the version
#   cannot be detected (command failure or no gcc token found).
def gcc_major_version_used_by_kernel
  output = shell_out("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").stdout.strip
  # grep -o prints one match per line; keep a single version so callers never
  # receive a multi-line value.
  gcc_major_version = output.split.first.to_s

  if gcc_major_version.empty?
    Chef::Log.warn("Cannot detect gcc version used to compile the kernel: no gcc version found in /proc/version")
  else
    Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
  end

  gcc_major_version
rescue StandardError => e
  # Best effort: detection problems must not abort the run; callers treat an
  # empty string as "use the system default compiler".
  Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{e}")
  ""
end

cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
require 'chefspec'
22
require 'chefspec/berkshelf'
33

4+
# Chef::Mixin::ShellOut is required to mock shellout
5+
include Chef::Mixin::ShellOut
6+
47
RSpec.configure do |c|
58
c.before(:each) do
69
allow(File).to receive(:exist?).and_call_original
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
require_relative '../../../../libraries/ubuntu/helpers'
2+
require 'spec_helper'
3+
4+
# Unit tests for the gcc_major_version_used_by_kernel helper.
# Mixlib::ShellOut.new is stubbed for the detection command, so no real command is executed.
describe 'gcc_major_version_used_by_kernel' do
  let(:cmd) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" }
  let(:shellout) { double(run_command: nil, error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') }

  context 'when gcc version can be detected' do
    before do
      # The stubbed command output stands in for the detected major version.
      allow(shellout).to receive(:stdout).and_return("1")
      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
    end

    it 'returns the correct gcc major version' do
      expect(gcc_major_version_used_by_kernel).to eq("1")
    end
  end

  context 'when gcc version cannot be detected' do
    before do
      # Simulate a failing command by raising as soon as the shell-out object is created.
      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_raise(Mixlib::ShellOut::ShellCommandFailed)
    end

    it 'returns an empty string' do
      expect(gcc_major_version_used_by_kernel).to eq("")
    end
  end
end

0 commit comments

Comments
 (0)