Skip to content

Commit 0b03399

Browse files
committed
[nvidia] On Ubuntu 22.04, install the NVIDIA driver using the gcc version used to compile the kernel.
This is required because the NVIDIA driver must be compiled with the same gcc version that was used to compile the kernel. If this is not the case, the NVIDIA driver installation fails the compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, whereas gcc-11 is installed as the default version by build-essential, making this change necessary. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent f3219fb commit 0b03399

File tree

8 files changed

+127
-2
lines changed

8 files changed

+127
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
2525
- Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5`
2626
- Auto-restart slurmctld on failure.
2727
- Upgrade mysql-community-client to version 8.0.39.
28+
- On Ubuntu 22.04, install the NVIDIA driver with the same compiler version used to compile the kernel.
2829

2930
**BUG FIXES**
3031
- Fix retrieval of regions when managing volumes to correctly handle local zones.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
provides :nvidia_driver, platform: 'ubuntu' do |node|
16-
node['platform_version'].to_i >= 20
16+
node['platform_version'].to_i == 20
1717
end
1818

1919
use 'partial/_nvidia_driver_common.rb'
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
provides :nvidia_driver, platform: 'ubuntu' do |node|
16+
node['platform_version'].to_i == 22
17+
end
18+
19+
use 'partial/_nvidia_driver_common.rb'
20+
21+
def rebuild_initramfs?
22+
true
23+
end
24+
25+
def set_compiler?
26+
true
27+
end
28+
29+
def compiler_version
30+
'gcc'
31+
end
32+
33+
def extra_packages
34+
%w()
35+
end
36+
37+
# Builds the compiler override used when compiling the NVIDIA driver,
# e.g. "CC=/usr/bin/gcc-12", from the gcc major version reported by
# gcc_major_version_used_by_kernel.
#
# Returns an empty string when that version is not available, meaning that
# the NVIDIA driver will be compiled using the system default compiler.
def compiler_path
  gcc_major_version = gcc_major_version_used_by_kernel.to_s.strip

  # A failed detection may be reported as nil or as an empty string:
  # both must fall back to the default compiler, otherwise the override
  # would point to the non-existent path "/usr/bin/gcc-".
  return "" if gcc_major_version.empty?

  "CC=/usr/bin/gcc-#{gcc_major_version}"
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil)
169169
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
170170
end
171171
cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
172+
cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
172173
cached(:chef_run) do
173174
stubs_for_resource('nvidia_driver') do |res|
174175
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
175176
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
176177
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
178+
allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
177179
end
178180

179181
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
@@ -245,6 +247,33 @@ def self.setup(chef_run, nvidia_driver_version: nil)
245247
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
246248
.with_code(%r{rm -f /tmp/nvidia.run})
247249
end
250+
elsif platform == 'ubuntu' && version == '22.04'
251+
it 'installs gcc' do
252+
is_expected.to install_package('gcc').with_retries(10).with_retry_delay(5)
253+
end
254+
255+
it 'creates dkms/nvidia.conf' do
256+
compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
257+
is_expected.to create_template('/etc/dkms/nvidia.conf').with(
258+
source: 'nvidia/amazon/dkms/nvidia.conf.erb',
259+
cookbook: 'aws-parallelcluster-platform',
260+
owner: 'root',
261+
group: 'root',
262+
mode: '0644',
263+
variables: { compiler_path: compiler_path }
264+
)
265+
end
266+
it 'installs nvidia driver' do
267+
compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
268+
is_expected.to run_bash('nvidia.run advanced')
269+
.with(
270+
user: 'root',
271+
group: 'root',
272+
cwd: '/tmp',
273+
creates: '/usr/bin/nvidia-smi'
274+
)
275+
.with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
276+
end
248277
else
249278
it "doesn't install gcc10" do
250279
is_expected.not_to install_package('gcc10')

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,4 @@ def wait_sync_file(path)
105105
retry_delay 10
106106
timeout 5
107107
end
108-
end
108+
end
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Detects the major version of the gcc used to compile the kernel, e.g. "12",
# by extracting the first 'gcc-<N>' token found in /proc/version.
# (Tested only on Ubuntu)
#
# @return [String] the gcc major version, or an empty string if the version
#   cannot be detected (the command raised, or no 'gcc-<N>' token was found).
def gcc_major_version_used_by_kernel
  begin
    output = Mixlib::ShellOut.new("cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2").run_command.stdout
  rescue => error
    Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}")
    return ""
  end

  # grep -o prints one line per match: keep only the first one,
  # so that the returned value is always a single version.
  gcc_major_version = output.to_s.lines.first.to_s.strip

  # No match is not an exception, so it must be reported explicitly
  # rather than logged as a successfully detected (empty) version.
  if gcc_major_version.empty?
    Chef::Log.warn("Cannot detect gcc version used to compile the kernel: no gcc version found in /proc/version")
    return ""
  end

  Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
  gcc_major_version
end

cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
require 'chefspec'
22
require 'chefspec/berkshelf'
33

4+
# Chef::Mixin::ShellOut is required to mock shellout
5+
include Chef::Mixin::ShellOut
6+
47
RSpec.configure do |c|
58
c.before(:each) do
69
allow(File).to receive(:exist?).and_call_original
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
require_relative '../../../../libraries/ubuntu/helpers'
2+
require 'spec_helper'
3+
4+
describe 'gcc_major_version_used_by_kernel' do
5+
let(:cmd) { "cat /proc/version | grep -Eo 'gcc-[0-9]+' | cut -d '-' -f 2" }
6+
let(:shellout) { double(run_command: nil) }
7+
let(:shellout_execution) { double(error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') }
8+
9+
context 'when gcc version can be detected' do
10+
before do
11+
allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
12+
allow(shellout).to receive(:run_command).and_return(shellout_execution)
13+
allow(shellout_execution).to receive(:stdout).and_return("1")
14+
end
15+
16+
it 'returns the correct gcc major version' do
17+
result = gcc_major_version_used_by_kernel
18+
expect(result).to eq("1")
19+
end
20+
end
21+
22+
context 'when gcc version cannot be detected' do
23+
before do
24+
allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
25+
allow(shellout).to receive(:run_command).and_raise(Mixlib::ShellOut::ShellCommandFailed)
26+
end
27+
28+
it 'returns an empty string' do
29+
result = gcc_major_version_used_by_kernel
30+
expect(result).to eq("")
31+
end
32+
end
33+
end

0 commit comments

Comments
 (0)