Skip to content

Commit 48f8bc8

Browse files
committed
[nvidia] On Ubuntu22, install the NVIDIA driver using the gcc version used to compile the kernel.
This is required because the NVIDIA driver must be compiled with the same gcc version that was used to compile the kernel. If it is not, the NVIDIA driver installation fails the compiler version check. On newer versions of Ubuntu 22.04 (kernel 6.8+), the kernel is compiled with gcc-12, whereas gcc-11 is installed as the default version by build-essential, making this change necessary. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent f3219fb commit 48f8bc8

File tree

6 files changed

+98
-1
lines changed

6 files changed

+98
-1
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
provides :nvidia_driver, platform: 'ubuntu' do |node|
16-
node['platform_version'].to_i >= 20
16+
node['platform_version'].to_i == 20
1717
end
1818

1919
use 'partial/_nvidia_driver_common.rb'
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Registers this file as the :nvidia_driver implementation for Ubuntu nodes
# whose major platform version is exactly 22.
provides(:nvidia_driver, platform: 'ubuntu') do |node|
  major_version = node['platform_version'].to_i
  major_version == 22
end
18+
19+
use 'partial/_nvidia_driver_common.rb'
20+
21+
# Always true for this platform.
# NOTE(review): presumably consulted by the shared nvidia_driver partial to
# decide whether the initramfs is regenerated -- confirm against the partial.
#
# @return [Boolean] always true
def rebuild_initramfs?
  true
end
24+
25+
# Builds the compiler override prepended to the NVIDIA installer command.
#
# The NVIDIA driver has to be compiled with the same gcc major version that
# was used to compile the kernel, so the override points at that gcc binary.
#
# @return [String] "CC=/usr/bin/gcc-<major>" when the kernel's gcc version is
#   known, or "" so that the system default compiler is used.
def compiler_path
  # to_s covers a nil result, strip covers whitespace-only output.
  gcc_major_version = gcc_major_version_used_by_kernel.to_s.strip

  # A failed detection is reported as an empty string (or nil). Checking only
  # for nil would produce the unusable override "CC=/usr/bin/gcc-".
  return "" if gcc_major_version.empty?

  "CC=/usr/bin/gcc-#{gcc_major_version}"
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,11 +169,13 @@ def self.setup(chef_run, nvidia_driver_version: nil)
169169
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
170170
end
171171
cached(:nvidia_driver_url) { "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_driver/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
172+
cached(:kernel_compiler_version) { "KERNEL_COMPILER_VERSION" }
172173
cached(:chef_run) do
173174
stubs_for_resource('nvidia_driver') do |res|
174175
allow(res).to receive(:nvidia_driver_enabled?).and_return(true)
175176
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
176177
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
178+
allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
177179
end
178180

179181
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
@@ -245,6 +247,18 @@ def self.setup(chef_run, nvidia_driver_version: nil)
245247
.with_code(%r{CC=/usr/bin/gcc10-gcc ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
246248
.with_code(%r{rm -f /tmp/nvidia.run})
247249
end
250+
elsif platform == 'ubuntu' && version == '22.04'
251+
it 'installs nvidia driver' do
252+
compiler_path = "CC=/usr/bin/gcc-#{kernel_compiler_version}"
253+
is_expected.to run_bash('nvidia.run advanced')
254+
.with(
255+
user: 'root',
256+
group: 'root',
257+
cwd: '/tmp',
258+
creates: '/usr/bin/nvidia-smi'
259+
)
260+
.with_code(%r{#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{kernel_module}})
261+
end
248262
else
249263
it "doesn't install gcc10" do
250264
is_expected.not_to install_package('gcc10')

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,16 @@ def wait_sync_file(path)
106106
timeout 5
107107
end
108108
end
109+
110+
# Detects the major version of the gcc used to compile the kernel, e.g. "12".
#
# The value is taken from the 8th whitespace-separated field of /proc/version,
# with commas removed and everything after the first dot dropped.
#
# @return [String] the detected major version, or "" when it cannot be
#   detected (the command raised or produced no output). Callers must treat
#   an empty string as "unknown".
def gcc_major_version_used_by_kernel
  gcc_major_version = shell_out("awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' -f 1").stdout.strip

  if gcc_major_version.empty?
    # Do not report an empty result as a successful detection.
    Chef::Log.warn("Cannot detect gcc version used to compile the kernel: no version found in /proc/version")
  else
    Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_major_version}")
  end

  gcc_major_version
rescue => error
  Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{error}")
  ""
end

cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
require 'chefspec'
22
require 'chefspec/berkshelf'
33

4+
# Chef::Mixin::ShellOut is required to mock shellout
5+
include Chef::Mixin::ShellOut
6+
47
RSpec.configure do |c|
58
c.before(:each) do
69
allow(File).to receive(:exist?).and_call_original
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
require_relative '../../../libraries/helpers'
2+
require 'spec_helper'
3+
4+
describe 'gcc_major_version_used_by_kernel' do
  # Shell pipeline the helper is expected to run: 8th field of /proc/version,
  # commas stripped, major component only.
  let(:cmd) { "awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' -f 1" }
  let(:shellout) { double(run_command: nil, error!: nil, stdout: '', stderr: '', exitstatus: 0, live_stream: '') }

  context 'when gcc version can be detected' do
    before do
      # The command succeeds and prints a version on stdout.
      allow(shellout).to receive(:stdout).and_return("1")
      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_return(shellout)
    end

    it 'returns the correct gcc major version' do
      expect(gcc_major_version_used_by_kernel).to eq("1")
    end
  end

  context 'when gcc version cannot be detected' do
    before do
      # Building the shell command raises, so the helper takes its rescue path.
      allow(Mixlib::ShellOut).to receive(:new).with(cmd, any_args).and_raise(Mixlib::ShellOut::ShellCommandFailed)
    end

    it 'returns an empty string' do
      expect(gcc_major_version_used_by_kernel).to eq("")
    end
  end
end

0 commit comments

Comments
 (0)