Skip to content

Commit 87e7970

Browse files
committed
DONOTMERGE Force NVIDIA driver to use the same gcc version used to compile the kernel
Signed-off-by: Giacomo Marciani <[email protected]>
1 parent f3219fb commit 87e7970

File tree

4 files changed

+62
-3
lines changed

4 files changed

+62
-3
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
provides :nvidia_driver, platform: 'ubuntu' do |node|
16-
node['platform_version'].to_i >= 20
16+
node['platform_version'].to_i == 20
1717
end
1818

1919
use 'partial/_nvidia_driver_common.rb'
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# frozen_string_literal: true
2+
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License").
6+
# You may not use this file except in compliance with the License.
7+
# A copy of the License is located at
8+
#
9+
# http://aws.amazon.com/apache2.0/
10+
#
11+
# or in the "LICENSE.txt" file accompanying this file.
12+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
13+
# See the License for the specific language governing permissions and limitations under the License.
14+
15+
# Register this file as the :nvidia_driver implementation for Ubuntu nodes whose
# platform_version has 22 as its integer part (e.g. "22.04").
provides :nvidia_driver, platform: 'ubuntu' do |node|
  node['platform_version'].to_i == 22
end

# Load the shared nvidia_driver partial; the methods defined below in this file
# presumably override its defaults — confirm against the partial.
use 'partial/_nvidia_driver_common.rb'
20+
21+
# def set_compiler?
22+
# # Ubuntu 22.04 with kernel 6.8.x needs CC set to /usr/bin/gcc-12 through a dkms override
23+
# node['kernel']['release'].split('.')[0].to_i == 6
24+
# node['kernel']['release'].split('.')[1].to_i == 8
25+
# end
26+
27+
# Whether the initramfs has to be rebuilt for this platform.
#
# NOTE(review): presumably consulted by the shared nvidia_driver partial after the
# driver installation; the caller is not visible in this file — confirm.
#
# @return [Boolean] always true for this resource
def rebuild_initramfs?
  true
end
30+
31+
# def compiler_version
32+
# 'gcc'
33+
# end
34+
35+
# Builds the compiler override placed in front of the NVIDIA installer command.
#
# Asks get_gcc_version_used_by_kernel for the gcc version the running kernel was
# compiled with and turns its major component into a "CC=/usr/bin/gcc-<major>"
# environment assignment.
#
# @return [String] "CC=/usr/bin/gcc-<major>", or "" when the detected version is
#   nil, blank, or does not start with a positive integer
def compiler_path
  gcc_version = get_gcc_version_used_by_kernel.to_s.strip

  # "12.3.0" -> 12; nil, "" and non-numeric text all collapse to 0 here.
  gcc_major_version = gcc_version.split('.').first.to_i

  # Never emit "CC=/usr/bin/gcc-0": with no usable version, fall back to the
  # empty override so the installer uses its default compiler.
  return "" unless gcc_major_version.positive?

  "CC=/usr/bin/gcc-#{gcc_major_version}"
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,13 @@
7373
end
7474

7575
# Install driver
76-
# TODO remove --no-cc-version-check when we can update ubuntu 22 images
7776
bash 'nvidia.run advanced' do
7877
user 'root'
7978
group 'root'
8079
cwd '/tmp'
8180
code <<-NVIDIA
8281
set -e
83-
#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau --no-cc-version-check -m=#{nvidia_kernel_module}
82+
#{compiler_path} ./nvidia.run --silent --dkms --disable-nouveau -m=#{nvidia_kernel_module}
8483
rm -f /tmp/nvidia.run
8584
NVIDIA
8685
creates '/usr/bin/nvidia-smi'
@@ -120,6 +119,10 @@ def compiler_path
120119
""
121120
end
122121

122+
# Additional packages associated with the driver installation.
#
# NOTE(review): presumably a default that platform-specific resources override;
# no caller or override is visible in this hunk — confirm.
#
# @return [Array] an empty list
def extra_packages
  []
end
125+
123126
def nvidia_kernel_module
124127
if ['false', 'no', false].include?(node['cluster']['nvidia']['kernel_open'])
125128
"kernel"

cookbooks/aws-parallelcluster-shared/libraries/helpers.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,14 @@ def wait_sync_file(path)
106106
timeout 5
107107
end
108108
end
109+
110+
# Detects the major version of the gcc that compiled the running kernel.
#
# Takes the 8th whitespace-separated field of /proc/version, removes commas and
# keeps the text before the first dot.
# NOTE(review): assumes the compiler version sits in the 8th field of /proc/version
# on every supported image — confirm.
#
# @return [String, nil] the detected version (e.g. "12"), or nil when the command
#   raises or produces no output
def get_gcc_version_used_by_kernel
  gcc_version = shell_out!("awk '{print $8}' /proc/version | tr -d ',' | cut -d '.' -f 1").stdout.strip

  # A shell pipeline reports only the status of its last command, so a failure in
  # awk shows up as empty output rather than as an exception; treat it as undetected.
  if gcc_version.empty?
    Chef::Log.error("Cannot detect gcc version used to compile the kernel: empty output from /proc/version")
    return nil
  end

  Chef::Log.info("Detected version of gcc used to compile the kernel is: #{gcc_version}")
  gcc_version
rescue StandardError => e
  Chef::Log.error("Cannot detect gcc version used to compile the kernel: #{e}")
  nil
end

0 commit comments

Comments
 (0)