Skip to content

Commit d8184f7

Browse files
committed
[BuildImage] Load kernel module drm_client_lib before the installation of NVIDIA driver, if the module is available on the kernel.
Starting with kernel `5.14.0-611`, some DRM symbols required by the NVIDIA driver are exported by new client modules.
1 parent c3c60a6 commit d8184f7

File tree

3 files changed

+39
-2
lines changed

3 files changed

+39
-2
lines changed

CHANGELOG.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste
77
------
88

99
**CHANGES**
10-
1. Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
11-
and achieve better performance at scale.
10+
- Add chef attribute `cluster/in_place_update_on_fleet_enabled` to disable in-place updates on compute and login nodes
11+
and achieve better performance at scale.
12+
- Load kernel module `drm_client_lib` before installing the NVIDIA driver, if the module is available on the kernel.
1213

1314

1415
3.14.0

cookbooks/aws-parallelcluster-platform/resources/nvidia_driver/partial/_nvidia_driver_common.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,13 @@
7272
end
7373
end
7474

75+
# Load each kernel module listed by kernel_modules_to_load, but only when `modinfo`
# can find it: the `modinfo` check gates the `modprobe`, so unknown modules are skipped.
# NOTE(review): a failing `modprobe` still makes the command exit non-zero; confirm
# that this is the intended meaning of "best effort".
76+
kernel_modules_to_load.each do |kernel_module|
77+
execute "Load kernel module if exposed by the kernel: #{kernel_module}" do
78+
command "if modinfo #{kernel_module}; then modprobe #{kernel_module}; fi"
79+
end
80+
end
81+
7582
# Install driver
7683
bash 'nvidia.run advanced' do
7784
user 'root'
@@ -126,3 +133,7 @@ def nvidia_kernel_module
126133
"kernel-open"
127134
end
128135
end
136+
137+
# Names of the kernel modules that the setup loads (when available) before the
# NVIDIA driver installation step.
#
# @return [Array<String>] module names; currently only `drm_client_lib`
def kernel_modules_to_load
138+
%w(drm_client_lib)
139+
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,27 @@ def self.setup(chef_run, nvidia_driver_version: nil)
140140
end
141141
end
142142

143+
# Checks the unstubbed kernel_modules_to_load helper on the nvidia_driver resource.
describe 'nvidia_driver:kernel_modules_to_load' do
144+
cached(:chef_run) do
145+
ChefSpec::SoloRunner.new(step_into: ['nvidia_driver'])
146+
end
147+
148+
# Converge via the spec helper, then look up the 'setup' resource so its helper can be called directly.
cached(:resource) do
149+
ConvergeNvidiaDriver.setup(chef_run)
150+
chef_run.find_resource('nvidia_driver', 'setup')
151+
end
152+
153+
it 'returns expected kernel modules' do
154+
expect(resource.kernel_modules_to_load).to eq(%w(drm_client_lib))
155+
end
156+
end
157+
143158
describe 'nvidia_driver:setup' do
144159
for_all_oses do |platform, version|
145160
cached(:nvidia_arch) { 'nvidia_arch' }
146161
cached(:nvidia_kernel_module) { 'nvidia_kernel_module' }
147162
cached(:nvidia_driver_version) { 'nvidia_driver_version' }
163+
cached(:kernel_modules_to_load) { %w(module1 module2) }
148164
cached(:nvidia_driver_url) { "https://us.download.nvidia.com/tesla/#{nvidia_driver_version}/NVIDIA-Linux-#{nvidia_arch}-#{nvidia_driver_version}.run" }
149165

150166
context "on #{platform}#{version} when nvidia_driver not enabled" do
@@ -176,6 +192,7 @@ def self.setup(chef_run, nvidia_driver_version: nil)
176192
allow(res).to receive(:nvidia_arch).and_return(nvidia_arch)
177193
allow(res).to receive(:nvidia_kernel_module).and_return(kernel_module)
178194
allow(res).to receive(:gcc_major_version_used_by_kernel).and_return(kernel_compiler_version)
195+
allow(res).to receive(:kernel_modules_to_load).and_return(kernel_modules_to_load)
179196
end
180197

181198
stub_command("lsinitramfs /boot/initrd.img-$(uname -r) | grep nouveau").and_return(true)
@@ -220,6 +237,14 @@ def self.setup(chef_run, nvidia_driver_version: nil)
220237
)
221238
end
222239

240+
it 'loads kernel modules in they are exposed by the kernel' do
241+
kernel_modules_to_load.each do |kernel_module|
242+
is_expected.to run_execute("Load kernel module if exposed by the kernel: #{kernel_module}").with(
243+
command: "if modinfo #{kernel_module}; then modprobe #{kernel_module}; fi"
244+
)
245+
end
246+
end
247+
223248
if platform == 'amazon'
224249
compiler_version = version == '2023' ? 'gcc' : 'gcc10'
225250
compiler_path = version == '2023' ? 'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc'

0 commit comments

Comments
 (0)