Skip to content

Commit 433ef1b

Browse files
author
Himani Anil Deshpande
committed
[NVIDIA-IMEX] Configuring nvidia-imex only for gb200 and ComputeFleet node
1 parent a7f0576 commit 433ef1b

File tree

3 files changed

+31
-24
lines changed

3 files changed

+31
-24
lines changed

cookbooks/aws-parallelcluster-platform/recipes/config/nvidia_config.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,7 @@
2424
end
2525

2626
include_recipe "aws-parallelcluster-platform::nvidia_uvm"
27+
28+
nvidia_imex 'Configure nvidia-imex' do
29+
action :configure
30+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@
5858
end
5959

6060
action :configure do
61-
return unless imex_installed
62-
# Start nvidia-imex on p6e-gb200
61+
return unless imex_installed && node['cluster']['node_type'] == "ComputeFleet"
62+
# Start nvidia-imex only on ComputeFleet nodes of p6e-gb200 instances
6363
if get_nvswitch_count(get_device_ids['gb200']) > 1
6464
service nvidia_imex_service do
6565
action %i(start enable)

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb

Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def self.configure(chef_run)
257257
describe 'nvidia_imex:configure' do
258258
for_all_oses do |platform, version|
259259
context "on #{platform}#{version}" do
260-
context 'when nvidia-imex binary is not installed' do
260+
context "when nvidia-imex binary is not installed" do
261261
cached(:chef_run) do
262262
stubs_for_resource('nvidia_imex') do |res|
263263
allow(res).to receive(:imex_installed).and_return(false)
@@ -272,34 +272,37 @@ def self.configure(chef_run)
272272
end
273273
end
274274

275-
context 'when get_nvswitch_count > 1' do
276-
cached(:chef_run) do
277-
stubs_for_provider('nvidia_imex[configure]') do |pro|
278-
allow(pro).to receive(:imex_installed).and_return(true)
279-
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
280-
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
275+
%w(HeadNode LoginNode ComputeFleet).each do |node_type|
276+
context "when get_nvswitch_count > 1 on #{node_type} node" do
277+
cached(:chef_run) do
278+
stubs_for_provider('nvidia_imex[configure]') do |pro|
279+
allow(pro).to receive(:imex_installed).and_return(true)
280+
allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' })
281+
allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4)
282+
end
283+
runner(platform: platform, version: version, step_into: ['nvidia_imex'])
281284
end
282-
runner = runner(platform: platform, version: version, step_into: ['nvidia_imex'])
283-
ConvergeNvidiaImex.configure(runner)
284-
end
285-
cached(:node) { chef_run.node }
285+
cached(:node) { chef_run.node }
286286

287-
before do
288-
chef_run.node.override['cluster']['region'] = 'aws_region'
289-
end
290-
291-
if platform == 'amazon' && version == '2'
292-
it 'does not configure nvidia-imex' do
293-
is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true })
287+
before do
288+
chef_run.node.override['cluster']['region'] = 'aws_region'
289+
chef_run.node.override['cluster']['node_type'] = node_type
290+
ConvergeNvidiaImex.configure(chef_run)
294291
end
295-
else
296-
it 'starts nvidia-imex service' do
297-
is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true })
292+
293+
if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type)
294+
it 'does not configure nvidia-imex' do
295+
is_expected.not_to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true })
296+
end
297+
else
298+
it 'starts nvidia-imex service' do
299+
is_expected.to start_service('nvidia-imex').with_action(%i(start enable)).with_supports({ status: true })
300+
end
298301
end
299302
end
300303
end
301304

302-
context 'when get_nvswitch_count <= 1' do
305+
context "when get_nvswitch_count <= 1" do
303306
cached(:chef_run) do
304307
stubs_for_provider('nvidia_imex[configure]') do |pro|
305308
allow(pro).to receive(:imex_installed).and_return(true)

0 commit comments

Comments
 (0)