Skip to content

Commit b6544a2

Browse files
author
Himani Anil Deshpande
committed
[NVIDIA_IMEX] Install nvidia-imex from s3
1 parent d43d640 commit b6544a2

File tree

8 files changed

+61
-34
lines changed

8 files changed

+61
-34
lines changed

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_alinux2023.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,7 @@
1818

1919
use 'partial/_nvidia_imex_common.rb'
2020
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
# Platform identifier for Amazon Linux: the literal "amzn" followed by
# node['platform_version'] converted with to_i.
# The RHEL partial's nvidia_imex_url interpolates `platform` as a path segment
# of the S3 artifact URL.
def platform
23+
"amzn#{node['platform_version'].to_i}"
24+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_redhat8.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,7 @@
1818

1919
use 'partial/_nvidia_imex_common.rb'
2020
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
# Platform identifier for Red Hat: the literal "rhel" followed by
# node['platform_version'] converted with to_i.
# The RHEL partial's nvidia_imex_url interpolates `platform` as a path segment
# of the S3 artifact URL.
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_rocky8.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,7 @@
1818

1919
use 'partial/_nvidia_imex_common.rb'
2020
use 'partial/_nvidia_imex_rhel.rb'
21+
22+
# Platform identifier for Rocky Linux. Note that it yields the "rhel" prefix
# (not "rocky") followed by node['platform_version'] converted with to_i.
# NOTE(review): presumably Rocky reuses the RHEL artifacts in S3 — confirm.
def platform
23+
"rhel#{node['platform_version'].to_i}"
24+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/nvidia_imex_ubuntu22+.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,7 @@
1818

1919
use 'partial/_nvidia_imex_common.rb'
2020
use 'partial/_nvidia_imex_debian.rb'
21+
22+
# Platform identifier for Ubuntu: the literal "ubuntu" followed by
# node['platform_version'] with every '.' removed.
# NOTE(review): presumably consumed by the Debian partial when building the
# artifact URL — that partial is not visible here; confirm.
def platform
23+
"ubuntu#{node['platform_version'].delete('.')}"
24+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,50 +19,49 @@
1919
return unless nvidia_enabled_or_installed?
2020
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso")
2121

22-
# Add NVIDIA repo for nvidia-imex
23-
nvidia_repo 'add nvidia repository' do
24-
action :add
25-
end
26-
2722
directory node['cluster']['nvidia']['imex']['shared_dir']
2823

29-
template "#{node['cluster']['nvidia']['imex']['shared_dir']}/config.cfg" do
30-
source 'nvidia-imex/nvidia-imex-config.erb'
31-
owner 'root'
32-
group 'root'
33-
mode '0755'
34-
end
35-
36-
template "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config.cfg" do
37-
source 'nvidia-imex/nvidia-imex-nodes.erb'
38-
owner 'root'
39-
group 'root'
40-
mode '0755'
41-
end
42-
43-
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
44-
source 'nvidia-imex/nvidia-imex.service.erb'
45-
owner 'root'
46-
group 'root'
47-
mode '0644'
48-
action :create
49-
end
50-
5124
action_install_imex
5225
# Save the IMEX version and package name in node attributes for InSpec tests
5326
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version
5427
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package
5528
node_attributes 'dump node attributes'
56-
57-
nvidia_repo 'remove nvidia repository' do
58-
action :remove
59-
end
6029
end
6130

6231
action :configure do
6332
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
6433
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
6534
if get_nvswitch_count(get_device_ids['gb200']) > 1
35+
# For each Compute Resource, generate NVIDIA IMEX configuration files named after the node's launch template ID
36+
# in the shared directory; an existing file is kept unless the queues have been updated.
37+
template nvidia_imex_nodes_conf_file do
38+
source 'nvidia-imex/nvidia-imex-nodes.erb'
39+
owner 'root'
40+
group 'root'
41+
mode '0755'
42+
action :create
43+
not_if { file_exists_and_cluster_update?(nvidia_imex_nodes_conf_file) }
44+
end
45+
46+
template nvidia_imex_main_conf_file do
47+
source 'nvidia-imex/nvidia-imex-config.erb'
48+
owner 'root'
49+
group 'root'
50+
mode '0755'
51+
action :create
52+
not_if { file_exists_and_cluster_update?(nvidia_imex_main_conf_file) }
53+
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
54+
end
55+
56+
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
57+
source 'nvidia-imex/nvidia-imex.service.erb'
58+
owner 'root'
59+
group 'root'
60+
mode '0644'
61+
action :create
62+
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
63+
end
64+
6665
service nvidia_imex_service do
6766
action %i(enable start)
6867
supports status: true
@@ -93,3 +92,15 @@ def imex_installed?
9392
# True when either nvidia_enabled? or nvidia_installed? is truthy.
# Used as an early-return guard before IMEX is installed.
def nvidia_enabled_or_installed?
9493
nvidia_enabled? || nvidia_installed?
9594
end
95+
96+
# True when file_path exists on disk AND are_queues_updated? is false.
# Used as the not_if guard on the IMEX config templates, so an existing file
# is left untouched unless the queues were updated.
# NOTE(review): the method name suggests "a cluster update is in progress", but
# the condition is the negation of are_queues_updated? — confirm intended naming.
def file_exists_and_cluster_update?(file_path)
97+
::File.exist?(file_path) && !are_queues_updated?
98+
end
99+
100+
# Path of the main IMEX config file:
# <imex shared_dir>/config_<launch_template_id>.cfg, both parts read from node['cluster'].
def nvidia_imex_main_conf_file
101+
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['launch_template_id']}.cfg"
102+
end
103+
104+
# Path of the IMEX nodes config file:
# <imex shared_dir>/nodes_config_<launch_template_id>.cfg, both parts read from node['cluster'].
def nvidia_imex_nodes_conf_file
105+
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['launch_template_id']}.cfg"
106+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_rhel.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,5 @@ def arch_suffix
4040
end
4141

4242
# URL of the nvidia-imex RPM under the cluster's artifacts S3 URL:
# <artifacts_s3_url>/dependencies/<dir>/<platform>/<package>-<full_version>.<arch_suffix>.rpm
# This hunk shows both sides of the change: the removed line used the
# "nvidia_fabric" directory, the added line uses "nvidia_imex".
def nvidia_imex_url
43-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_fabric/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm"
43+
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_imex/#{platform}/#{nvidia_imex_package}-#{nvidia_imex_full_version}.#{arch_suffix}.rpm"
4444
end

cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex-config.erb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ SERVER_PORT=50000
8686
# Possible Values:
8787
# Full path/filename string (max length of 256).
8888
# Default Value: /etc/nvidia-imex/nodes_config.cfg
89-
IMEX_NODE_CONFIG_FILE=<%= node['cluster']['nvidia']['imex']['shared_dir'] %>/nodes_config.cfg
89+
IMEX_NODE_CONFIG_FILE=<%= @imex_nodes_config_file_path %>
9090

9191
# Description: Name of the network interface used for communication.
9292
# OPTIONAL - If empty, network interface will be determined by matching bind IP to

cookbooks/aws-parallelcluster-platform/templates/nvidia-imex/nvidia-imex.service.erb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ PrivateTmp=false
1212
Type=forking
1313
TimeoutStartSec=infinity
1414

15-
ExecStart=/usr/bin/nvidia-imex -c <%= node['cluster']['nvidia']['imex']['shared_dir'] %>/config.cfg
15+
ExecStart=/usr/bin/nvidia-imex -c <%= @imex_main_config_file_path %>
1616

1717
LimitCORE=infinity
1818

0 commit comments

Comments
 (0)