Skip to content

Commit f5dfc46

Browse files
author
Himani Anil Deshpande
committed
[Gb200] Support IMEX configuration local to each compute node
* Remove creation of the /opt/parallelcluster/shared/nvidia-imex directory
* Keep the default paths `/etc/nvidia-imex/nodes_config.cfg` and `/etc/nvidia-imex/config.cfg` for the IMEX configuration
1 parent 2a8e2a1 commit f5dfc46

File tree

2 files changed

+38
-42
lines changed

2 files changed

+38
-42
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@
2424
end
2525

2626
# nvidia-imex
27-
default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex"
27+
default['cluster']['nvidia']['imex']['conf_dir'] = "/etc/nvidia-imex"
28+
default['cluster']['nvidia']['imex']['main_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/config.cfg"
29+
default['cluster']['nvidia']['imex']['nodes_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/nodes_config.cfg"
2830
default['cluster']['nvidia']['imex']['force_configuration'] = false
2931

3032
# NVIDIA NVLSM

cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb

Lines changed: 35 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -19,53 +19,52 @@
1919
return unless nvidia_enabled_or_installed?
2020
return if on_docker? || imex_installed? || aws_region.start_with?("us-iso")
2121

22-
# We are using the existence of this directory to verify if Imex was installed by ParallelCluster
23-
directory node['cluster']['nvidia']['imex']['shared_dir']
24-
2522
action_install_imex
23+
24+
action_create_configuration_files
2625
# Save Imex version in Node Attributes for InSpec Tests
2726
node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version
2827
node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package
2928
node_attributes 'dump node attributes'
3029
end
3130

32-
action :configure do
33-
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
34-
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
35-
if (is_gb200_node? && pcluster_installed_imex?) || enable_force_configuration?
36-
# For each Compute Resource, we generate a unique NVIDIA IMEX configuration file,
37-
# if one doesn't already exist in a common, shared location.
38-
template nvidia_imex_nodes_conf_file do
39-
source 'nvidia-imex/nvidia-imex-nodes.erb'
40-
owner 'root'
41-
group 'root'
42-
mode '0755'
43-
action :create_if_missing
44-
end
31+
action :create_configuration_files do
32+
# We create or update IMEX configuration files if ParallelCluster is installing IMEX
33+
template nvidia_imex_nodes_conf_file do
34+
source 'nvidia-imex/nvidia-imex-nodes.erb'
35+
owner 'root'
36+
group 'root'
37+
mode '0755'
38+
action :create
39+
end
4540

46-
template nvidia_imex_main_conf_file do
47-
source 'nvidia-imex/nvidia-imex-config.erb'
48-
owner 'root'
49-
group 'root'
50-
mode '0755'
51-
action :create_if_missing
52-
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
53-
end
41+
template nvidia_imex_main_conf_file do
42+
source 'nvidia-imex/nvidia-imex-config.erb'
43+
owner 'root'
44+
group 'root'
45+
mode '0755'
46+
action :create
47+
variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file)
48+
end
5449

55-
# We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file.
56-
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
57-
source 'nvidia-imex/nvidia-imex.service.erb'
58-
owner 'root'
59-
group 'root'
60-
mode '0644'
61-
action :create
62-
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
63-
end
50+
# We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file.
51+
template "/etc/systemd/system/#{nvidia_imex_service}.service" do
52+
source 'nvidia-imex/nvidia-imex.service.erb'
53+
owner 'root'
54+
group 'root'
55+
mode '0644'
56+
action :create
57+
variables(imex_main_config_file_path: nvidia_imex_main_conf_file)
58+
end
59+
end
6460

61+
action :configure do
62+
return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet"
63+
# Start nvidia-imex on p6e-gb200 and only on ComputeFleet
64+
if is_gb200_node? || enable_force_configuration?
6565
service nvidia_imex_service do
6666
action %i(enable start)
6767
supports status: true
68-
only_if { ::File.exist?("/etc/systemd/system/#{nvidia_imex_service}.service") }
6968
end
7069
end
7170
end
@@ -95,18 +94,13 @@ def nvidia_enabled_or_installed?
9594
end
9695

9796
def nvidia_imex_main_conf_file
98-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
97+
"#{node['cluster']['nvidia']['imex']['main_config']}"
9998
end
10099

101100
def nvidia_imex_nodes_conf_file
102-
"#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg"
101+
"#{node['cluster']['nvidia']['imex']['nodes_config']}"
103102
end
104103

105104
def enable_force_configuration?
106105
['true', 'yes', true].include?(node['cluster']['nvidia']['imex']['force_configuration'])
107106
end
108-
109-
def pcluster_installed_imex?
110-
# We configure Imex only if the shared directory exists
111-
Dir.exist?(node['cluster']['nvidia']['imex']['shared_dir'])
112-
end

0 commit comments

Comments
 (0)