|
19 | 19 | return unless nvidia_enabled_or_installed? |
20 | 20 | return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") |
21 | 21 |
|
22 | | - # We are using the existence of this directory to verify if Imex was installed by ParallelCluster |
23 | | - directory node['cluster']['nvidia']['imex']['shared_dir'] |
24 | | - |
25 | 22 | action_install_imex |
| 23 | + |
| 24 | + action_create_configuration_files |
26 | 25 | # Save Imex version in Node Attributes for InSpec Tests |
27 | 26 | node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version |
28 | 27 | node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package |
29 | 28 | node_attributes 'dump node attributes' |
30 | 29 | end |
31 | 30 |
|
32 | | -action :configure do |
33 | | - return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" |
34 | | - # Start nvidia-imex on p6e-gb200 and only on ComputeFleet |
35 | | - if (is_gb200_node? && pcluster_installed_imex?) || enable_force_configuration? |
36 | | - # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, |
37 | | - # if one doesn't already exist in a common, shared location. |
38 | | - template nvidia_imex_nodes_conf_file do |
39 | | - source 'nvidia-imex/nvidia-imex-nodes.erb' |
40 | | - owner 'root' |
41 | | - group 'root' |
42 | | - mode '0755' |
43 | | - action :create_if_missing |
44 | | - end |
| 31 | +action :create_configuration_files do |
| 32 | + # We create or update IMEX configuration files if ParallelCluster is installing IMEX |
| 33 | + template nvidia_imex_nodes_conf_file do |
| 34 | + source 'nvidia-imex/nvidia-imex-nodes.erb' |
| 35 | + owner 'root' |
| 36 | + group 'root' |
| 37 | + mode '0755' |
| 38 | + action :create |
| 39 | + end |
45 | 40 |
|
46 | | - template nvidia_imex_main_conf_file do |
47 | | - source 'nvidia-imex/nvidia-imex-config.erb' |
48 | | - owner 'root' |
49 | | - group 'root' |
50 | | - mode '0755' |
51 | | - action :create_if_missing |
52 | | - variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) |
53 | | - end |
| 41 | + template nvidia_imex_main_conf_file do |
| 42 | + source 'nvidia-imex/nvidia-imex-config.erb' |
| 43 | + owner 'root' |
| 44 | + group 'root' |
| 45 | + mode '0755' |
| 46 | + action :create |
| 47 | + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) |
| 48 | + end |
54 | 49 |
|
55 | | - # We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file. |
56 | | - template "/etc/systemd/system/#{nvidia_imex_service}.service" do |
57 | | - source 'nvidia-imex/nvidia-imex.service.erb' |
58 | | - owner 'root' |
59 | | - group 'root' |
60 | | - mode '0644' |
61 | | - action :create |
62 | | - variables(imex_main_config_file_path: nvidia_imex_main_conf_file) |
63 | | - end |
| 50 | + # We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file. |
| 51 | + template "/etc/systemd/system/#{nvidia_imex_service}.service" do |
| 52 | + source 'nvidia-imex/nvidia-imex.service.erb' |
| 53 | + owner 'root' |
| 54 | + group 'root' |
| 55 | + mode '0644' |
| 56 | + action :create |
| 57 | + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) |
| 58 | + end |
| 59 | +end |
64 | 60 |
|
| 61 | +action :configure do |
| 62 | + return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" |
| 63 | + # Start nvidia-imex on p6e-gb200 and only on ComputeFleet |
| 64 | + if is_gb200_node? || enable_force_configuration? |
65 | 65 | service nvidia_imex_service do |
66 | 66 | action %i(enable start) |
67 | 67 | supports status: true |
68 | | - only_if { ::File.exist?("/etc/systemd/system/#{nvidia_imex_service}.service") } |
69 | 68 | end |
70 | 69 | end |
71 | 70 | end |
@@ -95,18 +94,13 @@ def nvidia_enabled_or_installed? |
95 | 94 | end |
96 | 95 |
|
97 | 96 | def nvidia_imex_main_conf_file |
98 | | - "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" |
| 97 | + "#{node['cluster']['nvidia']['imex']['main_config']}" |
99 | 98 | end |
100 | 99 |
|
101 | 100 | def nvidia_imex_nodes_conf_file |
102 | | - "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" |
| 101 | + "#{node['cluster']['nvidia']['imex']['nodes_config']}" |
103 | 102 | end |
104 | 103 |
|
105 | 104 | def enable_force_configuration? |
106 | 105 | ['true', 'yes', true].include?(node['cluster']['nvidia']['imex']['force_configuration']) |
107 | 106 | end |
108 | | - |
109 | | -def pcluster_installed_imex? |
110 | | - # We configure Imex only if the shared directory exists |
111 | | - Dir.exist?(node['cluster']['nvidia']['imex']['shared_dir']) |
112 | | -end |
0 commit comments