diff --git a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb index 6566160ec..5ef1c4ce1 100644 --- a/cookbooks/aws-parallelcluster-platform/attributes/platform.rb +++ b/cookbooks/aws-parallelcluster-platform/attributes/platform.rb @@ -24,7 +24,9 @@ end # nvidia-imex -default['cluster']['nvidia']['imex']['shared_dir'] = "#{node['cluster']['shared_dir']}/nvidia-imex" +default['cluster']['nvidia']['imex']['conf_dir'] = "/etc/nvidia-imex" +default['cluster']['nvidia']['imex']['main_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/config.cfg" +default['cluster']['nvidia']['imex']['nodes_config'] = "#{node['cluster']['nvidia']['imex']['conf_dir']}/nodes_config.cfg" default['cluster']['nvidia']['imex']['force_configuration'] = false # NVIDIA NVLSM diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb index fc126e43e..ed1c2c751 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_imex/partial/_nvidia_imex_common.rb @@ -19,47 +19,49 @@ return unless nvidia_enabled_or_installed? return if on_docker? || imex_installed? || aws_region.start_with?("us-iso") - directory node['cluster']['nvidia']['imex']['shared_dir'] - action_install_imex + + action_create_configuration_files # Save Imex version in Node Attributes for InSpec Tests node.default['cluster']['nvidia']['imex']['version'] = nvidia_imex_full_version node.default['cluster']['nvidia']['imex']['package'] = nvidia_imex_package node_attributes 'dump node attributes' end +action :create_configuration_files do + # We create or update IMEX configuration files if ParallelCluster is installing IMEX + template nvidia_imex_nodes_conf_file do + source 'nvidia-imex/nvidia-imex-nodes.erb' + owner 'root' + group 'root' + mode '0755' + action :create + end + + template nvidia_imex_main_conf_file do + source 'nvidia-imex/nvidia-imex-config.erb' + owner 'root' + group 'root' + mode '0755' + action :create + variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) + end + + # We keep nvidia-imex.service file in this location to give precedence to pcluster configured service file. + template "/etc/systemd/system/#{nvidia_imex_service}.service" do + source 'nvidia-imex/nvidia-imex.service.erb' + owner 'root' + group 'root' + mode '0644' + action :create + variables(imex_main_config_file_path: nvidia_imex_main_conf_file) + end +end + action :configure do return unless imex_installed? && node['cluster']['node_type'] == "ComputeFleet" # Start nvidia-imex on p6e-gb200 and only on ComputeFleet if is_gb200_node? || enable_force_configuration? - # For each Compute Resource, we generate a unique NVIDIA IMEX configuration file, - # if one doesn't already exist in a common, shared location. - template nvidia_imex_nodes_conf_file do - source 'nvidia-imex/nvidia-imex-nodes.erb' - owner 'root' - group 'root' - mode '0755' - action :create_if_missing - end - - template nvidia_imex_main_conf_file do - source 'nvidia-imex/nvidia-imex-config.erb' - owner 'root' - group 'root' - mode '0755' - action :create_if_missing - variables(imex_nodes_config_file_path: nvidia_imex_nodes_conf_file) - end - - template "/etc/systemd/system/#{nvidia_imex_service}.service" do - source 'nvidia-imex/nvidia-imex.service.erb' - owner 'root' - group 'root' - mode '0644' - action :create - variables(imex_main_config_file_path: nvidia_imex_main_conf_file) - end - service nvidia_imex_service do action %i(enable start) supports status: true @@ -92,11 +94,11 @@ def nvidia_enabled_or_installed? end def nvidia_imex_main_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "#{node['cluster']['nvidia']['imex']['main_config']}" end def nvidia_imex_nodes_conf_file - "#{node['cluster']['nvidia']['imex']['shared_dir']}/nodes_config_#{node['cluster']['scheduler_queue_name']}_#{node['cluster']['scheduler_compute_resource_name']}.cfg" + "#{node['cluster']['nvidia']['imex']['nodes_config']}" end def enable_force_configuration? diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb index 0985bffdb..b745d3d77 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_imex_spec.rb @@ -3,6 +3,7 @@ nvidia_version = "1.2.3" SOURCE_DIR = 'SOURCE_DIR'.freeze nvidia_imex_shared_dir = "SHARED_DIR/nvidia-imex" +imex_service_file = "/etc/systemd/system/nvidia-imex.service" imex_binary = '/usr/bin/nvidia-imex' imex_ctl_binary = '/usr/bin/nvidia-imex-ctl' queue_name = 'queue-name' @@ -296,118 +297,152 @@ def self.configure(chef_run) describe 'nvidia_imex:configure' do [%w(false), [false], %w(no), %w(true), [true], %w(yes)].each do |force_indicator| - for_all_oses do |platform, version| - context "on #{platform}#{version} with force_configuration #{force_indicator}" do - context "when nvidia-imex binary is not installed" do - cached(:chef_run) do - stubs_for_resource('nvidia_imex') do |res| - allow(res).to receive(:imex_installed?).and_return(false) - end - runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) - ConvergeNvidiaImex.configure(runner) - end - cached(:node) { chef_run.node } - - it 'does not configure nvidia-imex' do - is_expected.not_to configure_nvidia_imex('nvidia-imex') - end - end - - %w(HeadNode LoginNode ComputeFleet).each do |node_type| - context "when get_nvswitch_count > 1 on #{node_type} node" do - cached(:chef_run) do - stubs_for_provider('nvidia_imex[configure]') do |pro| - allow(pro).to receive(:imex_installed?).and_return(true) - allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) - allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) - allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator) + [true, false].each do |shared_dir_exists| + [true, false].each do |imex_service_file_exists| + for_all_oses do |platform, version| + context "on #{platform}#{version} with force_configuration #{force_indicator} with shared_dir existence #{shared_dir_exists}" do + context "when nvidia-imex binary is not installed" do + cached(:chef_run) do + stubs_for_resource('nvidia_imex') do |res| + allow(res).to receive(:imex_installed?).and_return(false) + allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists) + allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) end - runner(platform: platform, version: version, step_into: ['nvidia_imex']) - end - cached(:node) { chef_run.node } + cached(:node) { chef_run.node } - before do - chef_run.node.override['cluster']['region'] = 'aws_region' - chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator - chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir - chef_run.node.override['cluster']['node_type'] = node_type - chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name - chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name - - ConvergeNvidiaImex.configure(chef_run) - end - - if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) it 'does not configure nvidia-imex' do - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.not_to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) - end - else - it 'it starts nvidia-imex service' do - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") - .with(source: 'nvidia-imex/nvidia-imex-config.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0755') - .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.to create_template("/etc/systemd/system/nvidia-imex.service") - .with(source: 'nvidia-imex/nvidia-imex.service.erb') - .with(user: 'root') - .with(group: 'root') - .with(mode: '0644') - .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) - is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + is_expected.not_to configure_nvidia_imex('nvidia-imex') end end - end - end - context "when get_nvswitch_count <= 1" do - cached(:chef_run) do - stubs_for_provider('nvidia_imex[configure]') do |pro| - allow(pro).to receive(:imex_installed?).and_return(true) - allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) - allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1) - allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator) + %w(HeadNode LoginNode ComputeFleet).each do |node_type| + context "when get_nvswitch_count > 1 on #{node_type} node" do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed?).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(4) + allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator) + allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists) + allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists) + end + runner(platform: platform, version: version, step_into: ['nvidia_imex']) + end + cached(:node) { chef_run.node } + + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator + chef_run.node.override['cluster']['nvidia']['imex']['shared_dir'] = nvidia_imex_shared_dir + chef_run.node.override['cluster']['node_type'] = node_type + chef_run.node.override['cluster']['scheduler_queue_name'] = queue_name + chef_run.node.override['cluster']['scheduler_compute_resource_name'] = compute_resource_name + + ConvergeNvidiaImex.configure(chef_run) + end + + if (platform == 'amazon' && version == '2') || %w(HeadNode LoginNode).include?(node_type) + it 'does not configure nvidia-imex' do + is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) + is_expected.not_to create_template(imex_service_file) + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + else + it 'it starts nvidia-imex service' do + if shared_dir_exists + is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) + is_expected.to create_template(imex_service_file) + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) + else + is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-nodes.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + is_expected.not_to create_if_missing_template("#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg") + .with(source: 'nvidia-imex/nvidia-imex-config.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0755') + .with(variables: { imex_nodes_config_file_path: "#{nvidia_imex_shared_dir}/nodes_config_#{queue_name}_#{compute_resource_name}.cfg" }) + is_expected.not_to create_template(imex_service_file) + .with(source: 'nvidia-imex/nvidia-imex.service.erb') + .with(user: 'root') + .with(group: 'root') + .with(mode: '0644') + .with(variables: { imex_main_config_file_path: "#{nvidia_imex_shared_dir}/config_#{queue_name}_#{compute_resource_name}.cfg" }) + if imex_service_file_exists + is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + else + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + end + end + end + end end - runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) - ConvergeNvidiaImex.configure(runner) - end - cached(:node) { chef_run.node } - before do - chef_run.node.override['cluster']['region'] = 'aws_region' - chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator - end + context "when get_nvswitch_count <= 1" do + cached(:chef_run) do + stubs_for_provider('nvidia_imex[configure]') do |pro| + allow(pro).to receive(:imex_installed?).and_return(true) + allow(pro).to receive(:get_device_ids).and_return({ 'gb200' => 'test' }) + allow(pro).to receive(:get_nvswitch_count).with('test').and_return(1) + allow(pro).to receive(:enable_force_configuration?).and_return(force_indicator) + allow(Dir).to receive(:exist?).with(nvidia_imex_shared_dir).and_return(shared_dir_exists) + allow(File).to receive(:exist?).with(imex_service_file).and_return(imex_service_file_exists) + end + runner = runner(platform: platform, version: version, step_into: ['nvidia_imex']) + ConvergeNvidiaImex.configure(runner) + end + cached(:node) { chef_run.node } - if ['true', 'yes', true].include?(force_indicator) - it 'does configure nvidia-imex' do - is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) - end - else - it 'does not configure nvidia-imex' do - is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + before do + chef_run.node.override['cluster']['region'] = 'aws_region' + chef_run.node.override['cluster']['nvidia']['imex']['force_configuration'] = force_indicator + end + + if ['true', 'yes', true].include?(force_indicator) && imex_service_file_exists + it 'does configure nvidia-imex' do + is_expected.to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + else + it 'does not configure nvidia-imex' do + is_expected.not_to start_service('nvidia-imex').with_action(%i(enable start)).with_supports({ status: true }) + end + end end end end