From 936404c3c45879ae859c3f968f4af9b661c89462 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 12 Mar 2025 15:59:39 -0400 Subject: [PATCH 01/19] [Docs] Update changelog for release 3.13.0. --- CHANGELOG.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc96e5aa96..1d5933a660 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,8 @@ This file is used to list changes made in each version of the AWS ParallelCluste 3.13.0 ------ **ENHANCEMENTS** -- Add support for Ubuntu24. +- Add support for Ubuntu 24.04. + Notice that ParallelCluster official AMI for Ubuntu 24.04 does not support Lustre. - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** @@ -15,14 +16,14 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2. - Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20). - On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel. -- Upgrade `aws-cfn-bootstrap` to version 2.0-33. -- Upgrade EFA installer to `1.38.0`. - - Efa-driver: `efa-2.13.0-1` - - Efa-config: `efa-config-1.17-1` - - Efa-profile: `efa-profile-1.7-1` - - Libfabric-aws: `libfabric-aws-1.22.0-1` - - Rdma-core: `rdma-core-54.0-1` - - Open MPI: `openmpi40-aws-4.1.7-1` and `openmpi50-aws-5.0.5` +- Upgrade aws-cfn-bootstrap to version 2.0-33. +- Upgrade EFA installer to 1.38.0 (from 1.36.0). + - Efa-driver: efa-2.13.0-1 + - Efa-config: efa-config-1.17-1 + - Efa-profile: efa-profile-1.7-1 + - Libfabric-aws: libfabric-aws-1.22.0-1 + - Rdma-core: rdma-core-54.0-1 + - Open MPI: openmpi40-aws-4.1.7-1 and openmpi50-aws-5.0.5 - Upgrade amazon-efs-utils to version 2.1.0. - Remove third-party cookbook: apt-7.5.22 and pyenv-4.2.3. 
- Upgrade third-party cookbook dependencies: @@ -38,8 +39,11 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Install NVIDIA drivers without the option 'no-cc-version-check', which is now deprecated in the NVIDIA installer. **BUG FIXES** -- Remove usage of cfn-init for compute node bootstrapping to reduce node scale up time. +- Remove usage of cfn-init for compute node bootstrapping to reduce node scale-up time. - Fix the execution of overriding aws-parallelcluster-node package only on the head node during update. +- On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel + to prevent installation failures. +- Fix the execution of overriding aws-parallelcluster-node package only on the head node during update. - Fix an issue where containerized jobs executed through Pyxis/Enroot in a multi-user environment (integrated with Active Directory) would fail. - Fix usage of authselect causing node bootstrap failures on Rocky 9.5+ when directory service is used. From aeef32d1c7e44e3b902a896d0abf1aed90e02033 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 12 Mar 2025 17:24:37 -0400 Subject: [PATCH 02/19] [Docs] Fix changelog for 3.13.0. --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d5933a660..9fd8ad6dc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,7 +40,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste **BUG FIXES** - Remove usage of cfn-init for compute node bootstrapping to reduce node scale-up time. -- Fix the execution of overriding aws-parallelcluster-node package only on the head node during update. - On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel to prevent installation failures. - Fix the execution of overriding aws-parallelcluster-node package only on the head node during update. 
From 445e623c090f1b0856b3bb26aa121331a9c5a1c3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 12 Mar 2025 09:24:28 -0400 Subject: [PATCH 03/19] [Test] Add Amazon Linux 2023 to spec tests. --- cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb index ff6af2da76..78a6e77e09 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb @@ -40,6 +40,7 @@ def for_oses(os_list) def for_all_oses [ %w(amazon 2), + %w(amazon 2023), %w(ubuntu 20.04), %w(ubuntu 22.04), %w(redhat 8), From d8764742f2a5d7473dc7b99f6b8ff257f290d716 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 12 Mar 2025 09:30:14 -0400 Subject: [PATCH 04/19] [Test] Add Ubuntu 24.04 to spec tests. --- cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb index 78a6e77e09..27993e0270 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/spec_helper.rb @@ -43,6 +43,7 @@ def for_all_oses %w(amazon 2023), %w(ubuntu 20.04), %w(ubuntu 22.04), + %w(ubuntu 24.04), %w(redhat 8), %w(rocky 8), %w(redhat 9), From 7b43f4339897939e0cbe71a4b6e4545edd161f8d Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 12 Mar 2025 12:31:16 -0400 Subject: [PATCH 05/19] [Test] Fix spec tests failures for Ubuntu24.04 and Amazon Linux 2023. 
--- .../spec/unit/resources/efa_spec.rb | 4 +- .../unit/resources/network_service_spec.rb | 43 ++++++++++++------- .../unit/resources/raid_mount_unmount_spec.rb | 2 +- .../spec/unit/resources/arm_pl_spec.rb | 9 +++- .../spec/unit/resources/dcv_spec.rb | 30 +++++++------ .../spec/unit/resources/gdrcopy_spec.rb | 8 ++++ .../unit/resources/install_packages_spec.rb | 2 +- .../spec/unit/resources/modules_spec.rb | 2 + .../spec/unit/resources/nvidia_dcgm_spec.rb | 4 +- .../spec/unit/resources/nvidia_driver_spec.rb | 39 +++++++++++------ .../spec/unit/resources/nvidia_repo_spec.rb | 4 +- .../spec/unit/resources/stunnel_spec.rb | 2 +- .../spec/unit/resources/package_repos_spec.rb | 11 +++-- .../spec/unit/resources/dns_domain_spec.rb | 8 ++-- .../spec/unit/resources/mysql_client_spec.rb | 2 + 15 files changed, 112 insertions(+), 58 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index a11e606ca6..bd48b7a21a 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -53,9 +53,9 @@ def self.configure(chef_run) for_all_oses do |platform, version| context "on #{platform}#{version}" do cached(:prerequisites) do - if %(redhat rocky).include?(platform) + if %(redhat rocky).include?(platform) || platform == 'amazon' && version == '2023' %w(environment-modules libibverbs-utils librdmacm-utils rdma-core-devel) - elsif platform == 'amazon' + elsif platform == 'amazon' && version == '2' %w(environment-modules libibverbs-utils librdmacm-utils) else "environment-modules" diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/network_service_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/network_service_spec.rb index d778cfbc85..a7bc29b3ab 100644 --- 
a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/network_service_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/network_service_spec.rb @@ -27,19 +27,28 @@ def self.reload(chef_run) end cached(:node) { chef_run.node } cached(:network_service_name) do - { - 'amazon' => 'network', - 'centos' => 'network', - 'redhat' => 'NetworkManager', - 'rocky' => 'NetworkManager', - 'ubuntu' => 'systemd-resolved', - }[platform] + if platform == 'amazon' && version == '2' || platform == 'centos' + 'network' + elsif platform == 'amazon' && version == '2023' + 'systemd-networkd' + elsif platform == 'ubuntu' + 'systemd-resolved' + elsif %(redhat rocky).include?(platform) + 'NetworkManager' + else + raise "Cannot determine network_service_name: unrecognized platform #{platform}" + end end it "restarts network service" do is_expected.to restart_network_service('restart') + network_services_to_restart = if platform == 'amazon' && version == '2023' + [network_service_name, 'systemd-resolved'] + else + [network_service_name] + end - is_expected.to write_log("Restarting '#{network_service_name}' service, platform #{platform} '#{node['platform_version']}'") + is_expected.to write_log("Restarting '#{network_services_to_restart.join(' ')}' service, platform #{platform} '#{node['platform_version']}'") is_expected.to restart_service(network_service_name) .with(ignore_failure: true) @@ -56,13 +65,17 @@ def self.reload(chef_run) ConvergeNetworkService.reload(runner) end cached(:network_service_name) do - { - 'amazon' => 'network', - 'centos' => 'network', - 'redhat' => 'NetworkManager', - 'rocky' => 'NetworkManager', - 'ubuntu' => 'systemd-resolved', - }[platform] + if platform == 'amazon' && version == '2' || platform == 'centos' + 'network' + elsif platform == 'amazon' && version == '2023' + 'systemd-networkd' + elsif platform == 'ubuntu' + 'systemd-resolved' + elsif %(redhat rocky).include?(platform) + 'NetworkManager' + else + raise "Cannot determine 
network_service_name: unrecognized platform #{platform}" + end end it 'reloads network_service' do diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/raid_mount_unmount_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/raid_mount_unmount_spec.rb index f0600d3893..ad4f16ff78 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/raid_mount_unmount_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/raid_mount_unmount_spec.rb @@ -11,7 +11,7 @@ def wait_for_block_dev(_path) context "on #{platform}#{version}" do cached(:venv_path) { 'venv' } cached(:raid_superblock_version) do - %(redhat rocky).include?(platform) || "#{platform}#{version}" == 'ubuntu20.04' || "#{platform}#{version}" == 'ubuntu22.04' ? '1.2' : '0.90' + %(redhat rocky ubuntu).include?(platform) || "#{platform}#{version}" == 'amazon2023' ? '1.2' : '0.90' end cached(:chef_run) do runner = runner( diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb index a5c956879d..e9f0c1c2f2 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/arm_pl_spec.rb @@ -26,14 +26,19 @@ def self.setup(chef_run) when 'ubuntu' "Ubuntu-#{version}" when 'amazon' - "AmazonLinux-2" + if version == '2' + "AmazonLinux-2" + elsif version == '2023' + 'RHEL-9' + end else "RHEL-#{version}" end end cached(:gcc_major_minor_version) do - if platform == 'ubuntu' && version == '22.04' || version == '9' + case "#{platform}#{version}" + when 'amazon2023', 'ubuntu24.04', 'ubuntu22.04', 'redhat9', 'rocky9' '11.3' else '9.3' diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb index 08d07adcfa..882c03da6d 100644 --- 
a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/dcv_spec.rb @@ -42,7 +42,8 @@ def self.nothing(chef_run) allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true) end - if platform == 'ubuntu' && version.to_i == 20 + case "#{platform}#{version}" + when "amazon2023", "ubuntu20.04" it "is false" do expect(resource.dcv_supported?).to eq(false) end @@ -58,9 +59,10 @@ def self.nothing(chef_run) end context 'when not on arm' do - it "is true" do + is_supported = !("#{platform}#{version}" == 'amazon2023') + it "is #{is_supported}" do allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false) - expect(resource.dcv_supported?).to eq(true) + expect(resource.dcv_supported?).to eq(is_supported) end end end @@ -142,8 +144,8 @@ def self.nothing(chef_run) expect(resource.xdcv).to eq("nice-xdcv_#{xdcv_version}_#{dcv_pkg_arch}.#{base_os}.deb") expect(resource.dcv_web_viewer).to eq("nice-dcv-web-viewer_#{dcv_webviewer_version}_#{dcv_pkg_arch}.#{base_os}.deb") expect(resource.dcv_gl).to eq("/nice-dcv-gl_#{dcv_gl_version}_#{dcv_pkg_arch}.#{base_os}.deb") - else - dcv_platform_version = platform == "amazon" ? "7" : version.to_i + elsif "#{platform}#{version}" != 'amazon2023' + dcv_platform_version = "#{platform}#{version}" == "amazon2" ? "7" : version.to_i dcv_platform_version_pkg = platform == "amazon" ? 
"amzn2" : "el" + version expect(resource.dcv_package).to eq("nice-dcv-#{dcv_version}-#{dcv_platform_version_pkg}-#{dcv_url_arch}") expect(resource.dcv_server).to eq("nice-dcv-server-#{dcv_server_version}.el#{dcv_platform_version}.#{dcv_url_arch}.rpm") @@ -479,14 +481,16 @@ def self.nothing(chef_run) .with_code(/apt -y purge ifupdown/) .with_code(%r{wget https://d1uj6qtbmh3dt5.cloudfront.net/NICE-GPG-KEY}) when 'amazon' - is_expected.to install_package(alinux_prereq_packages).with_retries(10).with_retry_delay(5) - is_expected.to create_file('Setup Gnome standard').with( - content: "PREFERRED=/usr/bin/gnome-session", - owner: "root", - group: "root", - mode: "0755", - path: "/etc/sysconfig/desktop" - ) + if version == '2' + is_expected.to install_package(alinux_prereq_packages).with_retries(10).with_retry_delay(5) + is_expected.to create_file('Setup Gnome standard').with( + content: "PREFERRED=/usr/bin/gnome-session", + owner: "root", + group: "root", + mode: "0755", + path: "/etc/sysconfig/desktop" + ) + end else is_expected.to run_execute('Install gnome desktop').with_command('yum -y install @gnome').with_retries(3).with_retry_delay(5) is_expected.to install_package('xorg-x11-server-Xorg').with_retries(3).with_retry_delay(5) diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb index 3a8a6a8f73..441e62fd95 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/gdrcopy_spec.rb @@ -201,6 +201,12 @@ def self.configure(chef_run) case platform when 'ubuntu' %w(build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms) + when 'amazon' + if version == '2023' + %w(dkms rpm-build make check check-devel) + else + %w(dkms rpm-build make check check-devel subunit subunit-devel) + end else %w(dkms rpm-build make check check-devel subunit 
subunit-devel) end @@ -209,6 +215,7 @@ def self.configure(chef_run) cached(:gdrcopy_platform) do platforms = { 'amazon2' => 'amzn-2', + 'amazon2023' => 'amzn-2023', 'centos7' => 'el7', 'rhel8' => 'el8', 'rocky8' => 'el8', @@ -216,6 +223,7 @@ def self.configure(chef_run) 'rocky9' => 'el9', 'ubuntu20.04' => 'Ubuntu20_04', 'ubuntu22.04' => 'Ubuntu22_04', + 'ubuntu24.04' => 'Ubuntu24_04', } platforms["#{platform}#{version}"] end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb index 4db9fc9d7d..2e49318e20 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/install_packages_spec.rb @@ -59,7 +59,7 @@ def self.setup(chef_run) end end - if platform == 'amazon' + if platform == 'amazon' && version == '2' it 'installs extra packages' do is_expected.to install_alinux_extras_topic('R3.4') end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/modules_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/modules_spec.rb index 0ac9173058..00ae24dc6d 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/modules_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/modules_spec.rb @@ -38,6 +38,8 @@ "/usr/share/modules/init/.modulespath" when 'redhat', 'rocky' '/etc/environment-modules/modulespath' + when 'amazon' + version == '2' ? 
'/usr/share/Modules/init/.modulespath' : '/etc/environment-modules/modulespath' else "/usr/share/Modules/init/.modulespath" end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb index ca32faed4c..3cf2779901 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_dcgm_spec.rb @@ -89,7 +89,7 @@ def self.setup(chef_run, nvidia_enabled: nil) chef_run.find_resource('nvidia_dcgm', 'setup') end - if %w(centos amazon).include?(platform) + if %w(centos7 amazon2).include?("#{platform}#{version}") it "is not enabled" do expect(resource._nvidia_dcgm_enabled).to eq(false) end @@ -164,7 +164,7 @@ def self.setup(chef_run, nvidia_enabled: nil) ConvergeNvidiaDcgm.setup(chef_run) end - if %w(centos amazon).include?(platform) + if %w(centos7 amazon2).include?("#{platform}#{version}") it 'does not install datacenter gpu manager' do is_expected.not_to run_bash('Install datacenter-gpu-manager') end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb index a1d81426d7..4d0b8b57ca 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_driver_spec.rb @@ -221,21 +221,34 @@ def self.setup(chef_run, nvidia_driver_version: nil) end if platform == 'amazon' - compiler_path = version == 2023 ? 'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc' - it 'installs gcc10' do - is_expected.to install_package('gcc10').with_retries(10).with_retry_delay(5) + compiler_version = version == '2023' ? 'gcc' : 'gcc10' + compiler_path = version == '2023' ? 
'CC=/usr/bin/gcc' : 'CC=/usr/bin/gcc10-gcc' + if version == '2' + it "installs #{compiler_version}" do + is_expected.to install_package(compiler_version).with_retries(10).with_retry_delay(5) + end + it 'creates dkms/nvidia.conf' do + is_expected.to create_template('/etc/dkms/nvidia.conf').with( + source: 'nvidia/amazon/dkms/nvidia.conf.erb', + cookbook: 'aws-parallelcluster-platform', + owner: 'root', + group: 'root', + mode: '0644', + variables: { compiler_path: compiler_path } + ) + end + else + # Amazon Linux 2023 is expected to install the compiler and create nvidia conf when kernel version is 6. + # Here we are testing with kernel version 5 + it "does not install #{compiler_version}" do + is_expected.not_to install_package(compiler_version).with_retries(10).with_retry_delay(5) + end + + it 'does not create dkms/nvidia.conf' do + is_expected.not_to create_template('/etc/dkms/nvidia.conf') + end end - it 'creates dkms/nvidia.conf' do - is_expected.to create_template('/etc/dkms/nvidia.conf').with( - source: 'nvidia/amazon/dkms/nvidia.conf.erb', - cookbook: 'aws-parallelcluster-platform', - owner: 'root', - group: 'root', - mode: '0644', - variables: { compiler_path: compiler_path } - ) - end it 'installs nvidia driver' do is_expected.to run_bash('nvidia.run advanced') .with( diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_repo_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_repo_spec.rb index f96eb304c5..bb3a1b145f 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_repo_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_repo_spec.rb @@ -84,7 +84,9 @@ def self.remove(chef_run) cached(:arch_suffix) { 'arch_suffix' } cached(:nvidia_platform) do case platform - when 'amazon', 'centos' + when 'amazon' + version == '2023' ? 
'rhel9' : 'rhel7' + when 'centos' 'rhel7' when 'redhat', 'rocky' "rhel#{version.to_i}" diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/stunnel_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/stunnel_spec.rb index 02b345d463..68d19ec189 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/stunnel_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/stunnel_spec.rb @@ -45,7 +45,7 @@ def self.setup(chef_run, stunnel_version:, stunnel_checksum:) is_expected.to setup_stunnel('setup') end - if platform == 'amazon' + if platform == 'amazon' && version == '2' it "doesn't install stunnel" do is_expected.not_to run_bash('install stunnel') end diff --git a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb index 9730f6ca5c..1758192042 100644 --- a/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb +++ b/cookbooks/aws-parallelcluster-shared/spec/unit/resources/package_repos_spec.rb @@ -28,10 +28,15 @@ def self.setup(chef_run) expect(chef_run).to include_recipe('yum') end - it 'installs epel' do - is_expected.to install_alinux_extras_topic('epel') + if version == '2' + it 'installs epel' do + is_expected.to install_alinux_extras_topic('epel') + end + else + it 'does not install epel' do + is_expected.not_to install_alinux_extras_topic('epel') + end end - when 'centos' it 'installs yum and epel' do expect(chef_run).to include_recipe('yum') diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/dns_domain_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/dns_domain_spec.rb index e5b6e84b39..6e5b72ffdd 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/dns_domain_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/dns_domain_spec.rb @@ -41,9 +41,9 @@ def self.configure(chef_run) for_all_oses do 
|platform, version| context "on #{platform}#{version}" do cached(:dns_domain) { 'dns_domain' } - cached(:search_domain_config_path) { platform == 'ubuntu' ? '/etc/systemd/resolved.conf' : '/etc/dhcp/dhclient.conf' } - cached(:append_pattern) { platform == 'ubuntu' ? 'Domains=*' : 'append domain-name*' } - cached(:append_line) { platform == 'ubuntu' ? "Domains=#{dns_domain}" : "append domain-name \" #{dns_domain}\";" } + cached(:search_domain_config_path) { platform == 'ubuntu' || platform == 'amazon' && version == '2023' ? '/etc/systemd/resolved.conf' : '/etc/dhcp/dhclient.conf' } + cached(:append_pattern) { platform == 'ubuntu' || platform == 'amazon' && version == '2023' ? 'Domains=*' : 'append domain-name*' } + cached(:append_line) { platform == 'ubuntu' || platform == 'amazon' && version == '2023' ? "Domains=#{dns_domain}" : "append domain-name \" #{dns_domain}\";" } cached(:chef_run) do runner = runner(platform: platform, version: version, step_into: ['dns_domain']) do |node| @@ -56,7 +56,7 @@ def self.configure(chef_run) is_expected.to configure_dns_domain('configure') end - it 'updates search domaint' do + it 'updates search domain' do is_expected.to edit_replace_or_add("append Route53 search domain in #{search_domain_config_path}").with( path: search_domain_config_path, pattern: append_pattern, diff --git a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/mysql_client_spec.rb b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/mysql_client_spec.rb index 048ddc2972..5550e3896e 100644 --- a/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/mysql_client_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/spec/unit/resources/mysql_client_spec.rb @@ -30,6 +30,8 @@ def self.validate(chef_run) cached(:package_platform) do platform_version = if version.to_i == 2 7 + elsif platform == 'amazon' && version == '2023' + 9 else version.to_i end From 9dc7ae9d512eb58da2f1eaf166a49405dea26083 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 13 Mar 
2025 17:54:11 -0400 Subject: [PATCH 06/19] [EFA] Upgrade EFA to version 1.38.1 (from 1.38.0) to address issue on Rocky9. --- CHANGELOG.md | 2 +- .../aws-parallelcluster-environment/attributes/environment.rb | 4 ++-- .../spec/unit/resources/efa_spec.rb | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fd8ad6dc4..39c192d440 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20). - On Ubuntu 22.04, install the Nvidia driver with the same compiler version used to compile the kernel. - Upgrade aws-cfn-bootstrap to version 2.0-33. -- Upgrade EFA installer to 1.38.0 (from 1.36.0). +- Upgrade EFA installer to 1.38.1 (from 1.36.0). - Efa-driver: efa-2.13.0-1 - Efa-config: efa-config-1.17-1 - Efa-profile: efa-profile-1.7-1 diff --git a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb index c7558dcecb..dba56d192f 100644 --- a/cookbooks/aws-parallelcluster-environment/attributes/environment.rb +++ b/cookbooks/aws-parallelcluster-environment/attributes/environment.rb @@ -70,8 +70,8 @@ default['cluster']['head_node_private_ip'] = nil -default['cluster']['efa']['version'] = '1.38.0' -default['cluster']['efa']['sha256'] = '4f436954f35ad53754b4d005fd8d0be63de3b4184de41a695b504bdce0fecb22' +default['cluster']['efa']['version'] = '1.38.1' +default['cluster']['efa']['sha256'] = '83923374afd388b1cfcf4b3a21a2b1ba7cf46a01a587f7b519b8386cb95e4f81' # TODO: Move to platform cookbook default['cluster']['spack_shared_dir'] = "#{node['cluster']['shared_dir']}/spack" diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb index bd48b7a21a..868594f8c6 100644 --- 
a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/efa_spec.rb @@ -2,8 +2,8 @@ # parallelcluster default source dir defined in attributes source_dir = '/opt/parallelcluster/sources' -efa_version = '1.38.0' -efa_checksum = '4f436954f35ad53754b4d005fd8d0be63de3b4184de41a695b504bdce0fecb22' +efa_version = '1.38.1' +efa_checksum = '83923374afd388b1cfcf4b3a21a2b1ba7cf46a01a587f7b519b8386cb95e4f81' class ConvergeEfa def self.setup(chef_run, efa_version: nil, efa_checksum: nil) From 98c689edfabba628b2078dd8c9ac71a8f46bf240 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 14 Mar 2025 06:32:05 -0700 Subject: [PATCH 07/19] Update the default root volume size to 45 GB Signed-off-by: Hanwen --- kitchen.ec2.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kitchen.ec2.yml b/kitchen.ec2.yml index 8f68924006..342d65ebfe 100644 --- a/kitchen.ec2.yml +++ b/kitchen.ec2.yml @@ -95,7 +95,7 @@ platforms: block_device_mappings: - device_name: /dev/xvda ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -121,7 +121,7 @@ platforms: block_device_mappings: - device_name: /dev/xvda ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp3 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -147,7 +147,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 
ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -173,7 +173,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -199,7 +199,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -225,7 +225,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -251,7 +251,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if 
(ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -277,7 +277,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp2 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> @@ -303,7 +303,7 @@ platforms: block_device_mappings: - device_name: /dev/sda1 ebs: - volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 40 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> + volume_size: <% if (ENV['KITCHEN_VOLUME_SIZE'] || '') == '' %> 45 <% else %> <%= ENV['KITCHEN_VOLUME_SIZE'] %> <% end %> volume_type: gp3 delete_on_termination: true <% %w(a b c d e f g h i j k l m n o p q r s t u v w x).each_with_index do | c, i | %> From 0f4fede60395954019a2561e3e2df1665af2a6cf Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 14 Mar 2025 08:05:27 -0700 Subject: [PATCH 08/19] Upgrade Slurm to version 24.05.7 (from 24.05.6) Signed-off-by: Hanwen --- CHANGELOG.md | 2 +- cookbooks/aws-parallelcluster-slurm/attributes/versions.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 39c192d440..e8299b8bde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** -- Upgrade Slurm to version 24.05.6. +- Upgrade Slurm to version 24.05.7. 
- Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2. - Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2. - Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20). diff --git a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb index 7b62fe188a..c4c457b90c 100644 --- a/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb +++ b/cookbooks/aws-parallelcluster-slurm/attributes/versions.rb @@ -1,8 +1,8 @@ # Slurm -default['cluster']['slurm']['version'] = '24-05-6-1' +default['cluster']['slurm']['version'] = '24-05-7-1' default['cluster']['slurm']['commit'] = '' default['cluster']['slurm']['branch'] = '' -default['cluster']['slurm']['sha256'] = '0ba810649ebc1c3b1c1d7102dbd5365e53fd7ce7c25ab2108bd0196b6988ddb2' +default['cluster']['slurm']['sha256'] = '297e85853314a0a4d227ca66bb44179c099f0de5d86e83ffe21cb464b9ad3709' default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm" # Munge default['cluster']['munge']['munge_version'] = '0.5.16' From d2ff769c5e20efe07e0e3d478c26dabda9b0c707 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 14 Mar 2025 11:14:18 -0700 Subject: [PATCH 09/19] Use separate commands when copying files and preserving permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `cp -p` fails with the following error on Ubuntu 24: ``` STDERR: cp: preserving permissions for ‘/local/home/ubuntu/.ssh/authorized_keys’: Operation not supported ``` Signed-off-by: Hanwen --- .../recipes/config/cluster_user.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb b/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb index 58e21c855d..7cd35e402d 100644 --- a/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb +++ 
b/cookbooks/aws-parallelcluster-platform/recipes/config/cluster_user.rb @@ -73,7 +73,9 @@ bash "copy_auth_file" do code <<-PERMS set -e - cp -p #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + cp #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chmod --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chown --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys PERMS only_if { node['cluster']['default_user_home'] == 'local' } end @@ -90,7 +92,9 @@ bash "copy_auth_file" do code <<-PERMS set -e - cp -p #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + cp #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chmod --reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys + chown --reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys PERMS only_if { node['cluster']['default_user_home'] == 'local' } end From 52fae13e82e333a7c13d6a36611e4576fa8464ac Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 13 Mar 2025 12:55:02 -0700 Subject: [PATCH 10/19] Shortening RHEL/Rocky boot time by deprioritizing ipv6 and disabling internet check This commit saves time because the OS won't retry on unsupported ipv6 and optional Internet connection Signed-off-by: Hanwen --- CHANGELOG.md | 4 ++++ .../redhat/dns_domain/99-disable-ipv6-metadata.cfg | 3 +++ .../files/redhat/dns_domain/NetworkManager.conf | 7 +++++++ .../rocky/dns_domain/99-disable-ipv6-metadata.cfg | 3 +++ .../files/rocky/dns_domain/NetworkManager.conf | 7 +++++++
.../resources/dns_domain/dns_domain_redhat8.rb | 10 ++++++++++ .../resources/dns_domain/dns_domain_rocky8.rb | 10 ++++++++++ 7 files changed, 44 insertions(+) create mode 100644 cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg create mode 100644 cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg diff --git a/CHANGELOG.md b/CHANGELOG.md index e8299b8bde..9ce229e13d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,10 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Remove generation of DSA keys for login nodes as DSA, which became unsupported in OpenSSH 9.7+. - Set instance ID and instance type information in Slurm upon compute nodes launch. - Install NVIDIA drivers without the option 'no-cc-version-check', which is now deprecated in the NVIDIA installer. +- Reduce RHEL/Rocky Linux boot time by the following network customization: + - Configuring higher priority to IPv4 than IPv6 + - Disabling Internet connectivity check + - Configuring only IPv4 IMDS endpoint to cloud-init **BUG FIXES** - Remove usage of cfn-init for compute node bootstrapping to reduce node scale-up time. 
diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg new file mode 100644 index 0000000000..71dd7f17ad --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/99-disable-ipv6-metadata.cfg @@ -0,0 +1,3 @@ +datasource: + Ec2: + metadata_urls: [ 'http://169.254.169.254' ] \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf index 64540b3c48..3caa0bccac 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf @@ -23,6 +23,13 @@ plugins = ifcfg-rh, dhcp = dhclient +[connection] +ipv4.route-metric=100 +ipv6.route-metric=200 + +[connectivity] +enabled=false + [logging] # When debugging NetworkManager, enabling debug logging is of great help. 
# diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg new file mode 100644 index 0000000000..71dd7f17ad --- /dev/null +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/99-disable-ipv6-metadata.cfg @@ -0,0 +1,3 @@ +datasource: + Ec2: + metadata_urls: [ 'http://169.254.169.254' ] \ No newline at end of file diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf index 64540b3c48..3caa0bccac 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf @@ -23,6 +23,13 @@ plugins = ifcfg-rh, dhcp = dhclient +[connection] +ipv4.route-metric=100 +ipv6.route-metric=200 + +[connectivity] +enabled=false + [logging] # When debugging NetworkManager, enabling debug logging is of great help. 
# diff --git a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb index 3cb10a8d8f..4b63e5eca8 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_redhat8.rb @@ -33,6 +33,16 @@ mode '0644' end + # Disable ipv6 IMDS in cloud init to speed up + cookbook_file '99-disable-ipv6-metadata.cfg' do + path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg' + source 'dns_domain/99-disable-ipv6-metadata.cfg' + cookbook 'aws-parallelcluster-slurm' + user 'root' + group 'root' + mode '0644' + end + action_update_search_domain network_service 'Restart network service' end diff --git a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb index 4ad36e1856..448619f0ac 100644 --- a/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb +++ b/cookbooks/aws-parallelcluster-slurm/resources/dns_domain/dns_domain_rocky8.rb @@ -33,6 +33,16 @@ mode '0644' end + # Disable ipv6 IMDS in cloud init to speed up + cookbook_file '99-disable-ipv6-metadata.cfg' do + path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg' + source 'dns_domain/99-disable-ipv6-metadata.cfg' + cookbook 'aws-parallelcluster-slurm' + user 'root' + group 'root' + mode '0644' + end + action_update_search_domain network_service 'Restart network service' end From 57352feee140c9642116db2405033db41afd0889 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Tue, 18 Mar 2025 16:01:36 -0400 Subject: [PATCH 11/19] [Ubuntu24] Install Lustre client for Ubuntu24 (#2912) Co-authored-by: Himani Anil Deshpande --- CHANGELOG.md | 1 - .../resources/lustre/partial/_install_lustre_debian.rb | 1 - .../test/controls/lustre_spec.rb | 10 
+++++----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ce229e13d..5359214299 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,6 @@ This file is used to list changes made in each version of the AWS ParallelCluste ------ **ENHANCEMENTS** - Add support for Ubuntu 24.04. - Notice that ParallelCluster official AMI for Ubuntu 24.04 does not support Lustre. - Disable unused services like cups and wpa_supplicant from Official ParallelCluster AMIs to improve security. **CHANGES** diff --git a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb index 13a797c7f8..78a7b13149 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/lustre/partial/_install_lustre_debian.rb @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and limitations under the License. action :setup do - return if node['platform_version'].to_i == 24 apt_repository 'fsxlustreclientrepo' do uri "https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu" components ['main'] diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb index ef15c538d5..0d1f5b707d 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/lustre_spec.rb @@ -27,7 +27,7 @@ end end - if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker? && !os_properties.ubuntu2404? + if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker? 
# TODO: restore installation and check on docker when Lustre is available for RH8.9 # See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/install-lustre-client.html unless inspec.os.release.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7")) @@ -55,7 +55,7 @@ end end - if os_properties.debian_family? && !os_properties.ubuntu2404? + if os_properties.debian_family? describe apt('https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu') do it { should exist } it { should be_enabled } @@ -89,7 +89,7 @@ control 'tag:install_lustre_lnet_kernel_module_enabled' do title "Verify that lnet kernel module is enabled" - only_if { !os_properties.on_docker? && !os_properties.alinux? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? && !os_properties.alinux? } describe kernel_module("lnet") do it { should be_loaded } it { should_not be_disabled } @@ -98,7 +98,7 @@ end control 'lustre_mounted' do - only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? } describe mount('/shared_dir') do it { should be_mounted } its('type') { should eq 'lustre' } @@ -106,7 +106,7 @@ end control 'lustre_unmounted' do - only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? } + only_if { !os_properties.on_docker? 
} describe mount('/shared_dir') do it { should_not be_mounted } From 101b89965656d4caef15ea535f8a51ef82ef7c1a Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Wed, 19 Mar 2025 16:35:21 -0400 Subject: [PATCH 12/19] Adding Ubuntu24 for kitchen tests for storage (#2915) * Adding Ubuntu24 for kitchen tests for storage * Adding Rocky9, rhel9 and al2023 --------- Co-authored-by: Himani Anil Deshpande --- .../test/controls/cloudwatch_spec.rb | 7 +++---- .../test/controls/sticky_bits_spec.rb | 2 +- .../test/controls/mysql_client_spec.rb | 5 ++--- test/environments/kitchen.rb | 20 +++++++++++++++---- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb index 0d1f4783f1..d88b2e2c2f 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb @@ -20,14 +20,13 @@ describe 'Check the presence of the cloudwatch package gpg key' # In Ubuntu >20.04 due to environment variable the keyring is placed under home of the user ubuntu with the permission of root - ubuntu2004 = os_properties.ubuntu2004? - ubuntu2204 = os_properties.ubuntu2204? - keyring = (ubuntu2004 || ubuntu2204) && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' + + keyring = os_properties.ubuntu? && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' sudo = os_properties.redhat_on_docker? ? '' : 'sudo' describe bash("#{sudo} gpg --list-keys #{keyring}") do # Don't check exit status for Ubuntu20 because it returns 2 when executed in the validate phase of a created AMI # os_properties cannot be used in the describe block level. 
It can be used within an it{} block - its('exit_status') { should eq 0 } unless ubuntu2004 || ubuntu2204 + its('exit_status') { should eq 0 } unless os_properties.ubuntu? its('stdout') { should match /3B789C72/ } its('stdout') { should match /Amazon CloudWatch Agent/ } end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb index c72f192973..1068082914 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/sticky_bits_spec.rb @@ -1,7 +1,7 @@ control 'tag:config_sticky_bits_configured' do title 'Check sticky bits configuration' - if (os_properties.ubuntu2004? || os_properties.ubuntu2204?) && !os_properties.on_docker? + if os_properties.ubuntu? && !os_properties.on_docker? # This test passes on Mac but doesn't work as GitHub action. describe kernel_parameter('fs.protected_regular') do its('value') { should eq 0 } diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb index bbe1ddf088..bd50750ec8 100644 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/test/controls/mysql_client_spec.rb @@ -13,13 +13,14 @@ title "MySql client is installed" mysql_packages = [] + ubuntu = os_properties.ubuntu? if os.redhat? mysql_packages.concat %w(mysql-community-client-plugins mysql-community-common mysql-community-devel mysql-community-libs) if os_properties.alinux2? || os_properties.centos7? mysql_packages.concat %w(mysql-community-libs-compat) end - elsif os_properties.ubuntu2004? || os_properties.ubuntu2204? + elsif ubuntu mysql_packages.concat %w(libmysqlclient-dev libmysqlclient21) else describe "unsupported OS" do @@ -27,8 +28,6 @@ end end - ubuntu = os_properties.ubuntu? 
- mysql_packages.each do |pkg| describe package(pkg) do it { should be_installed } diff --git a/test/environments/kitchen.rb b/test/environments/kitchen.rb index ec880d60b6..d8702b5cdf 100644 --- a/test/environments/kitchen.rb +++ b/test/environments/kitchen.rb @@ -5,29 +5,41 @@ name 'kitchen' default_attributes 'kitchen_hooks' => { 'ebs_mount-vol_array/alinux2' => '', + 'ebs_mount-vol_array/alinux2023' => '', 'ebs_mount-vol_array/rhel8' => '', - 'ebs_mount-vol_array/centos7' => '', + 'ebs_mount-vol_array/rhel9' => '', 'ebs_mount-vol_array/ubuntu2004' => '', 'ebs_mount-vol_array/ubuntu2204' => '', + 'ebs_mount-vol_array/ubuntu2404' => '', 'ebs_mount-vol_array/rocky8' => '', + 'ebs_mount-vol_array/rocky9' => '', 'ebs_unmount-vol_array/alinux2' => '', + 'ebs_unmount-vol_array/alinux2023' => '', 'ebs_unmount-vol_array/rhel8' => '', - 'ebs_unmount-vol_array/centos7' => '', + 'ebs_unmount-vol_array/rhel9' => '', 'ebs_unmount-vol_array/ubuntu2004' => '', 'ebs_unmount-vol_array/ubuntu2204' => '', + 'ebs_unmount-vol_array/ubuntu2404' => '', 'ebs_unmount-vol_array/rocky8' => '', + 'ebs_unmount-vol_array/rocky9' => '', 'raid_mount-raid_vol_array/alinux2' => '', + 'raid_mount-raid_vol_array/alinux2023' => '', 'raid_mount-raid_vol_array/rhel8' => '', - 'raid_mount-raid_vol_array/centos7' => '', + 'raid_mount-raid_vol_array/rhel9' => '', 'raid_mount-raid_vol_array/ubuntu2004' => '', 'raid_mount-raid_vol_array/ubuntu2204' => '', + 'raid_mount-raid_vol_array/ubuntu2404' => '', 'raid_mount-raid_vol_array/rocky8' => '', + 'raid_mount-raid_vol_array/rocky9' => '', 'raid_unmount-raid_vol_array/alinux2' => '', + 'raid_unmount-raid_vol_array/alinux2023' => '', 'raid_unmount-raid_vol_array/rhel8' => '', - 'raid_unmount-raid_vol_array/centos7' => '', + 'raid_unmount-raid_vol_array/rhel9' => '', 'raid_unmount-raid_vol_array/ubuntu2004' => '', 'raid_unmount-raid_vol_array/ubuntu2204' => '', + 'raid_unmount-raid_vol_array/ubuntu2404' => '', 'raid_unmount-raid_vol_array/rocky8' => '', + 
'raid_unmount-raid_vol_array/rocky9' => '', 'lustre_mount-fsx_fs_id_array' => ["fs-0ab11b3ade43091fe"], 'lustre_mount-fsx_dns_name_array' => ["fs-0ab11b3ade43091fe.fsx.us-west-2.amazonaws.com"], 'lustre_mount-fsx_mount_name_array' => ["qz5b7bev"], From ad9059dade3779d4c2fdabd19076a098a628e9c6 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:24:30 -0400 Subject: [PATCH 13/19] [Kitchen test] Add Ubuntu Os for C_state Install phase tests (#2916) Co-authored-by: Himani Anil Deshpande --- .../test/controls/c_states_spec.rb | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb index 86b5686008..e560019afd 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/c_states_spec.rb @@ -3,16 +3,13 @@ title 'Check the configuration to disable c states' only_if { !os_properties.on_docker? && os_properties.x86? } - if os_properties.ubuntu2004? - describe file('/etc/default/grub') do - it { should exist } - its('content') { should match(/processor.max_cstate=1/) } - its('content') { should match(/intel_idle.max_cstate=1/) } - end - describe file('/boot/grub/grub.cfg') do - it { should exist } - its('content') { should match(/processor.max_cstate=1/) } - its('content') { should match(/intel_idle.max_cstate=1/) } + if os_properties.ubuntu? 
+ %w(/etc/default/grub /boot/grub/grub.cfg).each do |file_path| + describe file(file_path) do + it { should exist } + its('content') { should match(/processor.max_cstate=1/) } + its('content') { should match(/intel_idle.max_cstate=1/) } + end end else describe bash('cpupower idle-info') do From c1007d895a22669c0a47b30848ab205bc505c07b Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Thu, 20 Mar 2025 10:22:59 -0400 Subject: [PATCH 14/19] [Kitchen Test] Use os_properties outside it block (#2918) Co-authored-by: Himani Anil Deshpande --- .../test/controls/cloudwatch_spec.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb index d88b2e2c2f..ee35703090 100644 --- a/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/test/controls/cloudwatch_spec.rb @@ -20,13 +20,13 @@ describe 'Check the presence of the cloudwatch package gpg key' # In Ubuntu >20.04 due to environment variable the keyring is placed under home of the user ubuntu with the permission of root - - keyring = os_properties.ubuntu? && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' + is_ubuntu = os_properties.ubuntu? + keyring = is_ubuntu && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : '' sudo = os_properties.redhat_on_docker? ? '' : 'sudo' describe bash("#{sudo} gpg --list-keys #{keyring}") do # Don't check exit status for Ubuntu20 because it returns 2 when executed in the validate phase of a created AMI # os_properties cannot be used in the describe block level. It can be used within an it{} block - its('exit_status') { should eq 0 } unless os_properties.ubuntu? 
+ its('exit_status') { should eq 0 } unless is_ubuntu its('stdout') { should match /3B789C72/ } its('stdout') { should match /Amazon CloudWatch Agent/ } end From d6e6d243d168ee179ddc029876e3c67b14746afb Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 20 Mar 2025 12:50:40 -0700 Subject: [PATCH 15/19] Clean resolv.conf if it's not managed by system at the end of AMI build The conditional statement avoids cleaning if the `/etc/resolv.conf` is a symbolic link. It is a symbolic link when it is managed by other systems. Cleaning the `/etc/resolv.conf` speeds up instance launch because the instance will not try to use the name server from the AMI creation environment. The delay was shown in `/var/log/cloud-init.log`: ``` 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://169.254.169.254 took 40.099 seconds 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://[fd00:ec2::254] took 0.000 seconds 2025-03-19 16:00:17,731 - util.py[DEBUG]: Resolving URL: http://instance-data.:8773 took 10.010 seconds ``` Example content of `/etc/resolv.conf`: ``` cat /etc/resolv.conf # Generated by NetworkManager search ec2.internal nameserver 192.168.0.2 ``` Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index 2943b51f38..00f98efc16 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -20,5 +20,10 @@ if [ "${ID}${VERSION_ID}" == "centos7" ]; then rm -f /etc/sysconfig/network-scripts/ifcfg-eth0 fi +# Clean resolv.conf if it's not managed by system +if [ !
-L "/etc/resolv.conf" ]; then + echo -n > /etc/resolv.conf +fi + find /var/log -type f -exec /bin/rm -v {} \; touch /var/log/lastlog From 17a93bace198839e13e6eceb9054df0b9adac7f2 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Thu, 20 Mar 2025 13:25:01 -0700 Subject: [PATCH 16/19] Remove nameserver from resolv.conf at the end of AMI build The conditional statement avoids cleaning if the `/etc/resolv.conf` is a symbolic link. It is a symbolic link when it is managed by other systems. Cleaning the `/etc/resolv.conf` speeds up instance launch because it wouldn't try to use name server from the AMI creation environment. The delay was shown in `/var/log/cloud-init.log`: ``` 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://169.254.169.254 took 40.099 seconds 2025-03-19 16:00:07,721 - util.py[DEBUG]: Resolving URL: http://[fd00:ec2::254] took 0.000 seconds 2025-03-19 16:00:17,731 - util.py[DEBUG]: Resolving URL: http://instance-data.:8773 took 10.010 seconds ``` Example content of `/etc/resolv.conf`: ``` cat /etc/resolv.conf # Generated by NetworkManager search ec2.internal nameserver 192.168.0.2 ``` Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index 00f98efc16..bb14fa8e53 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -22,7 +22,7 @@ fi # Clean resolv.conf if it's not managed by system if [ ! -L "/etc/resolv.conf" ]; then - echo -n > /etc/resolv.conf + sed -i '/^nameserver/d' /etc/resolv.conf fi find /var/log -type f -exec /bin/rm -v {} \; From 1c425ba0a86e4e63adb289ee14b48b0384481e48 Mon Sep 17 00:00:00 2001 From: Hanwen Date: Fri, 21 Mar 2025 12:58:02 -0700 Subject: [PATCH 17/19] Clean Resolv.conf in official AMI build 1. 
Only cleanup resolv conf during official AMI build. In the future, we will evaluate to apply this improvement to all AMI builds. 2. Also clean up `/run/systemd/resolve/resolv.conf `. This file exists on Ubuntu Signed-off-by: Hanwen --- .../aws-parallelcluster-platform/files/ami_cleanup.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh index bb14fa8e53..7a50cc215a 100644 --- a/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh +++ b/cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh @@ -1,5 +1,7 @@ #!/bin/bash +IS_OFFICIAL_AMI_BUILD=${1:-"false"} + # clean up cloud init artifacts https://cloudinit.readthedocs.io/en/latest/topics/cli.html#clean cloud-init clean -s @@ -21,8 +23,10 @@ if [ "${ID}${VERSION_ID}" == "centos7" ]; then fi # Clean resolv.conf if it's not managed by system -if [ ! -L "/etc/resolv.conf" ]; then - sed -i '/^nameserver/d' /etc/resolv.conf +if [ "${IS_OFFICIAL_AMI_BUILD}" == "true" ]; then + echo "Clean resolv.conf for official AMIs" + echo -n > /etc/resolv.conf + rm -f /run/systemd/resolve/resolv.conf fi find /var/log -type f -exec /bin/rm -v {} \; From 79911772cf05a64f707be7c8d6bf9e254436d5af Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande <79726937+himani2411@users.noreply.github.com> Date: Tue, 25 Mar 2025 09:51:12 -0400 Subject: [PATCH 18/19] Remove route metric to check if it affects route tables (#2923) Co-authored-by: Himani Anil Deshpande --- .../files/redhat/dns_domain/NetworkManager.conf | 4 ---- .../files/rocky/dns_domain/NetworkManager.conf | 4 ---- 2 files changed, 8 deletions(-) diff --git a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf index 3caa0bccac..e89b4a02fd 100644 --- 
a/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/redhat/dns_domain/NetworkManager.conf @@ -23,10 +23,6 @@ plugins = ifcfg-rh, dhcp = dhclient -[connection] -ipv4.route-metric=100 -ipv6.route-metric=200 - [connectivity] enabled=false diff --git a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf index 3caa0bccac..e89b4a02fd 100644 --- a/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf +++ b/cookbooks/aws-parallelcluster-slurm/files/rocky/dns_domain/NetworkManager.conf @@ -23,10 +23,6 @@ plugins = ifcfg-rh, dhcp = dhclient -[connection] -ipv4.route-metric=100 -ipv6.route-metric=200 - [connectivity] enabled=false From 20d803fb259bf41957afe54ad48b0a5ccbbb921f Mon Sep 17 00:00:00 2001 From: Hanwen Date: Tue, 25 Mar 2025 12:29:49 -0700 Subject: [PATCH 19/19] Consider string "true" as turning on Lustre, Nvidia installation Signed-off-by: Hanwen --- cookbooks/aws-parallelcluster-environment/libraries/fsx.rb | 2 +- cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb | 2 +- .../fabric_manager/partial/_fabric_manager_common.rb | 2 +- .../resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb | 2 +- .../test/controls/enroot_spec.rb | 6 +++--- .../test/controls/nvidia_dcgm_spec.rb | 2 +- .../test/controls/nvidia_fabric_manager_spec.rb | 2 +- .../aws-parallelcluster-slurm/test/controls/pyxis_spec.rb | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb b/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb index 40f4871a79..70e1916084 100644 --- a/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb +++ b/cookbooks/aws-parallelcluster-environment/libraries/fsx.rb @@ -10,5 +10,5 @@ def aws_domain_for_fsx(region) end def lustre_enabled? 
- ['yes', true].include?(node['cluster']['lustre']['enabled']) + ['yes', true, 'true'].include?(node['cluster']['lustre']['enabled']) end diff --git a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb index 29f258490e..a5952b0313 100644 --- a/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb +++ b/cookbooks/aws-parallelcluster-platform/libraries/nvidia.rb @@ -1,5 +1,5 @@ def nvidia_enabled? - ['yes', true].include?(node['cluster']['nvidia']['enabled']) + ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) end # diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 1c5ac45aba..027766f98f 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -45,7 +45,7 @@ def _fabric_manager_enabled end def _nvidia_enabled - nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled + nvidia_enabled.nil? ? ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled end def _nvidia_driver_version diff --git a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb index 7ad1032211..bd4278f9a6 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_common.rb @@ -24,7 +24,7 @@ end def _nvidia_enabled - nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled + nvidia_enabled.nil? ? 
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled end def package_version diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb index a8d92f625e..47eac71c1d 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_expected_version_of_enroot_installed' do - only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } expected_enroot_version = node['cluster']['enroot']['version'] @@ -31,7 +31,7 @@ end control 'tag:config_enroot_enabled_on_graphic_instances' do - only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe 'enroot service should be enabled' do subject { command("enroot version") } its('exit_status') { should cmp == 0 } @@ -39,7 +39,7 @@ end control 'tag:config_enroot_disabled_on_non_graphic_instances' do - only_if { !os_properties.on_docker? && !['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.on_docker? 
&& !['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe 'enroot service should be disabled' do subject { command("enroot version") } diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb index 6ee542a651..15ddf1c512 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_dcgm_spec.rb @@ -11,7 +11,7 @@ control 'tag:install_nvidia_dcgm_installed' do only_if do - ['yes', true].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && + ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? && (!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?)) end diff --git a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb index 4b3564af59..242ce90e5f 100644 --- a/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb +++ b/cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do - only_if { !os_properties.arm? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { !os_properties.arm? 
&& ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } describe package(node['cluster']['nvidia']['fabricmanager']['package']) do it { should be_installed } diff --git a/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb b/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb index d63f77de56..0cc7e8697e 100644 --- a/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb +++ b/cookbooks/aws-parallelcluster-slurm/test/controls/pyxis_spec.rb @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and limitations under the License. control 'tag:install_pyxis_installed' do - only_if { ['yes', true].include?(node['cluster']['nvidia']['enabled']) } + only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) } title 'Checks Pyxis has been installed'