Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@ This file is used to list changes made in each version of the AWS ParallelCluste
------
**ENHANCEMENTS**
- Add support for Ubuntu 24.04.
Notice that ParallelCluster official AMI for Ubuntu 24.04 does not support Lustre.
- Disable unused services, such as cups and wpa_supplicant, in official ParallelCluster AMIs to improve security.

**CHANGES**
- Upgrade Slurm to version 24.05.6.
- Upgrade Slurm to version 24.05.7.
- Upgrade NVIDIA driver to version 570.86.15 (from 550.127.08) for all OSs except AL2.
- Upgrade CUDA Toolkit to version 12.8.0 (from 12.4.1) for all OSs except AL2.
- Upgrade Python to 3.12.8 for all OSs except AL2 (from 3.9.20).
Expand All @@ -43,6 +42,10 @@ This file is used to list changes made in each version of the AWS ParallelCluste
- Remove generation of DSA keys for login nodes, as DSA became unsupported in OpenSSH 9.7+.
- Set instance ID and instance type information in Slurm upon compute nodes launch.
- Install NVIDIA drivers without the option 'no-cc-version-check', which is now deprecated in the NVIDIA installer.
- Reduce RHEL/Rocky Linux boot time through the following network customizations:
  - Giving IPv4 higher priority than IPv6
  - Disabling the Internet connectivity check
  - Configuring only the IPv4 IMDS endpoint in cloud-init

**BUG FIXES**
- Remove usage of cfn-init for compute node bootstrapping to reduce node scale-up time.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ def aws_domain_for_fsx(region)
end

def lustre_enabled?
['yes', true].include?(node['cluster']['lustre']['enabled'])
['yes', true, 'true'].include?(node['cluster']['lustre']['enabled'])
end
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and limitations under the License.

action :setup do
return if node['platform_version'].to_i == 24
apt_repository 'fsxlustreclientrepo' do
uri "https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu"
components ['main']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@

describe 'Check the presence of the cloudwatch package gpg key'
# On Ubuntu >20.04, due to an environment variable, the keyring is placed under the home directory of the user ubuntu, with root permissions
ubuntu2004 = os_properties.ubuntu2004?
ubuntu2204 = os_properties.ubuntu2204?
keyring = (ubuntu2004 || ubuntu2204) && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : ''
is_ubuntu = os_properties.ubuntu?
keyring = is_ubuntu && !os_properties.on_docker? ? '--keyring /home/ubuntu/.gnupg/pubring.kbx' : ''
sudo = os_properties.redhat_on_docker? ? '' : 'sudo'
describe bash("#{sudo} gpg --list-keys #{keyring}") do
# Don't check exit status for Ubuntu20 because it returns 2 when executed in the validate phase of a created AMI
# os_properties cannot be used in the describe block level. It can be used within an it{} block
its('exit_status') { should eq 0 } unless ubuntu2004 || ubuntu2204
its('exit_status') { should eq 0 } unless is_ubuntu
its('stdout') { should match /3B789C72/ }
its('stdout') { should match /Amazon CloudWatch Agent/ }
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
end
end

if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker? && !os_properties.ubuntu2404?
if os_properties.redhat? && inspec.os.release.to_f >= 8.2 && !os_properties.on_docker?
# TODO: restore installation and check on docker when Lustre is available for RH8.9
# See: https://docs.aws.amazon.com/fsx/latest/LustreGuide/install-lustre-client.html
unless inspec.os.release.to_f == 8.7 && (node['cluster']['kernel_release'].include?("4.18.0-425.3.1.el8") || node['cluster']['kernel_release'].include?("4.18.0-425.13.1.el8_7"))
Expand Down Expand Up @@ -55,7 +55,7 @@
end
end

if os_properties.debian_family? && !os_properties.ubuntu2404?
if os_properties.debian_family?
describe apt('https://fsx-lustre-client-repo.s3.amazonaws.com/ubuntu') do
it { should exist }
it { should be_enabled }
Expand Down Expand Up @@ -89,7 +89,7 @@

control 'tag:install_lustre_lnet_kernel_module_enabled' do
title "Verify that lnet kernel module is enabled"
only_if { !os_properties.on_docker? && !os_properties.alinux? && !os_properties.ubuntu2404? }
only_if { !os_properties.on_docker? && !os_properties.alinux? }
describe kernel_module("lnet") do
it { should be_loaded }
it { should_not be_disabled }
Expand All @@ -98,15 +98,15 @@
end

control 'lustre_mounted' do
only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? }
only_if { !os_properties.on_docker? }
describe mount('/shared_dir') do
it { should be_mounted }
its('type') { should eq 'lustre' }
end
end

control 'lustre_unmounted' do
only_if { !os_properties.on_docker? && !os_properties.ubuntu2404? }
only_if { !os_properties.on_docker? }

describe mount('/shared_dir') do
it { should_not be_mounted }
Expand Down
9 changes: 9 additions & 0 deletions cookbooks/aws-parallelcluster-platform/files/ami_cleanup.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

IS_OFFICIAL_AMI_BUILD=${1:-"false"}

# clean up cloud init artifacts https://cloudinit.readthedocs.io/en/latest/topics/cli.html#clean
cloud-init clean -s

Expand All @@ -20,5 +22,12 @@ if [ "${ID}${VERSION_ID}" == "centos7" ]; then
rm -f /etc/sysconfig/network-scripts/ifcfg-eth0
fi

# Clean resolv.conf if it's not managed by system
# Runs only when IS_OFFICIAL_AMI_BUILD (the first script argument, defaulting to "false") is "true".
if [ "${IS_OFFICIAL_AMI_BUILD}" == "true" ]; then
echo "Clean resolv.conf for official AMIs"
# Truncate the file to zero length instead of deleting it
echo -n > /etc/resolv.conf
# -f: do not fail when the file does not exist
rm -f /run/systemd/resolve/resolv.conf
fi

find /var/log -type f -exec /bin/rm -v {} \;
touch /var/log/lastlog
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
def nvidia_enabled?
['yes', true].include?(node['cluster']['nvidia']['enabled'])
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled'])
end

#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@
bash "copy_auth_file" do
code <<-PERMS
set -e
cp -p #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
cp #{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
chmod --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
chown --reference=#{node['cluster']['shared_dir']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
PERMS
only_if { node['cluster']['default_user_home'] == 'local' }
end
Expand All @@ -90,7 +92,9 @@
bash "copy_auth_file" do
code <<-PERMS
set -e
cp -p #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
cp #{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
chmod --reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
chown --reference=#{node['cluster']['shared_dir_login_nodes']}/authorized_keys #{node['cluster']['cluster_user_home']}/.ssh/authorized_keys
PERMS
only_if { node['cluster']['default_user_home'] == 'local' }
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _fabric_manager_enabled
end

def _nvidia_enabled
nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled
nvidia_enabled.nil? ? ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled
end

def _nvidia_driver_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
end

def _nvidia_enabled
nvidia_enabled.nil? ? ['yes', true].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled
nvidia_enabled.nil? ? ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) : nvidia_enabled
end

def package_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,13 @@
title 'Check the configuration to disable c states'
only_if { !os_properties.on_docker? && os_properties.x86? }

if os_properties.ubuntu2004?
describe file('/etc/default/grub') do
it { should exist }
its('content') { should match(/processor.max_cstate=1/) }
its('content') { should match(/intel_idle.max_cstate=1/) }
end
describe file('/boot/grub/grub.cfg') do
it { should exist }
its('content') { should match(/processor.max_cstate=1/) }
its('content') { should match(/intel_idle.max_cstate=1/) }
if os_properties.ubuntu?
%w(/etc/default/grub /boot/grub/grub.cfg).each do |file_path|
describe file(file_path) do
it { should exist }
its('content') { should match(/processor.max_cstate=1/) }
its('content') { should match(/intel_idle.max_cstate=1/) }
end
end
else
describe bash('cpupower idle-info') do
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and limitations under the License.

control 'tag:install_expected_version_of_enroot_installed' do
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }

expected_enroot_version = node['cluster']['enroot']['version']

Expand All @@ -31,15 +31,15 @@
end

control 'tag:config_enroot_enabled_on_graphic_instances' do
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
only_if { !os_properties.on_docker? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }
describe 'enroot service should be enabled' do
subject { command("enroot version") }
its('exit_status') { should cmp == 0 }
end
end

control 'tag:config_enroot_disabled_on_non_graphic_instances' do
only_if { !os_properties.on_docker? && !['yes', true].include?(node['cluster']['nvidia']['enabled']) }
only_if { !os_properties.on_docker? && !['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }

describe 'enroot service should be disabled' do
subject { command("enroot version") }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

control 'tag:install_nvidia_dcgm_installed' do
only_if do
['yes', true].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? &&
['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !instance.custom_ami? &&
(!os_properties.arm? || !(os_properties.alinux2? || os_properties.centos?))
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and limitations under the License.

control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do
only_if { !os_properties.arm? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }

describe package(node['cluster']['nvidia']['fabricmanager']['package']) do
it { should be_installed }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
control 'tag:config_sticky_bits_configured' do
title 'Check sticky bits configuration'

if (os_properties.ubuntu2004? || os_properties.ubuntu2204?) && !os_properties.on_docker?
if os_properties.ubuntu? && !os_properties.on_docker?
# This test passes on Mac but doesn't work as GitHub action.
describe kernel_parameter('fs.protected_regular') do
its('value') { should eq 0 }
Expand Down
4 changes: 2 additions & 2 deletions cookbooks/aws-parallelcluster-slurm/attributes/versions.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Slurm
default['cluster']['slurm']['version'] = '24-05-6-1'
default['cluster']['slurm']['version'] = '24-05-7-1'
default['cluster']['slurm']['commit'] = ''
default['cluster']['slurm']['branch'] = ''
default['cluster']['slurm']['sha256'] = '0ba810649ebc1c3b1c1d7102dbd5365e53fd7ce7c25ab2108bd0196b6988ddb2'
default['cluster']['slurm']['sha256'] = '297e85853314a0a4d227ca66bb44179c099f0de5d86e83ffe21cb464b9ad3709'
default['cluster']['slurm']['base_url'] = "#{node['cluster']['artifacts_s3_url']}/dependencies/slurm"
# Munge
default['cluster']['munge']['munge_version'] = '0.5.16'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# cloud-init drop-in: restrict the EC2 datasource's metadata URL list to the
# IPv4 IMDS endpoint only.
datasource:
  Ec2:
    metadata_urls: [ 'http://169.254.169.254' ]
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
plugins = ifcfg-rh,
dhcp = dhclient

[connectivity]
enabled=false

[logging]
# When debugging NetworkManager, enabling debug logging is of great help.
#
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# cloud-init drop-in: restrict the EC2 datasource's metadata URL list to the
# IPv4 IMDS endpoint only.
datasource:
  Ec2:
    metadata_urls: [ 'http://169.254.169.254' ]
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
plugins = ifcfg-rh,
dhcp = dhclient

[connectivity]
enabled=false

[logging]
# When debugging NetworkManager, enabling debug logging is of great help.
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@
mode '0644'
end

# Disable the IPv6 IMDS endpoint in cloud-init to speed up boot.
# Installs the drop-in file shipped in the aws-parallelcluster-slurm cookbook
# (source: dns_domain/99-disable-ipv6-metadata.cfg) under /etc/cloud/cloud.cfg.d/,
# owned by root:root with mode 0644.
cookbook_file '99-disable-ipv6-metadata.cfg' do
path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg'
source 'dns_domain/99-disable-ipv6-metadata.cfg'
cookbook 'aws-parallelcluster-slurm'
user 'root'
group 'root'
mode '0644'
end

action_update_search_domain
network_service 'Restart network service'
end
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,16 @@
mode '0644'
end

# Disable the IPv6 IMDS endpoint in cloud-init to speed up boot.
# Installs the drop-in file shipped in the aws-parallelcluster-slurm cookbook
# (source: dns_domain/99-disable-ipv6-metadata.cfg) under /etc/cloud/cloud.cfg.d/,
# owned by root:root with mode 0644.
cookbook_file '99-disable-ipv6-metadata.cfg' do
path '/etc/cloud/cloud.cfg.d/99-disable-ipv6-metadata.cfg'
source 'dns_domain/99-disable-ipv6-metadata.cfg'
cookbook 'aws-parallelcluster-slurm'
user 'root'
group 'root'
mode '0644'
end

action_update_search_domain
network_service 'Restart network service'
end
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,21 @@
title "MySql client is installed"

mysql_packages = []
ubuntu = os_properties.ubuntu?
if os.redhat?
mysql_packages.concat %w(mysql-community-client-plugins mysql-community-common
mysql-community-devel mysql-community-libs)
if os_properties.alinux2? || os_properties.centos7?
mysql_packages.concat %w(mysql-community-libs-compat)
end
elsif os_properties.ubuntu2004? || os_properties.ubuntu2204?
elsif ubuntu
mysql_packages.concat %w(libmysqlclient-dev libmysqlclient21)
else
describe "unsupported OS" do
pending "support for #{os.name}-#{os.release} needs to be implemented"
end
end

ubuntu = os_properties.ubuntu?

mysql_packages.each do |pkg|
describe package(pkg) do
it { should be_installed }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# See the License for the specific language governing permissions and limitations under the License.

control 'tag:install_pyxis_installed' do
only_if { ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }

title 'Checks Pyxis has been installed'

Expand Down
Loading
Loading