Skip to content

Commit a0505f9

Browse files
committed
Fix the way Pyxis and Enroot are configured.
1. Pyxis is disabled by default. In particular, the SPANK config file and the Pyxis config file required to enable it are stored in the `/opt/parallelcluster/examples` folder, so that they are ineffective but can be used by the user to enable Pyxis by simply moving them to the expected location. 2. Moved Pyxis and Enroot configuration to build time (there was no reason to configure Pyxis and Enroot at runtime). 3. Changed the Pyxis runtime path to `/run/pyxis`. As per the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#slurm-plugstack-configuration), a tmpfs should be used. As a consequence, we needed to define a tmpfiles config to make sure that the dedicated folder is not deleted at boot time. 4. Changed the Enroot paths, following the suggestion in the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#enroot-configuration-example): 1. Using tmpfs storage for `ENROOT_RUNTIME_PATH` and `ENROOT_DATA_PATH`. 2. Using persistent local storage for `ENROOT_CACHE_PATH` and `ENROOT_CONFIG_PATH`. 5. *Minor*: Moved the Pyxis attributes from the platform cookbook to the slurm cookbook, because Pyxis is a SLURM plugin and it would be conceptually wrong to have its attributes defined in the platform cookbook. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent 9282157 commit a0505f9

File tree

17 files changed

+349
-109
lines changed

17 files changed

+349
-109
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
# ArmPL
1010
default['conditions']['arm_pl_supported'] = arm_instance?
1111

12-
# Enroot + Pyxis
12+
# Enroot
1313
default['cluster']['enroot']['version'] = '3.4.1'
14-
default['cluster']['pyxis']['version'] = '0.20.0'
14+
default['cluster']['enroot']['temporary_dir'] = '/run/enroot'
15+
default['cluster']['enroot']['persistent_dir'] = '/var/enroot'
1516

1617
# NVidia
1718
default['cluster']['nvidia']['enabled'] = 'no'

cookbooks/aws-parallelcluster-platform/recipes/config.rb

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,3 @@
2626
include_recipe 'aws-parallelcluster-platform::supervisord_config'
2727
fetch_config 'Fetch and load cluster configs'
2828
include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'
29-
enroot 'Configure Enroot' do
30-
action :configure
31-
end

cookbooks/aws-parallelcluster-platform/recipes/install/directories.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
directory node['cluster']['license_dir']
2222
directory node['cluster']['configs_dir']
2323
directory node['cluster']['shared_dir']
24+
directory node['cluster']['examples_dir']
2425
directory node['cluster']['shared_dir_login_nodes']
2526

2627
# Create ParallelCluster log folder

cookbooks/aws-parallelcluster-platform/resources/enroot/partial/_enroot_common.rb

Lines changed: 24 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# frozen_string_literal: true
22
#
3-
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License").
66
# You may not use this file except in compliance with the License.
@@ -18,50 +18,36 @@
1818
action :setup do
1919
return if on_docker?
2020
action_install_package
21-
end
22-
23-
action :configure do
24-
return if on_docker?
25-
return unless enroot_installed
2621

27-
cookbook_file "/tmp/enroot.template.conf" do
28-
source 'enroot/enroot.template.conf'
29-
cookbook 'aws-parallelcluster-platform'
22+
template "/etc/enroot/enroot.conf" do
23+
source 'enroot/enroot.conf.erb'
3024
owner 'root'
3125
group 'root'
32-
mode '0755'
33-
action :create_if_missing
26+
mode '0644'
3427
end
3528

36-
bash "Configure enroot" do
37-
user 'root'
38-
code <<-ENROOT_CONFIGURE
39-
set -e
40-
ENROOT_CONFIG_RELEASE=pyxis
41-
SHARED_DIR=#{node['cluster']['shared_dir']}
42-
NONROOT_USER=#{node['cluster']['cluster_user']}
43-
mkdir -p ${SHARED_DIR}/enroot
44-
chown ${NONROOT_USER} ${SHARED_DIR}/enroot
45-
ENROOT_CACHE_PATH=${SHARED_DIR}/enroot envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
46-
mv /tmp/enroot.conf /etc/enroot/enroot.conf
47-
chmod 0644 /etc/enroot/enroot.conf
48-
49-
mkdir -p /tmp/enroot
50-
chmod 1777 /tmp/enroot
51-
mkdir -p /tmp/enroot/data
52-
chmod 1777 /tmp/enroot/data
53-
54-
chmod 1777 ${SHARED_DIR}/enroot
29+
directory node['cluster']['enroot']['persistent_dir'] do
30+
owner 'root'
31+
group 'root'
32+
mode '1777'
33+
recursive true
34+
end
5535

56-
mkdir -p ${SHARED_DIR}/pyxis/
57-
chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
58-
sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
59-
SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
60-
mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
36+
directory node['cluster']['enroot']['temporary_dir'] do
37+
owner 'root'
38+
group 'root'
39+
mode '1777'
40+
recursive true
41+
end
6142

62-
ENROOT_CONFIGURE
63-
retries 3
64-
retry_delay 5
43+
# We assume the Enroot temporary dir to be a temporary folder in /run.
44+
# Folders in /run must be defined in /usr/lib/tmpfiles.d, otherwise they get
45+
# deleted on node boot.
46+
template "/usr/lib/tmpfiles.d/enroot.conf" do
47+
source 'enroot/tmpfiles/enroot.conf.erb'
48+
owner 'root'
49+
group 'root'
50+
mode '0644'
6551
end
6652
end
6753

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/directories_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
is_expected.to create_directory(node['cluster']['shared_dir'])
3737
end
3838

39+
it 'creates examples directory' do
40+
is_expected.to create_directory(node['cluster']['examples_dir'])
41+
end
42+
3943
it 'creates log directory' do
4044
is_expected.to create_directory(node['cluster']['log_base_dir']).with(
4145
owner: 'root',

cookbooks/aws-parallelcluster-platform/spec/unit/resources/enroot_spec.rb

Lines changed: 61 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,9 @@ def self.setup(chef_run)
99
end
1010
end
1111
end
12-
13-
def self.configure(chef_run)
14-
chef_run.converge_dsl('aws-parallelcluster-platform') do
15-
enroot 'configure' do
16-
action :configure
17-
end
18-
end
19-
end
2012
end
2113

22-
describe 'enroot:package_version' do
14+
describe 'aws-parallelcluster-platform::enroot:package_version' do
2315
for_all_oses do |platform, version|
2416
context "on #{platform}#{version}" do
2517
cached(:chef_run) do
@@ -39,7 +31,7 @@ def self.configure(chef_run)
3931
end
4032
end
4133

42-
describe 'enroot:arch_suffix' do
34+
describe 'aws-parallelcluster-platform::enroot:arch_suffix' do
4335
for_all_oses do |platform, version|
4436
context "on #{platform}#{version} - arm" do
4537
cached(:chef_run) do
@@ -81,15 +73,73 @@ def self.configure(chef_run)
8173
end
8274
end
8375

84-
describe 'enroot:setup' do
76+
describe 'aws-parallelcluster-platform::enroot:setup' do
8577
for_all_oses do |platform, version|
8678
context "on #{platform}#{version}" do
79+
cached(:enroot_persistent_dir) { '/path/to/enroot/persistent/dir' }
80+
cached(:enroot_temporary_dir) { '/path/to/enroot/temporary/dir' }
81+
8782
let(:chef_run) do
8883
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
8984
node.override['cluster']['enroot']['version'] = package_version
85+
node.override['cluster']['enroot']['persistent_dir'] = enroot_persistent_dir
86+
node.override['cluster']['enroot']['temporary_dir'] = enroot_temporary_dir
9087
end
9188
end
9289

90+
before do
91+
ConvergeEnroot.setup(chef_run)
92+
end
93+
94+
it 'creates the Enroot configuration' do
95+
is_expected.to create_template('/etc/enroot/enroot.conf').with(
96+
source: 'enroot/enroot.conf.erb',
97+
owner: 'root',
98+
group: 'root',
99+
mode: '0644'
100+
)
101+
end
102+
103+
# it 'the Enroot configuration has the correct content' do
104+
# is_expected.to render_file('/etc/enroot/enroot.conf')
105+
# .with_content("ENROOT_RUNTIME_PATH #{enroot_temporary_dir}/runtime/user-$(id -u)")
106+
# .with_content("ENROOT_DATA_PATH #{enroot_temporary_dir}/data/user-$(id -u)")
107+
# .with_content("ENROOT_CONFIG_PATH #{enroot_persistent_dir}/config/user-$(id -u)")
108+
# .with_content("ENROOT_CACHE_PATH #{enroot_persistent_dir}/cache/group-$(id -g)")
109+
# end
110+
111+
it 'creates the Enroot persistent directory' do
112+
is_expected.to create_directory(enroot_persistent_dir).with(
113+
owner: 'root',
114+
group: 'root',
115+
mode: '1777',
116+
recursive: true
117+
)
118+
end
119+
120+
it 'creates the Enroot temporary directory' do
121+
is_expected.to create_directory(enroot_temporary_dir).with(
122+
owner: 'root',
123+
group: 'root',
124+
mode: '1777',
125+
recursive: true
126+
)
127+
end
128+
129+
it 'creates the Enroot tmpfiles.d configuration' do
130+
is_expected.to create_template('/usr/lib/tmpfiles.d/enroot.conf').with(
131+
source: 'enroot/tmpfiles/enroot.conf.erb',
132+
owner: 'root',
133+
group: 'root',
134+
mode: '0644'
135+
)
136+
end
137+
138+
# it 'the Enroot tmpfile.d configuration has the correct content' do
139+
# is_expected.to render_file('/usr/lib/tmpfiles.d/enroot.conf')
140+
# .with_content("D #{enroot_temporary_dir} 0777 root root")
141+
# end
142+
93143
context 'when nvidia is enabled' do
94144
before do
95145
stubs_for_provider('enroot') do |resource|
@@ -128,44 +178,3 @@ def self.configure(chef_run)
128178
end
129179
end
130180
end
131-
132-
describe 'enroot:configure' do
133-
for_all_oses do |platform, version|
134-
context "on #{platform}#{version}" do
135-
let(:chef_run) do
136-
runner(platform: platform, version: version, step_into: ['enroot'])
137-
end
138-
139-
context 'when enroot is installed' do
140-
before do
141-
stubs_for_provider('enroot') do |resource|
142-
allow(resource).to receive(:enroot_installed).and_return(true)
143-
end
144-
ConvergeEnroot.configure(chef_run)
145-
end
146-
it 'run configure enroot script' do
147-
is_expected.to run_bash('Configure enroot')
148-
.with(retries: 3)
149-
.with(retry_delay: 5)
150-
.with(user: 'root')
151-
end
152-
end
153-
154-
context 'when enroot is not installed' do
155-
before do
156-
stubs_for_provider('enroot') do |resource|
157-
allow(resource).to receive(:enroot_installed).and_return(false)
158-
end
159-
ConvergeEnroot.configure(chef_run)
160-
end
161-
162-
it 'does not run configure enroot script' do
163-
is_expected.not_to run_bash('Configure enroot')
164-
.with(retries: 3)
165-
.with(retry_delay: 5)
166-
.with(user: 'root')
167-
end
168-
end
169-
end
170-
end
171-
end

cookbooks/aws-parallelcluster-platform/files/enroot/enroot.template.conf renamed to cookbooks/aws-parallelcluster-platform/templates/enroot/enroot.conf.erb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#ENROOT_LIBRARY_PATH /usr/lib/enroot
22
#ENROOT_SYSCONF_PATH /etc/enroot
3-
ENROOT_RUNTIME_PATH /tmp/enroot/user-$(id -u)
4-
ENROOT_CONFIG_PATH ${ENROOT_CONFIG_PATH}
5-
ENROOT_CACHE_PATH ${ENROOT_CACHE_PATH}
6-
ENROOT_DATA_PATH /tmp/enroot/data/user-$(id -u)
3+
ENROOT_RUNTIME_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/runtime/user-$(id -u)
4+
ENROOT_DATA_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/data/user-$(id -u)
5+
ENROOT_CONFIG_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/config/user-$(id -u)
6+
ENROOT_CACHE_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/cache/group-$(id -g)
77
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}
88

99
# Gzip program used to uncompress digest layers.
@@ -68,4 +68,4 @@ ENROOT_RESTRICT_DEV no
6868
#all_proxy
6969
#no_proxy
7070
#http_proxy
71-
#https_proxy
71+
#https_proxy
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Recreate the Enroot temporary directory under /run (tmpfs) at every boot.
# Mode 1777 keeps the sticky bit, matching the directory created at build time,
# so users cannot remove each other's per-user runtime/data folders.
D <%= node['cluster']['enroot']['temporary_dir'] %> 1777 root root

cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,36 @@
1414

1515
expected_enroot_version = node['cluster']['enroot']['version']
1616

17-
describe "gdrcopy version is expected to be #{expected_enroot_version}" do
17+
describe "enroot version is expected to be #{expected_enroot_version}" do
1818
subject { command('enroot version').stdout.strip() }
1919
it { should eq expected_enroot_version }
2020
end
21+
22+
persistent_dirs = %w[/etc/enroot /var/enroot]
23+
persistent_dirs.each do |path|
24+
describe directory(path) do
25+
it { should exist }
26+
its('owner') { should eq 'root' }
27+
its('group') { should eq 'root' }
28+
its('mode') { should cmp '01777' }
29+
end
30+
end
31+
32+
temporary_dirs = [ "/run/enroot" ]
33+
temporary_dirs.each do |path|
34+
describe directory(path) do
35+
it { should exist }
36+
its('owner') { should eq 'root' }
37+
its('group') { should eq 'root' }
38+
its('mode') { should cmp '01777' }
39+
end
40+
end
2141
end
2242

2343
control 'tag:config_enroot_enabled_on_graphic_instances' do
2444
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
2545

26-
describe file("/opt/parallelcluster/shared/enroot") do
46+
describe file("/var/enroot/cache-group-1000") do
2747
it { should exist }
2848
its('group') { should eq 'root' }
2949
end unless os_properties.redhat_on_docker?

cookbooks/aws-parallelcluster-shared/attributes/cluster.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
default['cluster']['license_dir'] = "#{node['cluster']['base_dir']}/licenses"
55
default['cluster']['configs_dir'] = "#{node['cluster']['base_dir']}/configs"
66
default['cluster']['shared_dir'] = "#{node['cluster']['base_dir']}/shared"
7+
default['cluster']['examples_dir'] = "#{node['cluster']['base_dir']}/examples"
78
default['cluster']['shared_dir_login_nodes'] = "#{node['cluster']['base_dir']}/shared_login_nodes"
89
default['cluster']['log_base_dir'] = '/var/log/parallelcluster'
910
default['cluster']['etc_dir'] = '/etc/parallelcluster'

0 commit comments

Comments
 (0)