Skip to content

Commit a4a202e

Browse files
committed
Fix the way Pyxis and Enroot are configured.
1. Pyxis is disabled by default. In particular, the SPANK config file and the Pyxis config file required to enable it are stored in the `/opt/parallelcluster/examples` folder, so that they are ineffective but can be used by the user to enable Pyxis by simply moving them to the expected location. 2. Move Pyxis and Enroot configuration to build time (there was no reason to configure Pyxis and Enroot at runtime). 3. Changed Pyxis runtime path to `/run/pyxis`. As per the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#slurm-plugstack-configuration), a tmpfs should be used. As a consequence, we needed to define a tmpfiles config to make sure that the dedicated folder is not deleted at boot time. 4. Changed Enroot paths, following the suggestion in the [documentation](https://github.com/NVIDIA/pyxis/wiki/Setup#enroot-configuration-example): 1. Using tmpfs storage for `ENROOT_RUNTIME_PATH` and `ENROOT_DATA_PATH`. 2. Using a persistent local storage for `ENROOT_CACHE_PATH` and `ENROOT_CONFIG_PATH`. 5. *Minor*: Moved Pyxis attributes from the platform cookbook to the slurm cookbook because Pyxis is a SLURM plugin, so it would be conceptually wrong to have its attributes defined in the platform cookbook. Signed-off-by: Giacomo Marciani <[email protected]>
1 parent e87af87 commit a4a202e

File tree

19 files changed

+475
-108
lines changed

19 files changed

+475
-108
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
# ArmPL
1010
default['conditions']['arm_pl_supported'] = arm_instance?
1111

12-
# Enroot + Pyxis
12+
# Enroot
1313
default['cluster']['enroot']['version'] = '3.4.1'
14-
default['cluster']['pyxis']['version'] = '0.20.0'
14+
default['cluster']['enroot']['temporary_dir'] = '/run/enroot'
15+
default['cluster']['enroot']['persistent_dir'] = '/var/enroot'
1516

1617
# NVidia
1718
default['cluster']['nvidia']['enabled'] = 'no'

cookbooks/aws-parallelcluster-platform/recipes/config.rb

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,3 @@
2626
include_recipe 'aws-parallelcluster-platform::supervisord_config'
2727
fetch_config 'Fetch and load cluster configs'
2828
include_recipe 'aws-parallelcluster-platform::config_login' if node['cluster']['node_type'] == 'LoginNode'
29-
enroot 'Configure Enroot' do
30-
action :configure
31-
end

cookbooks/aws-parallelcluster-platform/recipes/install/directories.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
directory node['cluster']['license_dir']
2222
directory node['cluster']['configs_dir']
2323
directory node['cluster']['shared_dir']
24+
directory node['cluster']['examples_dir']
2425
directory node['cluster']['shared_dir_login_nodes']
2526

2627
# Create ParallelCluster log folder

cookbooks/aws-parallelcluster-platform/resources/enroot/partial/_enroot_common.rb

Lines changed: 26 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# frozen_string_literal: true
22
#
3-
# Copyright:: 2013-2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
# Copyright:: 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License").
66
# You may not use this file except in compliance with the License.
@@ -17,51 +17,40 @@
1717

1818
action :setup do
1919
return if on_docker?
20-
action_install_package
21-
end
20+
return if enroot_installed
2221

23-
action :configure do
24-
return if on_docker?
25-
return unless enroot_installed
22+
action_install_package
2623

27-
cookbook_file "/tmp/enroot.template.conf" do
28-
source 'enroot/enroot.template.conf'
29-
cookbook 'aws-parallelcluster-platform'
24+
template "/etc/enroot/enroot.conf" do
25+
source 'enroot/enroot.conf.erb'
3026
owner 'root'
3127
group 'root'
32-
mode '0755'
28+
mode '0644'
3329
action :create_if_missing
3430
end
3531

36-
bash "Configure enroot" do
37-
user 'root'
38-
code <<-ENROOT_CONFIGURE
39-
set -e
40-
ENROOT_CONFIG_RELEASE=pyxis
41-
SHARED_DIR=#{node['cluster']['shared_dir']}
42-
NONROOT_USER=#{node['cluster']['cluster_user']}
43-
mkdir -p ${SHARED_DIR}/enroot
44-
chown ${NONROOT_USER} ${SHARED_DIR}/enroot
45-
ENROOT_CACHE_PATH=${SHARED_DIR}/enroot envsubst < /tmp/enroot.template.conf > /tmp/enroot.conf
46-
mv /tmp/enroot.conf /etc/enroot/enroot.conf
47-
chmod 0644 /etc/enroot/enroot.conf
48-
49-
mkdir -p /tmp/enroot
50-
chmod 1777 /tmp/enroot
51-
mkdir -p /tmp/enroot/data
52-
chmod 1777 /tmp/enroot/data
53-
54-
chmod 1777 ${SHARED_DIR}/enroot
32+
directory node['cluster']['enroot']['persistent_dir'] do
33+
owner 'root'
34+
group 'root'
35+
mode '1777'
36+
recursive true
37+
end
5538

56-
mkdir -p ${SHARED_DIR}/pyxis/
57-
chown ${NONROOT_USER} ${SHARED_DIR}/pyxis/
58-
sed -i '${s/$/ runtime_path=${SHARED_DIR}\\/pyxis/}' /opt/slurm/etc/plugstack.conf.d/pyxis.conf
59-
SHARED_DIR=${SHARED_DIR} envsubst < /opt/slurm/etc/plugstack.conf.d/pyxis.conf > /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf
60-
mv /opt/slurm/etc/plugstack.conf.d/pyxis.tmp.conf /opt/slurm/etc/plugstack.conf.d/pyxis.conf
39+
directory node['cluster']['enroot']['temporary_dir'] do
40+
owner 'root'
41+
group 'root'
42+
mode '1777'
43+
recursive true
44+
end
6145

62-
ENROOT_CONFIGURE
63-
retries 3
64-
retry_delay 5
46+
# We assume the Enroot temporary dir to be a temporary folder in /run.
47+
# Folders in /run must be defined in /usr/lib/tmpfiles.d, otherwise they get
48+
# deleted on node boot.
49+
template "/usr/lib/tmpfiles.d/enroot.conf" do
50+
source 'enroot/tmpfiles/enroot.conf.erb'
51+
owner 'root'
52+
group 'root'
53+
mode '0644'
6554
end
6655
end
6756

cookbooks/aws-parallelcluster-platform/spec/unit/recipes/directories_spec.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
is_expected.to create_directory(node['cluster']['shared_dir'])
3737
end
3838

39+
it 'creates examples directory' do
40+
is_expected.to create_directory(node['cluster']['examples_dir'])
41+
end
42+
3943
it 'creates log directory' do
4044
is_expected.to create_directory(node['cluster']['log_base_dir']).with(
4145
owner: 'root',

cookbooks/aws-parallelcluster-platform/spec/unit/resources/enroot_spec.rb

Lines changed: 124 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,9 @@ def self.setup(chef_run)
99
end
1010
end
1111
end
12-
13-
def self.configure(chef_run)
14-
chef_run.converge_dsl('aws-parallelcluster-platform') do
15-
enroot 'configure' do
16-
action :configure
17-
end
18-
end
19-
end
2012
end
2113

22-
describe 'enroot:package_version' do
14+
describe 'aws-parallelcluster-platform::enroot:package_version' do
2315
for_all_oses do |platform, version|
2416
context "on #{platform}#{version}" do
2517
cached(:chef_run) do
@@ -39,7 +31,34 @@ def self.configure(chef_run)
3931
end
4032
end
4133

42-
describe 'enroot:arch_suffix' do
34+
describe 'aws-parallelcluster-platform::enroot:enroot_installed' do
35+
for_all_oses do |platform, version|
36+
context "on #{platform}#{version}" do
37+
binary = '/usr/bin/enroot'
38+
[true, false].each do |binary_exist|
39+
context "when binary #{binary} does #{'not ' unless binary_exist}exist" do
40+
cached(:chef_run) do
41+
allow(File).to receive(:exist?).with(binary).and_return(binary_exist)
42+
runner = runner(platform: platform, version: version, step_into: ['enroot'])
43+
ConvergeEnroot.setup(runner)
44+
end
45+
46+
cached(:resource) do
47+
chef_run.find_resource('enroot', 'setup')
48+
end
49+
50+
expected_result = binary_exist
51+
52+
it "returns #{expected_result}" do
53+
expect(resource.enroot_installed).to eq(expected_result)
54+
end
55+
end
56+
end
57+
end
58+
end
59+
end
60+
61+
describe 'aws-parallelcluster-platform::enroot:arch_suffix' do
4362
for_all_oses do |platform, version|
4463
context "on #{platform}#{version} - arm" do
4564
cached(:chef_run) do
@@ -81,15 +100,109 @@ def self.configure(chef_run)
81100
end
82101
end
83102

84-
describe 'enroot:setup' do
103+
describe 'aws-parallelcluster-platform::enroot:setup' do
85104
for_all_oses do |platform, version|
86105
context "on #{platform}#{version}" do
106+
cached(:enroot_persistent_dir) { '/path/to/enroot/persistent/dir' }
107+
cached(:enroot_temporary_dir) { '/path/to/enroot/temporary/dir' }
108+
109+
context "when enroot is already installed" do
110+
let(:chef_run) do
111+
stubs_for_resource('enroot') do |res|
112+
allow(res).to receive(:enroot_installed).and_return(true)
113+
end
114+
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
115+
node.override['cluster']['enroot']['version'] = package_version
116+
node.override['cluster']['enroot']['persistent_dir'] = enroot_persistent_dir
117+
node.override['cluster']['enroot']['temporary_dir'] = enroot_temporary_dir
118+
end
119+
end
120+
121+
before do
122+
ConvergeEnroot.setup(chef_run)
123+
end
124+
125+
it 'does not create the Enroot configuration' do
126+
is_expected.not_to create_template('/etc/enroot/enroot.conf')
127+
end
128+
129+
it 'does not create the Enroot persistent directory' do
130+
is_expected.not_to create_directory(enroot_persistent_dir)
131+
end
132+
133+
it 'does not create the Enroot temporary directory' do
134+
is_expected.not_to create_directory(enroot_temporary_dir)
135+
end
136+
137+
it 'does not create the Enroot tmpfiles.d configuration' do
138+
is_expected.not_to create_template('/usr/lib/tmpfiles.d/enroot.conf')
139+
end
140+
end
141+
87142
let(:chef_run) do
143+
stubs_for_resource('enroot') do |res|
144+
allow(res).to receive(:enroot_installed).and_return(false)
145+
end
88146
runner(platform: platform, version: version, step_into: ['enroot']) do |node|
89147
node.override['cluster']['enroot']['version'] = package_version
148+
node.override['cluster']['enroot']['persistent_dir'] = enroot_persistent_dir
149+
node.override['cluster']['enroot']['temporary_dir'] = enroot_temporary_dir
90150
end
91151
end
92152

153+
before do
154+
ConvergeEnroot.setup(chef_run)
155+
end
156+
157+
it 'creates the Enroot configuration' do
158+
is_expected.to create_if_missing_template('/etc/enroot/enroot.conf').with(
159+
source: 'enroot/enroot.conf.erb',
160+
owner: 'root',
161+
group: 'root',
162+
mode: '0644'
163+
)
164+
end
165+
166+
# it 'the Enroot configuration has the correct content' do
167+
# is_expected.to render_file('/etc/enroot/enroot.conf')
168+
# .with_content("ENROOT_RUNTIME_PATH #{enroot_temporary_dir}/runtime/user-$(id -u)")
169+
# .with_content("ENROOT_DATA_PATH #{enroot_temporary_dir}/data/user-$(id -u)")
170+
# .with_content("ENROOT_CONFIG_PATH #{enroot_persistent_dir}/config/user-$(id -u)")
171+
# .with_content("ENROOT_CACHE_PATH #{enroot_persistent_dir}/cache/group-$(id -g)")
172+
# end
173+
174+
it 'creates the Enroot persistent directory' do
175+
is_expected.to create_directory(enroot_persistent_dir).with(
176+
owner: 'root',
177+
group: 'root',
178+
mode: '1777',
179+
recursive: true
180+
)
181+
end
182+
183+
it 'creates the Enroot temporary directory' do
184+
is_expected.to create_directory(enroot_temporary_dir).with(
185+
owner: 'root',
186+
group: 'root',
187+
mode: '1777',
188+
recursive: true
189+
)
190+
end
191+
192+
it 'creates the Enroot tmpfiles.d configuration' do
193+
is_expected.to create_template('/usr/lib/tmpfiles.d/enroot.conf').with(
194+
source: 'enroot/tmpfiles/enroot.conf.erb',
195+
owner: 'root',
196+
group: 'root',
197+
mode: '0644'
198+
)
199+
end
200+
201+
# it 'the Enroot tmpfile.d configuration has the correct content' do
202+
# is_expected.to render_file('/usr/lib/tmpfiles.d/enroot.conf')
203+
# .with_content("D #{enroot_temporary_dir} 0777 root root")
204+
# end
205+
93206
context 'when nvidia is enabled' do
94207
before do
95208
stubs_for_provider('enroot') do |resource|
@@ -128,44 +241,3 @@ def self.configure(chef_run)
128241
end
129242
end
130243
end
131-
132-
describe 'enroot:configure' do
133-
for_all_oses do |platform, version|
134-
context "on #{platform}#{version}" do
135-
let(:chef_run) do
136-
runner(platform: platform, version: version, step_into: ['enroot'])
137-
end
138-
139-
context 'when enroot is installed' do
140-
before do
141-
stubs_for_provider('enroot') do |resource|
142-
allow(resource).to receive(:enroot_installed).and_return(true)
143-
end
144-
ConvergeEnroot.configure(chef_run)
145-
end
146-
it 'run configure enroot script' do
147-
is_expected.to run_bash('Configure enroot')
148-
.with(retries: 3)
149-
.with(retry_delay: 5)
150-
.with(user: 'root')
151-
end
152-
end
153-
154-
context 'when enroot is not installed' do
155-
before do
156-
stubs_for_provider('enroot') do |resource|
157-
allow(resource).to receive(:enroot_installed).and_return(false)
158-
end
159-
ConvergeEnroot.configure(chef_run)
160-
end
161-
162-
it 'does not run configure enroot script' do
163-
is_expected.not_to run_bash('Configure enroot')
164-
.with(retries: 3)
165-
.with(retry_delay: 5)
166-
.with(user: 'root')
167-
end
168-
end
169-
end
170-
end
171-
end

cookbooks/aws-parallelcluster-platform/files/enroot/enroot.template.conf renamed to cookbooks/aws-parallelcluster-platform/templates/enroot/enroot.conf.erb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#ENROOT_LIBRARY_PATH /usr/lib/enroot
22
#ENROOT_SYSCONF_PATH /etc/enroot
3-
ENROOT_RUNTIME_PATH /tmp/enroot/user-$(id -u)
4-
ENROOT_CONFIG_PATH ${ENROOT_CONFIG_PATH}
5-
ENROOT_CACHE_PATH ${ENROOT_CACHE_PATH}
6-
ENROOT_DATA_PATH /tmp/enroot/data/user-$(id -u)
3+
ENROOT_RUNTIME_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/runtime/user-$(id -u)
4+
ENROOT_DATA_PATH <%= node['cluster']['enroot']['temporary_dir'] %>/data/user-$(id -u)
5+
ENROOT_CONFIG_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/config/user-$(id -u)
6+
ENROOT_CACHE_PATH <%= node['cluster']['enroot']['persistent_dir'] %>/cache/group-$(id -g)
77
#ENROOT_TEMP_PATH ${TMPDIR:-/tmp}
88

99
# Gzip program used to uncompress digest layers.
@@ -68,4 +68,4 @@ ENROOT_RESTRICT_DEV no
6868
#all_proxy
6969
#no_proxy
7070
#http_proxy
71-
#https_proxy
71+
#https_proxy
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Recreate the Enroot temporary directory at boot. The mode keeps the sticky bit (1777)
# so it matches the mode applied to the same directory by the enroot setup action.
D <%= node['cluster']['enroot']['temporary_dir'] %> 1777 root root

cookbooks/aws-parallelcluster-platform/test/controls/enroot_spec.rb

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,36 @@
1414

1515
expected_enroot_version = node['cluster']['enroot']['version']
1616

17-
describe "gdrcopy version is expected to be #{expected_enroot_version}" do
17+
describe "enroot version is expected to be #{expected_enroot_version}" do
1818
subject { command('enroot version').stdout.strip() }
1919
it { should eq expected_enroot_version }
2020
end
21+
22+
persistent_dirs = %w(/etc/enroot /var/enroot)
23+
persistent_dirs.each do |path|
24+
describe directory(path) do
25+
it { should exist }
26+
its('owner') { should eq 'root' }
27+
its('group') { should eq 'root' }
28+
its('mode') { should cmp '01777' }
29+
end
30+
end
31+
32+
temporary_dirs = [ "/run/enroot" ]
33+
temporary_dirs.each do |path|
34+
describe directory(path) do
35+
it { should exist }
36+
its('owner') { should eq 'root' }
37+
its('group') { should eq 'root' }
38+
its('mode') { should cmp '01777' }
39+
end
40+
end
2141
end
2242

2343
control 'tag:config_enroot_enabled_on_graphic_instances' do
2444
only_if { !os_properties.on_docker? && ['yes', true].include?(node['cluster']['nvidia']['enabled']) }
2545

26-
describe file("/opt/parallelcluster/shared/enroot") do
46+
describe file("/var/enroot/cache-group-1000") do
2747
it { should exist }
2848
its('group') { should eq 'root' }
2949
end unless os_properties.redhat_on_docker?

0 commit comments

Comments
 (0)