Skip to content

Commit 1dc6711

Browse files
committed
Fix supervisord service not enabled on Ubuntu
Change supervisord service script from SysVinit to Systemd for all OS. Add kitchen test to verify service is enabled. Also add check for other required services. This solves https://github.com/aws/aws-parallelcluster/wiki/ParallelCluster-3.0.0-on-Ubuntu-18-and-20:-scaling-daemon-is-down-after-a-head-node-reboot Signed-off-by: Luca Carrogu <[email protected]>
1 parent 5bf5b66 commit 1dc6711

File tree

6 files changed

+64
-330
lines changed

6 files changed

+64
-330
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,16 @@ This file is used to list changes made in each version of the AWS ParallelCluste
66
3.x.x
77
------
88

9+
**CHANGES**
10+
- Change supervisord service script from SysVinit to Systemd.
11+
- Do not configure GPUs in Slurm when Nvidia driver is not installed.
12+
-
13+
**BUG FIXES**
14+
- Fix supervisord service not enabled on Ubuntu.
15+
16+
3.x.x
17+
------
18+
919
**CHANGES**
1020
- Do not configure GPUs in Slurm when Nvidia driver is not installed.
1121

recipes/base_install.rb

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,13 +174,14 @@
174174
mode "0644"
175175
end
176176

177-
# Put init script in place
178-
template "supervisord-init" do
179-
source 'supervisord-init.erb'
180-
path "/etc/init.d/supervisord"
177+
# Put supervisord service in place
178+
template "supervisord-service" do
179+
source 'supervisord-service.erb'
180+
path "/etc/systemd/system/supervisord.service"
181181
owner "root"
182182
group "root"
183-
mode "0755"
183+
mode "0644"
184+
only_if { node['init_package'] == 'systemd' }
184185
end
185186

186187
# AMI cleanup script

recipes/tests.rb

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@
5858
# Slurm
5959
###################
6060
if node['cluster']['scheduler'] == 'slurm'
61+
execute 'check munge service is enabled' do
62+
command "systemctl is-enabled munge"
63+
end
6164
case node['cluster']['node_type']
6265
when 'HeadNode'
6366
execute 'execute sinfo' do
@@ -86,6 +89,9 @@
8689
execute 'ensure-pmix-shared-library-can-be-found' do
8790
command '/opt/pmix/bin/pmix_info'
8891
end
92+
execute 'check slurmctld service is enabled' do
93+
command "systemctl is-enabled slurmctld"
94+
end
8995
when 'ComputeFleet'
9096
execute 'ls slurm root' do
9197
command "ls /opt/slurm"
@@ -128,6 +134,10 @@
128134
command chrony_check_command
129135
end
130136

137+
execute 'check chrony service is enabled' do
138+
command "systemctl is-enabled #{node['cluster']['chrony']['service']}"
139+
end
140+
131141
execute 'check chrony conf' do
132142
command "chronyc waitsync 30; chronyc tracking | grep -i reference | grep 169.254.169.123"
133143
user node['cluster']['cluster_user']
@@ -155,6 +165,9 @@
155165
end
156166

157167
if node['conditions']['dcv_supported'] && node['cluster']['dcv_enabled'] == "head_node" && node['cluster']['node_type'] == "HeadNode"
168+
execute 'check dcvserver service is enabled' do
169+
command "systemctl is-enabled dcvserver"
170+
end
158171
execute 'check systemd default runlevel' do
159172
command "systemctl get-default | grep -i graphical.target"
160173
end
@@ -369,6 +382,9 @@ module load intelmpi && mpirun --help | grep '#{node['cluster']['intelmpi']['kit
369382
EPHEMERAL
370383
user node['cluster']['cluster_user']
371384
end
385+
execute 'check setup-ephemeral service is enabled' do
386+
command "systemctl is-enabled setup-ephemeral"
387+
end
372388

373389
###################
374390
# Pcluster AWSBatch CLI
@@ -459,3 +475,18 @@ module load intelmpi && mpirun --help | grep '#{node['cluster']['intelmpi']['kit
459475
NOFFTW
460476
end
461477
end
478+
479+
###################
480+
# Verify required services are enabled
481+
###################
482+
if node['cluster']['node_type'] == 'HeadNode'
483+
execute 'check parallelcluster-iptables service is enabled' do
484+
command "systemctl is-enabled parallelcluster-iptables"
485+
end
486+
end
487+
execute 'check supervisord service is enabled' do
488+
command "systemctl is-enabled supervisord"
489+
end
490+
execute 'check ec2blkdev service is enabled' do
491+
command "systemctl is-enabled ec2blkdev"
492+
end

templates/default/supervisord-init.erb

Lines changed: 0 additions & 152 deletions
This file was deleted.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Downloaded from:
2+
# https://git.launchpad.net/ubuntu/+source/supervisor/tree/debian/supervisor.service
3+
[Unit]
4+
Description=Supervisor process control system for UNIX
5+
Documentation=http://supervisord.org
6+
After=network.target
7+
8+
[Service]
9+
ExecStart=<%= node['cluster']['cookbook_virtualenv_path'] %>/bin/supervisord -n -c /etc/supervisord.conf
10+
ExecStop=<%= node['cluster']['cookbook_virtualenv_path'] %>/bin/supervisorctl $OPTIONS shutdown
11+
ExecReload=<%= node['cluster']['cookbook_virtualenv_path'] %>/bin/supervisorctl -c /etc/supervisord.conf $OPTIONS reload
12+
KillMode=process
13+
Restart=on-failure
14+
RestartSec=50s
15+
16+
[Install]
17+
WantedBy=multi-user.target

0 commit comments

Comments
 (0)