
Commit 5bf5b66

change gres conf and partition conf with no gpu
Signed-off-by: chenwany <[email protected]>
1 parent d35da53 commit 5bf5b66

12 files changed (+61, -46)

CHANGELOG.md

Lines changed: 6 additions & 0 deletions

@@ -3,6 +3,12 @@ aws-parallelcluster-cookbook CHANGELOG
 
 This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
 
+3.x.x
+------
+
+**CHANGES**
+- Do not configure GPUs in Slurm when Nvidia driver is not installed.
+
 3.0.0
 ------
 

files/default/slurm/pcluster_slurm_config_generator.py

Lines changed: 13 additions & 6 deletions

@@ -63,10 +63,8 @@ def generate_slurm_config_files(
     is_default_queue = True  # The first queue in the queues list is the default queue
     for queue in queues:
         for file_type in ["partition", "gres"]:
-            if file_type == "gres" and no_gpu:
-                continue
             _generate_queue_config(
-                queue["Name"], queue, is_default_queue, file_type, env, pcluster_subdirectory, dryrun
+                queue["Name"], queue, is_default_queue, file_type, env, pcluster_subdirectory, dryrun, no_gpu=no_gpu
             )
         is_default_queue = False
 

@@ -109,14 +107,23 @@ def _get_head_node_private_ip():
     return _get_metadata("local-ipv4")
 
 
-def _generate_queue_config(queue_name, queue_config, is_default_queue, file_type, jinja_env, output_dir, dryrun):
+def _generate_queue_config(
+    queue_name, queue_config, is_default_queue, file_type, jinja_env, output_dir, dryrun, no_gpu=False
+):
     log.info("Generating slurm_parallelcluster_%s_%s.conf", queue_name, file_type)
     rendered_template = jinja_env.get_template(f"slurm_parallelcluster_queue_{file_type}.conf").render(
-        queue_name=queue_name, queue_config=queue_config, is_default_queue=is_default_queue
+        queue_name=queue_name, queue_config=queue_config, is_default_queue=is_default_queue, no_gpu=no_gpu
     )
     if not dryrun:
         filename = path.join(output_dir, f"slurm_parallelcluster_{queue_name}_{file_type}.conf")
-        _write_rendered_template_to_file(rendered_template, filename)
+        if file_type == "gres" and no_gpu:
+            _write_rendered_template_to_file(
+                "# This file is automatically generated by pcluster\n"
+                "# Skipping GPUs configuration because Nvidia driver is not installed",
+                filename,
+            )
+        else:
+            _write_rendered_template_to_file(rendered_template, filename)
 
 
 def _generate_slurm_parallelcluster_configs(
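
As a usage illustration, a minimal sketch of calling the generator directly with the new keyword, mirroring the call in the unit test further below; the paths are hypothetical placeholders, not taken from this commit.

from slurm.pcluster_slurm_config_generator import generate_slurm_config_files

# Sketch only: all paths below are hypothetical placeholders.
generate_slurm_config_files(
    "/opt/slurm/etc/",                    # output directory
    "/path/to/slurm/templates/",          # template directory (hypothetical)
    "/path/to/cluster-config.yaml",       # cluster config input file (hypothetical)
    "/path/to/instance-types-data.json",  # instance types data (hypothetical)
    dryrun=False,
    no_gpu=True,  # gres files get the placeholder header; partition files drop Gres= entries
)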

files/default/slurm/templates/slurm_parallelcluster_queue_partition.conf

Lines changed: 2 additions & 2 deletions

@@ -7,11 +7,11 @@
 {% set instance_type = compute_resource.InstanceType%}
 {% set sanitized_compute_name = compute_resource.Name | sanify_name %}
 {% if static_size > 0 %}
-NodeName={{ queue_name }}-st-{{ sanitized_compute_name }}-[1-{{ static_size }}] CPUs={{ compute_resource | vcpus }} State=CLOUD Feature=static,{{ instance_type }},{{ compute_resource.Name }}{% if compute_resource.Efa.Enabled %},efa{% endif %}{% if instance_type | gpus > 0 %},gpu Gres=gpu:{{ instance_type | gpu_type }}:{{ instance_type | gpus }}{% endif %}
+NodeName={{ queue_name }}-st-{{ sanitized_compute_name }}-[1-{{ static_size }}] CPUs={{ compute_resource | vcpus }} State=CLOUD Feature=static,{{ instance_type }},{{ compute_resource.Name }}{% if compute_resource.Efa.Enabled %},efa{% endif %}{% if instance_type | gpus > 0 %},gpu{% if not no_gpu %} Gres=gpu:{{ instance_type | gpu_type }}:{{ instance_type | gpus }}{% endif %}{% endif %}
 
 {% endif %}
 {% if dynamic_size > 0 %}
-NodeName={{ queue_name }}-dy-{{ sanitized_compute_name }}-[1-{{ dynamic_size }}] CPUs={{ compute_resource | vcpus }} State=CLOUD Feature=dynamic,{{ instance_type }},{{ compute_resource.Name }}{% if compute_resource.Efa.Enabled %},efa{% endif %}{% if instance_type | gpus > 0 %},gpu Gres=gpu:{{ instance_type | gpu_type }}:{{ instance_type | gpus }}{% endif %}
+NodeName={{ queue_name }}-dy-{{ sanitized_compute_name }}-[1-{{ dynamic_size }}] CPUs={{ compute_resource | vcpus }} State=CLOUD Feature=dynamic,{{ instance_type }},{{ compute_resource.Name }}{% if compute_resource.Efa.Enabled %},efa{% endif %}{% if instance_type | gpus > 0 %},gpu{% if not no_gpu %} Gres=gpu:{{ instance_type | gpu_type }}:{{ instance_type | gpus }}{% endif %}{% endif %}
 
 {% endif %}
 {% if static_size > 0 or dynamic_size > 0 %}
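
To make the template change concrete, here is a hypothetical rendering of the static node line for a queue named "gpu" with two g4dn.xlarge nodes (the compute resource name, CPU count, and gpu_type value shown are assumptions, not taken from this commit).

With the Nvidia driver installed (no_gpu is false), the Gres token is emitted:

NodeName=gpu-st-mycompute-[1-2] CPUs=4 State=CLOUD Feature=static,g4dn.xlarge,mycompute,gpu Gres=gpu:t4:1

With --no-gpu, the Gres token is omitted while the gpu feature is kept:

NodeName=gpu-st-mycompute-[1-2] CPUs=4 State=CLOUD Feature=static,g4dn.xlarge,mycompute,gpu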

libraries/helpers.rb

Lines changed: 4 additions & 7 deletions

@@ -132,15 +132,12 @@ def graphic_instance?
 end
 
 #
-# Check if the nvidia drive is installed
+# Check if Nvidia driver is installed
 #
 def nvidia_installed?
-  nvidia_installed = Mixlib::ShellOut.new("which nvidia-smi")
-  nvidia_installed.run_command
-
-  Chef::Log.info("Nvidia drive is not installed") if nvidia_installed.stdout.strip.empty?
-
-  !nvidia_installed.stdout.strip.empty?
+  nvidia_installed = ::File.exist?('/usr/bin/nvidia-smi')
+  Chef::Log.warn("Nvidia driver is not installed") unless nvidia_installed
+  nvidia_installed
 end
 
 #

recipes/compute_slurm_config.rb

Lines changed: 2 additions & 10 deletions

@@ -35,16 +35,8 @@
   retry_delay 6
 end
 
-# Check to see if there is GPU on the instance, only execute run_nvidiasmi if there is GPU and nvidia installed
-if graphic_instance?
-  if nvidia_installed?
-    execute "run_nvidiasmi" do
-      command 'nvidia-smi'
-    end
-  else
-    Chef::Log.warn("GPU instance but no Nvidia drivers found")
-  end
-end
+# Check to see if is GPU instance with Nvidia installed
+Chef::Log.warn("GPU instance but no Nvidia drivers found") if graphic_instance? && !nvidia_installed?
 
 cookbook_file '/etc/systemd/system/slurmd.service' do
   source 'slurmd.service'

recipes/head_node_slurm_config.rb

Lines changed: 4 additions & 9 deletions

@@ -91,16 +91,11 @@
 end
 
 # Generate pcluster specific configs
+no_gpu = nvidia_installed? ? "" : "--no-gpu"
 execute "generate_pcluster_slurm_configs" do
-  if nvidia_installed?
-    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
-            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-            " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']}"
-  else
-    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
-            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-            " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']} --no-gpu"
-  end
+  command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
+          " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+          " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']} #{no_gpu}"
 end
 
 # all other OSs use /sys/fs/cgroup, which is the default

recipes/update_head_node_slurm.rb

Lines changed: 4 additions & 9 deletions

@@ -34,16 +34,11 @@
   retry_delay 5
 end
 # Generate pcluster specific configs
+no_gpu = nvidia_installed? ? "" : "--no-gpu"
 execute "generate_pcluster_slurm_configs" do
-  if nvidia_installed?
-    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
-            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-            " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']}"
-  else
-    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
-            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-            " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']} --no-gpu"
-  end
+  command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
+          " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+          " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']} #{no_gpu}"
 end
 
 execute 'stop clustermgtd' do

test/unit/slurm/test_slurm_config_generator.py

Lines changed: 12 additions & 3 deletions

@@ -1,11 +1,16 @@
 import os
 
+import pytest
 import slurm
 from assertpy import assert_that
 from slurm.pcluster_slurm_config_generator import generate_slurm_config_files
 
 
-def test_generate_slurm_config_files(mocker, test_datadir, tmpdir):
+@pytest.mark.parametrize(
+    "no_gpu",
+    [False, True],
+)
+def test_generate_slurm_config_files(mocker, test_datadir, tmpdir, no_gpu):
     input_file = str(test_datadir / "sample_input.yaml")
     instance_types_data = str(test_datadir / "sample_instance_types_data.json")
 

@@ -14,12 +19,16 @@ def test_generate_slurm_config_files(mocker, test_datadir, tmpdir):
         "slurm.pcluster_slurm_config_generator._get_head_node_private_ip", return_value="ip.1.0.0.0", autospec=True
     )
     template_directory = os.path.dirname(slurm.__file__) + "/templates"
-    generate_slurm_config_files(tmpdir, template_directory, input_file, instance_types_data, dryrun=False, no_gpu=False)
+    generate_slurm_config_files(
+        tmpdir, template_directory, input_file, instance_types_data, dryrun=False, no_gpu=no_gpu
+    )
 
     for queue in ["efa", "gpu", "multiple_spot"]:
         for file_type in ["partition", "gres"]:
             file_name = f"pcluster/slurm_parallelcluster_{queue}_{file_type}.conf"
-            _assert_files_are_equal(tmpdir / file_name, test_datadir / "expected_outputs" / file_name)
+            no_nvidia = "_no_gpu" if (queue == "gpu" or file_type == "gres") and no_gpu else ""
+            output_file_name = f"pcluster/slurm_parallelcluster_{queue}_{file_type}{no_nvidia}.conf"
+            _assert_files_are_equal(tmpdir / file_name, test_datadir / "expected_outputs" / output_file_name)
 
     for file in ["slurm_parallelcluster.conf", "slurm_parallelcluster_gres.conf"]:
         _assert_files_are_equal(tmpdir / file, test_datadir / "expected_outputs" / file)

New expected-output file (path not shown in this view)

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+# This file is automatically generated by pcluster
+# Skipping GPUs configuration because Nvidia driver is not installed

New expected-output file (path not shown in this view)

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+# This file is automatically generated by pcluster
+# Skipping GPUs configuration because Nvidia driver is not installed

0 commit comments
