
Commit d35da53

yuleiwan authored and chenwany committed
3.0 Make slurm work when using GPU instances with no NVIDIA drivers installed
* If the NVIDIA driver is not installed on the head node AMI, don't generate the gres GPU configuration for Slurm.

Signed-off-by: Yulei Wang <[email protected]>
1 parent d29abb5 commit d35da53
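The change threads a single flag through the stack: the Chef recipes check whether nvidia-smi is available and, when it is not, append --no-gpu to the config-generator invocation, which in turn skips rendering the gres files. A minimal Python sketch of that decision, using shutil.which as a stand-in for the recipes' `which nvidia-smi` shell-out:

import shutil


def generator_args(base_args):
    """Append --no-gpu when the NVIDIA driver (nvidia-smi) is not on PATH.

    Sketch only: the real check lives in libraries/helpers.rb (Mixlib::ShellOut)
    and the flag is appended by the head-node recipes, not by the generator itself.
    """
    if shutil.which("nvidia-smi") is None:
        return base_args + ["--no-gpu"]
    return base_args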

File tree: 6 files changed, +56 -12 lines changed

files/default/slurm/pcluster_slurm_config_generator.py

Lines changed: 18 additions & 2 deletions
@@ -30,7 +30,9 @@ class CriticalError(Exception):
     pass
 
 
-def generate_slurm_config_files(output_directory, template_directory, input_file, instance_types_data_path, dryrun):
+def generate_slurm_config_files(
+    output_directory, template_directory, input_file, instance_types_data_path, dryrun, no_gpu
+):
     """
     Generate Slurm configuration files.
 
@@ -61,6 +63,8 @@ def generate_slurm_config_files(output_directory, template_directory, input_file
     is_default_queue = True  # The first queue in the queues list is the default queue
     for queue in queues:
         for file_type in ["partition", "gres"]:
+            if file_type == "gres" and no_gpu:
+                continue
             _generate_queue_config(
                 queue["Name"], queue, is_default_queue, file_type, env, pcluster_subdirectory, dryrun
             )
@@ -272,9 +276,21 @@ def main():
             required=False,
             default=False,
         )
+        parser.add_argument(
+            "--no-gpu",
+            action="store_true",
+            help="no gpu configuration",
+            required=False,
+            default=False,
+        )
         args = parser.parse_args()
         generate_slurm_config_files(
-            args.output_directory, args.template_directory, args.input_file, args.instance_types_data, args.dryrun
+            args.output_directory,
+            args.template_directory,
+            args.input_file,
+            args.instance_types_data,
+            args.dryrun,
+            args.no_gpu,
         )
     except Exception as e:
         log.exception("Failed to generate slurm configurations, exception: %s", e)
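Inside the generator, the new no_gpu parameter short-circuits only the gres templates; partition files are still rendered for every queue. A standalone sketch of the loop added above:

def files_to_generate(queues, no_gpu):
    """Yield (queue_name, file_type) pairs the generator will render.

    Mirrors the loop added in this commit: gres templates are skipped
    when --no-gpu is passed, partition files are always produced.
    """
    for queue in queues:
        for file_type in ["partition", "gres"]:
            if file_type == "gres" and no_gpu:
                continue
            yield queue["Name"], file_type


# With no_gpu=True only partition files remain:
# list(files_to_generate([{"Name": "gpu"}], no_gpu=True)) == [("gpu", "partition")]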

libraries/helpers.rb

Lines changed: 12 additions & 0 deletions
@@ -131,6 +131,18 @@ def graphic_instance?
   !has_gpu.stdout.strip.empty?
 end
 
+#
+# Check if the nvidia drive is installed
+#
+def nvidia_installed?
+  nvidia_installed = Mixlib::ShellOut.new("which nvidia-smi")
+  nvidia_installed.run_command
+
+  Chef::Log.info("Nvidia drive is not installed") if nvidia_installed.stdout.strip.empty?
+
+  !nvidia_installed.stdout.strip.empty?
+end
+
 #
 # Check if the AMI is bootstrapped
 #
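The new nvidia_installed? helper shells out to `which nvidia-smi` and treats empty output as "driver not installed". A rough Python equivalent (the cookbook itself uses Ruby and Mixlib::ShellOut; subprocess here is only an analogy):

import subprocess


def nvidia_installed() -> bool:
    """Return True when `which nvidia-smi` resolves to a path.

    Approximation of the nvidia_installed? helper above; the real code
    logs through Chef::Log.info when the driver is missing.
    """
    result = subprocess.run(
        ["which", "nvidia-smi"], capture_output=True, text=True, check=False
    )
    return bool(result.stdout.strip())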

recipes/compute_slurm_config.rb

Lines changed: 7 additions & 3 deletions
@@ -35,10 +35,14 @@
   retry_delay 6
 end
 
-# Check to see if there is GPU on the instance, only execute run_nvidiasmi if there is GPU
+# Check to see if there is GPU on the instance, only execute run_nvidiasmi if there is GPU and nvidia installed
 if graphic_instance?
-  execute "run_nvidiasmi" do
-    command 'nvidia-smi'
+  if nvidia_installed?
+    execute "run_nvidiasmi" do
+      command 'nvidia-smi'
+    end
+  else
+    Chef::Log.warn("GPU instance but no Nvidia drivers found")
   end
 end
 
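On compute nodes the recipe now gates the nvidia-smi warm-up on two conditions: the instance must expose a GPU device and the driver must be present; a GPU instance without drivers only logs a warning. Condensed as a small decision sketch:

def gpu_warmup_action(has_gpu: bool, driver_installed: bool) -> str:
    """Decision made by recipes/compute_slurm_config.rb after this change."""
    if not has_gpu:
        return "skip"                                   # non-GPU instance: nothing to do
    if driver_installed:
        return "run nvidia-smi"                         # GPU + driver: warm up the device
    return "warn: GPU instance but no Nvidia drivers"   # GPU without driver: warn only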

recipes/head_node_slurm_config.rb

Lines changed: 9 additions & 3 deletions
@@ -92,9 +92,15 @@
 
 # Generate pcluster specific configs
 execute "generate_pcluster_slurm_configs" do
-  command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
-          " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-          " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']}"
+  if nvidia_installed?
+    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
+            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+            " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']}"
+  else
+    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py"\
+            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+            " --input-file #{node['cluster']['cluster_config_path']} --instance-types-data #{node['cluster']['instance_types_data_path']} --no-gpu"
+  end
 end
 
 # all other OSs use /sys/fs/cgroup, which is the default
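Both head-node recipes (this one and recipes/update_head_node_slurm.rb below) now assemble two nearly identical generator commands that differ only in the trailing --no-gpu. A sketch of the same idea with the command built as a list, where python_bin, scripts_dir, and the data paths are placeholders for the node attributes used in the real recipes:

def build_generator_command(python_bin, scripts_dir, config_path, types_data_path, no_gpu):
    """Assemble the pcluster_slurm_config_generator.py invocation.

    Placeholder arguments stand in for the Chef node attributes; only the
    optional --no-gpu flag differs between the two recipe branches.
    """
    cmd = [
        python_bin,
        f"{scripts_dir}/slurm/pcluster_slurm_config_generator.py",
        "--output-directory", "/opt/slurm/etc/",
        "--template-directory", f"{scripts_dir}/slurm/templates/",
        "--input-file", config_path,
        "--instance-types-data", types_data_path,
    ]
    if no_gpu:
        cmd.append("--no-gpu")
    return cmd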

recipes/update_head_node_slurm.rb

Lines changed: 9 additions & 3 deletions
@@ -35,9 +35,15 @@
 end
 # Generate pcluster specific configs
 execute "generate_pcluster_slurm_configs" do
-  command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
-          " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
-          " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']}"
+  if nvidia_installed?
+    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
+            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+            " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']}"
+  else
+    command "#{node['cluster']['cookbook_virtualenv_path']}/bin/python #{node['cluster']['scripts_dir']}/slurm/pcluster_slurm_config_generator.py" \
+            " --output-directory /opt/slurm/etc/ --template-directory #{node['cluster']['scripts_dir']}/slurm/templates/"\
+            " --input-file #{updated_cluster_config_path} --instance-types-data #{node['cluster']['instance_types_data_path']} --no-gpu"
+  end
 end
 
 execute 'stop clustermgtd' do

test/unit/slurm/test_slurm_config_generator.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ def test_generate_slurm_config_files(mocker, test_datadir, tmpdir):
         "slurm.pcluster_slurm_config_generator._get_head_node_private_ip", return_value="ip.1.0.0.0", autospec=True
     )
     template_directory = os.path.dirname(slurm.__file__) + "/templates"
-    generate_slurm_config_files(tmpdir, template_directory, input_file, instance_types_data, dryrun=False)
+    generate_slurm_config_files(tmpdir, template_directory, input_file, instance_types_data, dryrun=False, no_gpu=False)
 
     for queue in ["efa", "gpu", "multiple_spot"]:
         for file_type in ["partition", "gres"]:
