Skip to content

Commit 76dcada

Browse files
committed
[TEMPORARY][B200] Add nvlsm precondition check
1 parent 26a710d commit 76dcada

File tree

3 files changed

+30
-6
lines changed

3 files changed

+30
-6
lines changed

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_common.rb

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
action :install do
1919
return unless nvlsm_installation_enabled?
2020

21+
action_install_precondition
2122
action_install_nvlsm_dependencies
2223
action_install_nvlsm
2324

@@ -27,6 +28,19 @@
2728
# node_attributes 'dump node attributes'
2829
end
2930

31+
# Runs the OS-specific NVLSM precondition commands in a root shell.
# The :install action invokes this step ahead of the dependency and
# NVLSM package installation steps.
action :install_precondition do
  bash 'NVLSM Preconditions' do
    cwd node['cluster']['sources_dir']
    user 'root'
    retries 3
    retry_delay 5
    # `set -e` makes the script stop at the first failing command; the
    # commands themselves come from the platform partial.
    code <<-CODE
    set -e
    #{nvidia_nvlsm_install_preconditions_commands}
    CODE
  end
end
43+
3044
action :install_nvlsm_dependencies do
3145
package nvidia_nvlsm_dependencies do
3246
retries 3
@@ -55,10 +69,10 @@
5569
bash "Install nvlsm" do
5670
user 'root'
5771
cwd node['cluster']['sources_dir']
58-
code <<-NVIDIA_IMEX
72+
code <<-CODE
5973
set -e
6074
#{nvidia_nvlsm_install_commands}
61-
NVIDIA_IMEX
75+
CODE
6276
retries 3
6377
retry_delay 5
6478
end
@@ -91,6 +105,10 @@ def nvidia_nvlsm_install_commands
91105
# OS dependent
92106
end
93107

108+
# Hook overridden by the OS-specific partials: the shell commands to run
# before NVLSM is installed. This common default supplies none (nil).
def nvidia_nvlsm_install_preconditions_commands
  nil # OS dependent
end
111+
94112
# Hook overridden by the OS-specific partials: the packages NVLSM depends
# on. This common default supplies none (nil).
def nvidia_nvlsm_dependencies
  nil # OS dependent
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_debian.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ def nvidia_nvlsm_install_commands
2929
end
3030

3131
# Packages that must be present before NVLSM is installed on Debian-family
# platforms.
#
# Returns an Array of package names. The list has to be built with `%w`:
# a plain `%( ... )` literal is a single String, which would be handed to
# the `package` resource as one package name containing spaces.
def nvidia_nvlsm_dependencies
  %w(linux-modules-extra-aws infiniband-diags ibutils)
end
34+
35+
# Shell command run as the NVLSM install precondition on Debian-family
# platforms.
def nvidia_nvlsm_install_preconditions_commands
  'apt-cache policy'
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_rhel.rb

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ def nvidia_nvlsm_install_commands
2929
end
3030

3131
# Packages that must be present before NVLSM is installed on RHEL-family
# platforms.
#
# Returns an Array of package names. The list has to be built with `%w`:
# a plain `%( ... )` literal is a single String, which would be handed to
# the `package` resource as one package name containing spaces.
def nvidia_nvlsm_dependencies
  %w(kernel-modules-extra-aws infiniband-diags libibumad)
end
34+
35+
# Shell command run as the NVLSM install precondition on RHEL-family
# platforms.
def nvidia_nvlsm_install_preconditions_commands
  'yum repolist all'
end

0 commit comments

Comments
 (0)