Skip to content

Commit 4279f70

Browse files
committed
[B200] Debugging
1 parent e740d85 commit 4279f70

File tree

5 files changed

+67
-31
lines changed

5 files changed

+67
-31
lines changed

cookbooks/aws-parallelcluster-platform/recipes/install.rb

Lines changed: 25 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -14,33 +14,34 @@
1414
include_recipe "aws-parallelcluster-platform::sudo_install"
1515
include_recipe "aws-parallelcluster-platform::users"
1616
include_recipe "aws-parallelcluster-platform::disable_services"
17-
bash "Check kernel before repo setup" do
18-
code <<-CODE
19-
set -x
20-
uname -a
21-
yum info infiniband-diags libibumad
22-
yum provides infiniband-diags libibumad
23-
CODE
24-
end
17+
# bash "Check kernel before repo setup" do
18+
# code <<-CODE
19+
# set -x
20+
# uname -a
21+
# yum info infiniband-diags libibumad
22+
# yum provides infiniband-diags libibumad
23+
# CODE
24+
# end
2525
package_repos 'setup the repositories'
26-
bash "Check kernel after repo setup" do
27-
code <<-CODE
28-
set -x
29-
uname -a
30-
yum info infiniband-diags libibumad
31-
yum provides infiniband-diags libibumad
32-
CODE
33-
end
26+
# bash "Check kernel after repo setup" do
27+
# code <<-CODE
28+
# set -x
29+
# uname -a
30+
# yum info infiniband-diags libibumad
31+
# yum provides infiniband-diags libibumad
32+
# CODE
33+
# end
3434
include_recipe "aws-parallelcluster-platform::directories"
35+
# nvidia_nvlsm 'Install Nvidia NVLink Subnet Manager'
3536
install_packages 'Install OS and extra packages'
36-
bash "Check kernel after extra packages" do
37-
code <<-CODE
38-
set -x
39-
uname -a
40-
yum info infiniband-diags libibumad
41-
yum provides infiniband-diags libibumad
42-
CODE
43-
end
37+
# bash "Check kernel after extra packages" do
38+
# code <<-CODE
39+
# set -x
40+
# uname -a
41+
# yum info infiniband-diags libibumad
42+
# yum provides infiniband-diags libibumad
43+
# CODE
44+
# end
4445
include_recipe "aws-parallelcluster-platform::cookbook_virtualenv"
4546
include_recipe "aws-parallelcluster-platform::awscli"
4647
unless alinux2023_on_docker? # Running this recipe on Alinux 2023 docker generates false failure.

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_common.rb

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
user 'root'
3030
cwd node['cluster']['sources_dir']
3131
code <<-CODE
32-
set -e
32+
set -ex
3333
#{nvidia_nvlsm_install_preconditions_commands}
3434
CODE
3535
retries 3
@@ -38,10 +38,20 @@
3838
end
3939

4040
action :install_nvlsm_dependencies do
41-
package nvidia_nvlsm_dependencies do
41+
bash "Install nvlsm dependencies" do
42+
user 'root'
43+
code <<-CODE
44+
set -ex
45+
#{nvidia_nvlsm_install_dependencies_commands}
46+
CODE
4247
retries 3
4348
retry_delay 5
4449
end
50+
# package nvidia_nvlsm_dependencies do
51+
# options '--verbose'
52+
# retries 3
53+
# retry_delay 5
54+
# end
4555

4656
# Make sure kernel module for Infiniband is loaded at instance boot time
4757
cookbook_file 'infiniband.conf' do
@@ -67,7 +77,7 @@
6777
user 'root'
6878
cwd node['cluster']['sources_dir']
6979
code <<-CODE
70-
set -e
80+
set -ex
7181
#{nvidia_nvlsm_install_commands}
7282
CODE
7383
retries 3
@@ -103,6 +113,10 @@ def nvidia_nvlsm_install_preconditions_commands
103113
# OS dependent
104114
end
105115

116+
# Hook for the shell command string that installs the nvlsm dependencies.
# Its result is interpolated into the "Install nvlsm dependencies" bash resource
# of the :install_nvlsm_dependencies action. Same-named methods are defined in
# the debian and rhel partials and supply the real command.
def nvidia_nvlsm_install_dependencies_commands
117+
# OS dependent: intentionally left without a body in the common partial
118+
end
119+
106120
# Hook for the nvlsm dependency package list. It was the argument of the
# `package` resource that this commit comments out in the
# :install_nvlsm_dependencies action.
# NOTE(review): other callers may still exist outside this diff — confirm
# before removing.
def nvidia_nvlsm_dependencies
107121
# OS dependent: intentionally left without a body in the common partial
108122
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_debian.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,7 @@ def nvidia_nvlsm_dependencies
4040
# Debian-family diagnostic commands, returned as one `;`-separated shell string:
# kernel/system info (uname -a), apt-cache policy output, and the list of held
# packages (apt-mark showhold). The common partial interpolates this string
# into a bash resource.
def nvidia_nvlsm_install_preconditions_commands
4141
"uname -a ; apt-cache policy ; apt-mark showhold"
4242
end
43+
44+
# Debian-family shell command that installs the nvlsm dependencies
# (infiniband-diags, ibutils) non-interactively with apt.
# `-o Debug::pkgProblemResolver=1` is passed so that apt prints its
# dependency-resolver debug output, in line with the commit's debugging purpose.
def nvidia_nvlsm_install_dependencies_commands
45+
"apt -o Debug::pkgProblemResolver=1 install -y infiniband-diags ibutils"
46+
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_rhel.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,7 @@ def nvidia_nvlsm_dependencies
3939
# RHEL-family diagnostic commands, returned as one `;`-separated shell string:
# kernel/system info (uname -a), all configured repos, the versionlock list,
# and `yum info` / `yum provides` for kernel-modules-extra-aws,
# infiniband-diags and libibumad. The common partial interpolates this string
# into a bash resource.
def nvidia_nvlsm_install_preconditions_commands
4040
"uname -a ; yum repolist all ; yum versionlock list ; yum info kernel-modules-extra-aws infiniband-diags libibumad ; yum provides kernel-modules-extra-aws infiniband-diags libibumad"
4141
end
42+
43+
# RHEL-family shell command that installs the nvlsm dependencies
# (infiniband-diags, libibumad) with yum; `-yv` combines assume-yes with
# verbose output.
def nvidia_nvlsm_install_dependencies_commands
44+
"yum install -yv infiniband-diags libibumad"
45+
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/nvidia_nvlsm_spec.rb

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,9 +144,9 @@ def self.install(chef_run)
144144

145145
cached(:nvlsm_installation_commands) do
146146
if %(redhat rocky amazon).include?(platform)
147-
" set -e\n yum install -y #{nvlsm_package_full_name} && yum versionlock nvlsm\n"
147+
" set -ex\n yum install -y #{nvlsm_package_full_name} && yum versionlock nvlsm\n"
148148
else
149-
" set -e\n dpkg -i #{nvlsm_package_full_name} && apt-mark hold nvlsm\n"
149+
" set -ex\n dpkg -i #{nvlsm_package_full_name} && apt-mark hold nvlsm\n"
150150
end
151151
end
152152
cached(:nvlsm_dependencies) do
@@ -156,11 +156,24 @@ def self.install(chef_run)
156156
%(infiniband-diags ibutils)
157157
end
158158
end
159+
# Expected `code` of the "Install nvlsm dependencies" bash resource for the
# platform under test: a yum command on redhat/rocky/amazon, an apt command
# otherwise. `nvlsm_dependencies` is interpolated as the package list.
# NOTE(review): `%(redhat rocky amazon)` is a String literal, not an Array, so
# `include?` is a substring test. It matches the exact platform names but would
# also match any fragment of that string; `%w(...)` would make it a real
# membership check.
cached(:nvlsm_dependencies_installation_commands) do
160+
if %(redhat rocky amazon).include?(platform)
161+
" set -ex\n yum install -yv #{nvlsm_dependencies}\n"
162+
else
163+
" set -ex\n apt -o Debug::pkgProblemResolver=1 install -y #{nvlsm_dependencies}\n"
164+
end
165+
end
159166

160167
it 'installs dependencies of nvlsm' do
161-
is_expected.to install_package(nvlsm_dependencies).with(
168+
# is_expected.to install_package(nvlsm_dependencies).with(
169+
# retries: 3,
170+
# retry_delay: 5
171+
# )
172+
is_expected.to run_bash("Install nvlsm dependencies").with(
173+
user: 'root',
162174
retries: 3,
163-
retry_delay: 5
175+
retry_delay: 5,
176+
code: nvlsm_dependencies_installation_commands
164177
)
165178
end
166179

0 commit comments

Comments
 (0)