Skip to content

Commit 002a77f

Browse files
committed
[B200] Debugging
1 parent e740d85 commit 002a77f

File tree

4 files changed

+48
-25
lines changed

4 files changed

+48
-25
lines changed

cookbooks/aws-parallelcluster-platform/recipes/install.rb

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -14,33 +14,34 @@
1414
include_recipe "aws-parallelcluster-platform::sudo_install"
1515
include_recipe "aws-parallelcluster-platform::users"
1616
include_recipe "aws-parallelcluster-platform::disable_services"
17-
bash "Check kernel before repo setup" do
18-
code <<-CODE
19-
set -x
20-
uname -a
21-
yum info infiniband-diags libibumad
22-
yum provides infiniband-diags libibumad
23-
CODE
24-
end
17+
# bash "Check kernel before repo setup" do
18+
# code <<-CODE
19+
# set -x
20+
# uname -a
21+
# yum info infiniband-diags libibumad
22+
# yum provides infiniband-diags libibumad
23+
# CODE
24+
# end
2525
package_repos 'setup the repositories'
26-
bash "Check kernel after repo setup" do
27-
code <<-CODE
28-
set -x
29-
uname -a
30-
yum info infiniband-diags libibumad
31-
yum provides infiniband-diags libibumad
32-
CODE
33-
end
26+
# bash "Check kernel after repo setup" do
27+
# code <<-CODE
28+
# set -x
29+
# uname -a
30+
# yum info infiniband-diags libibumad
31+
# yum provides infiniband-diags libibumad
32+
# CODE
33+
# end
3434
include_recipe "aws-parallelcluster-platform::directories"
35+
# nvidia_nvlsm 'Install Nvidia NVLink Subnet Manager'
3536
install_packages 'Install OS and extra packages'
36-
bash "Check kernel after extra packages" do
37-
code <<-CODE
38-
set -x
39-
uname -a
40-
yum info infiniband-diags libibumad
41-
yum provides infiniband-diags libibumad
42-
CODE
43-
end
37+
# bash "Check kernel after extra packages" do
38+
# code <<-CODE
39+
# set -x
40+
# uname -a
41+
# yum info infiniband-diags libibumad
42+
# yum provides infiniband-diags libibumad
43+
# CODE
44+
# end
4445
include_recipe "aws-parallelcluster-platform::cookbook_virtualenv"
4546
include_recipe "aws-parallelcluster-platform::awscli"
4647
unless alinux2023_on_docker? # Running this recipe on Alinux 2023 docker generates false failure.

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_common.rb

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,20 @@
3838
end
3939

4040
action :install_nvlsm_dependencies do
41-
package nvidia_nvlsm_dependencies do
41+
bash "Install nvlsm dependencies" do
42+
user 'root'
43+
code <<-CODE
44+
set -ex
45+
#{nvidia_nvlsm_install_dependencies_commands}
46+
CODE
4247
retries 3
4348
retry_delay 5
4449
end
50+
# package nvidia_nvlsm_dependencies do
51+
# options '--verbose'
52+
# retries 3
53+
# retry_delay 5
54+
# end
4555

4656
# Make sure kernel module for Infiniband is loaded at instance boot time
4757
cookbook_file 'infiniband.conf' do
@@ -103,6 +113,10 @@ def nvidia_nvlsm_install_preconditions_commands
103113
# OS dependent
104114
end
105115

116+
# Placeholder for the shell command(s) that install the NVLSM dependencies.
# The :install_nvlsm_dependencies action interpolates this method's result into
# the code of a root `bash` resource that runs under `set -ex`.
# As written here the body is only a comment, so the method returns nil;
# the debian and rhel partials define a method of the same name that returns a command string.
def nvidia_nvlsm_install_dependencies_commands
117+
# OS dependent
118+
end
119+
106120
# Placeholder for the NVLSM dependency package list; the body is only a comment, so it returns nil here.
# The debian and rhel partials each define a method of the same name.
# NOTE(review): the only visible consumer is the `package nvidia_nvlsm_dependencies` resource,
# which this commit comments out — confirm whether anything else still calls it.
def nvidia_nvlsm_dependencies
107121
# OS dependent
108122
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_debian.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,7 @@ def nvidia_nvlsm_dependencies
4040
# Returns one shell command line, joined with `;`, that prints diagnostic state:
# `uname -a`, `apt-cache policy`, and `apt-mark showhold`.
# NOTE(review): presumably run by the common partial before installation — the caller is not visible here; confirm.
def nvidia_nvlsm_install_preconditions_commands
4141
"uname -a ; apt-cache policy ; apt-mark showhold"
4242
end
43+
44+
# Shell command that installs the NVLSM package dependencies (infiniband-diags, ibutils).
#
# The :install_nvlsm_dependencies action interpolates the returned string into the
# code of a root `bash` resource that runs under `set -ex`.
#
# `apt-get` is used instead of `apt` because apt(8) documents its command line as
# not stable for use in scripts. DEBIAN_FRONTEND=noninteractive stops debconf from
# prompting, since no interactive terminal is expected while the resource runs.
# `-o Debug::pkgProblemResolver=1` keeps the dependency-resolver trace that this
# debugging change was introduced to capture.
#
# @return [String] a single shell command line
def nvidia_nvlsm_install_dependencies_commands
  "DEBIAN_FRONTEND=noninteractive apt-get -o Debug::pkgProblemResolver=1 install -y infiniband-diags ibutils"
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_rhel.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,7 @@ def nvidia_nvlsm_dependencies
3939
# Returns one shell command line, joined with `;`, that prints diagnostic state:
# `uname -a`, `yum repolist all`, `yum versionlock list`, then `yum info` and `yum provides`
# for kernel-modules-extra-aws, infiniband-diags and libibumad.
# NOTE(review): presumably run by the common partial before installation — the caller is not visible here; confirm.
def nvidia_nvlsm_install_preconditions_commands
4040
"uname -a ; yum repolist all ; yum versionlock list ; yum info kernel-modules-extra-aws infiniband-diags libibumad ; yum provides kernel-modules-extra-aws infiniband-diags libibumad"
4141
end
42+
43+
# Returns the shell command that installs infiniband-diags and libibumad with yum
# (`-y` answers yes to prompts, `-v` enables verbose output).
# The :install_nvlsm_dependencies action interpolates this string into the code of
# a root `bash` resource that runs under `set -ex`.
def nvidia_nvlsm_install_dependencies_commands
44+
"yum install -yv infiniband-diags libibumad"
45+
end

0 commit comments

Comments
 (0)