Skip to content

Commit ad4d280

Browse files
committed
[DONOTMERGE][B200] Add debugging commands.
1 parent e3908eb commit ad4d280

File tree

4 files changed

+35
-4
lines changed

4 files changed

+35
-4
lines changed

cookbooks/aws-parallelcluster-platform/recipes/install.rb

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,36 @@
1414
include_recipe "aws-parallelcluster-platform::sudo_install"
1515
include_recipe "aws-parallelcluster-platform::users"
1616
include_recipe "aws-parallelcluster-platform::disable_services"
17+
# Debug-only probe: log the running kernel and how yum resolves the NVLSM
# dependencies before the package repositories are configured.
bash "Check kernel before repo setup" do
  code <<-CODE
    set -x
    uname -a
    yum info infiniband-diags libibumad
    yum provides infiniband-diags libibumad
    # --assumeno answers "no" to the transaction prompt, so yum exits non-zero
    # (and yum may be missing entirely on non-RPM platforms). This block is
    # purely informational, so never let it fail the converge.
    yum install --assumeno infiniband-diags libibumad || true
  CODE
end
1726
package_repos 'setup the repositories'
27+
# Debug-only probe: log the running kernel and how yum resolves the NVLSM
# dependencies right after the package repositories have been configured.
bash "Check kernel after repo setup" do
  code <<-CODE
    set -x
    uname -a
    yum info infiniband-diags libibumad
    yum provides infiniband-diags libibumad
    # --assumeno answers "no" to the transaction prompt, so yum exits non-zero
    # (and yum may be missing entirely on non-RPM platforms). This block is
    # purely informational, so never let it fail the converge.
    yum install --assumeno infiniband-diags libibumad || true
  CODE
end
1836
include_recipe "aws-parallelcluster-platform::directories"
1937
install_packages 'Install OS and extra packages'
38+
# Debug-only probe: log the running kernel and how yum resolves the NVLSM
# dependencies after the OS and extra packages have been installed.
bash "Check kernel after extra packages" do
  code <<-CODE
    set -x
    uname -a
    yum info infiniband-diags libibumad
    yum provides infiniband-diags libibumad
    # --assumeno answers "no" to the transaction prompt, so yum exits non-zero
    # (and yum may be missing entirely on non-RPM platforms). This block is
    # purely informational, so never let it fail the converge.
    yum install --assumeno infiniband-diags libibumad || true
  CODE
end
2047
include_recipe "aws-parallelcluster-platform::cookbook_virtualenv"
2148
include_recipe "aws-parallelcluster-platform::awscli"
2249
unless alinux2023_on_docker? # Running this recipe on Alinux 2023 docker generates false failure.

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/nvidia_nvlsm_alinux2023.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,7 @@
2222
# Builds the platform identifier: the literal prefix "amzn" followed by the
# integer part of the node's 'platform_version' attribute.
def platform
  major_version = node['platform_version'].to_i
  "amzn#{major_version}"
end
25+
26+
# Packages that must be installed before NVLSM.
#
# Returns an Array of package names. Note that `%(...)` would build a single
# String ("infiniband-diags libibumad"); `%w(...)` is required to get one
# element per package.
def nvidia_nvlsm_dependencies
  %w(infiniband-diags libibumad)
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_debian.rb

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ def nvidia_nvlsm_dependencies
3434
# because otherwise the kernel pinning would block the installation. Once we figure out how to make this installation
3535
# work with kernel pinning, we will move the installation here.
3636
# %(linux-modules-extra-aws infiniband-diags ibutils)
37+
%(infiniband-diags ibutils)
3738
[]
3839
end
3940

4041
def nvidia_nvlsm_install_preconditions_commands
41-
"apt-cache policy ; apt-mark showhold"
42+
"uname -a ; apt-cache policy ; apt-mark showhold"
4243
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_rhel.rb

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,9 @@ def nvidia_nvlsm_dependencies
3333
# The installation has been temporarily moved to the ParallelCluster component, before cookbook execution,
3434
# because otherwise the kernel pinning would block the installation. Once we figure out how to make this installation
3535
# work with kernel pinning, we will move the installation here.
36-
# %(kernel-modules-extra-aws infiniband-diags libibumad)
37-
[]
36+
%(infiniband-diags libibumad)
3837
end
3938

4039
def nvidia_nvlsm_install_preconditions_commands
41-
"yum repolist all ; yum versionlock list"
40+
"uname -a ; yum repolist all ; yum versionlock list ; yum info kernel-modules-extra-aws infiniband-diags libibumad ; yum provides kernel-modules-extra-aws infiniband-diags libibumad"
4241
end

0 commit comments

Comments
 (0)