Skip to content

Commit 158cb2d

Browse files
committed
[SQUASH][B200] Fix support B200.
1 parent 769a853 commit 158cb2d

File tree

3 files changed

+199
-6
lines changed

3 files changed

+199
-6
lines changed

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_common.rb

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,10 @@
1818
action :install do
1919
return unless nvlsm_installation_enabled?
2020

21+
# TODO: Remove precondition check
2122
action_install_precondition
2223
action_install_nvlsm_dependencies
2324
action_install_nvlsm
24-
25-
# Save nvlsm version in Node Attributes for InSpec Tests
26-
# node.default['cluster']['nvidia']['nvlsm']['version'] = nvidia_nvlsm_full_version
27-
# node.default['cluster']['nvidia']['nvlsm']['package'] = nvidia_nvlsm_package
28-
# node_attributes 'dump node attributes'
2925
end
3026

3127
action :install_precondition do

cookbooks/aws-parallelcluster-platform/resources/nvidia_nvlsm/partial/_nvidia_nvlsm_debian.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ def nvidia_nvlsm_dependencies
3535
# work with kernel pinning, we will move the installation here.
3636
# %(linux-modules-extra-aws infiniband-diags ibutils)
3737
%(infiniband-diags ibutils)
38-
[]
3938
end
4039

4140
def nvidia_nvlsm_install_preconditions_commands
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
require 'spec_helper'
2+
3+
# Placeholder artifacts endpoint assigned to node['cluster']['artifacts_s3_url'] in the install specs.
cluster_artifacts_s3_url = 'https://aws_region-aws-parallelcluster.s3.AWS_REGION.AWS_DOMAIN'
# Placeholder directory assigned to node['cluster']['sources_dir'] in the install specs.
source_dir = 'SOURCE_DIR'
# Architecture suffix used in the expected .rpm file name, keyed by kernel machine name.
arch_suffix_rhel = { 'x86_64' => 'x86_64', 'aarch64' => 'aarch64' }.freeze
# Architecture suffix used in the expected .deb file name, keyed by kernel machine name.
arch_suffix_debian = { 'x86_64' => 'amd64', 'aarch64' => 'arm64' }.freeze
13+
14+
# Spec helper that converges a runner with a single nvidia_nvlsm resource.
class ConvergeNvidiaNvlsm
  class << self
    # Converges +chef_run+ with an `nvidia_nvlsm 'install'` resource whose action is :install,
    # declared through converge_dsl for the 'aws-parallelcluster-platform' cookbook.
    #
    # @param chef_run [#converge_dsl] runner to converge
    # @return whatever +chef_run.converge_dsl+ returns
    def install(chef_run)
      chef_run.converge_dsl('aws-parallelcluster-platform') do
        nvidia_nvlsm('install') { action :install }
      end
    end
  end
end
23+
24+
# Specs for the `nvlsm_installation_enabled?` predicate of the nvidia_nvlsm resource.
# Every case below expects the predicate to return false.
describe 'nvidia_nvlsm:nvlsm_installation_enabled?' do
  for_all_oses do |platform, version|
    context "on #{platform}#{version}" do
      cached(:chef_run) do
        runner(platform: platform, version: version, step_into: ['nvidia_nvlsm'])
      end
      # The resource instance under test, looked up after converging the runner.
      cached(:resource) do
        ConvergeNvidiaNvlsm.install(chef_run)
        chef_run.find_resource('nvidia_nvlsm', 'install')
      end

      if "#{platform}#{version}" == 'amazon2'
        it 'on Amazon Linux 2, nvlsm installation is disabled' do
          expect(resource.nvlsm_installation_enabled?).to eq(false)
        end
      else
        context "when nvidia is not enabled" do
          before do
            allow_any_instance_of(Object).to receive(:nvidia_enabled?).and_return(false)
          end

          it 'nvlsm installation is disabled' do
            expect(resource.nvlsm_installation_enabled?).to eq(false)
          end
        end

        context "when nvlsm is already installed" do
          before do
            # Simulate the nvlsm binary being present on disk.
            allow(File).to receive(:exist?).with('/opt/nvidia/nvlsm/sbin/nvlsm').and_return(true)
          end

          it 'nvlsm installation is disabled' do
            expect(resource.nvlsm_installation_enabled?).to eq(false)
          end
        end

        context "when nvlsm installation is requested to be skipped via chef attribute" do
          cached(:chef_run) do
            runner(platform: platform, version: version) do |node|
              node.override['cluster']['nvidia']['nvlsm']['skip'] = true
            end
          end

          it 'nvlsm installation is disabled' do
            expect(resource.nvlsm_installation_enabled?).to eq(false)
          end
        end
      end
    end
  end
end
85+
86+
# Specs for the :install action of the nvidia_nvlsm resource. They run for every OS and,
# when installation is enabled, for both the x86_64 and aarch64 architectures.
describe 'nvidia_nvlsm:install' do
  for_all_oses do |platform, version|
    # %w(...) builds an Array, so include? is an exact platform-name match.
    # (A %(...) literal is a String, on which include? is a substring search.)
    rpm_based = %w(redhat rocky amazon).include?(platform)

    context "on #{platform}#{version}" do
      context 'when nvlsm installation is disabled' do
        cached(:chef_run) do
          stubs_for_resource('nvidia_nvlsm') do |res|
            allow(res).to receive(:nvlsm_installation_enabled?).and_return(false)
          end
          # Named chef_runner so the local does not shadow the `runner` helper method.
          chef_runner = runner(platform: platform, version: version, step_into: ['nvidia_nvlsm'])
          ConvergeNvidiaNvlsm.install(chef_runner)
        end
        cached(:node) { chef_run.node }

        it 'does not install nvlsm' do
          is_expected.not_to run_bash("Install nvlsm")
        end
      end

      %w(x86_64 aarch64).each do |arch|
        context "when nvlsm installation is enabled on #{arch}" do
          cached(:chef_run) do
            stubs_for_resource('nvidia_nvlsm') do |res|
              allow(res).to receive(:nvlsm_installation_enabled?).and_return(true)
            end
            chef_runner = runner(platform: platform, version: version, step_into: ['nvidia_nvlsm']) do |node|
              node.override['cluster']['artifacts_s3_url'] = cluster_artifacts_s3_url
              node.override['cluster']['sources_dir'] = source_dir
              node.automatic['kernel']['machine'] = arch
            end
            ConvergeNvidiaNvlsm.install(chef_runner)
          end
          cached(:node) { chef_run.node }

          cached(:nvlsm_version) { "2025.03.9-1" }
          # Expected package file name: .rpm naming on RPM-based platforms, .deb naming otherwise.
          cached(:nvlsm_package_full_name) do
            if rpm_based
              "nvlsm-#{nvlsm_version}.#{arch_suffix_rhel[arch]}.rpm"
            else
              "nvlsm_#{nvlsm_version}_#{arch_suffix_debian[arch]}.deb"
            end
          end
          cached(:nvlsm_checksum) do
            if rpm_based
              "88d5e52183bb5ee763eb864bbd119b591e7f45af32c52bd7ba0aa8f74fc19057"
            else
              "61f280e469624c43eecb0e08305452887e02f73e4763252a41f728d1843f1cc5"
            end
          end
          # Expected download URL; the OS directory component depends on the platform family.
          cached(:nvlsm_url) do
            os_directory = if platform == 'amazon'
                             "amzn#{version}"
                           elsif %w(redhat rocky).include?(platform)
                             "rhel#{version}"
                           else
                             "#{platform}#{version.delete('.')}"
                           end
            "#{cluster_artifacts_s3_url}/dependencies/nvidia_nvlsm/#{os_directory}/#{nvlsm_package_full_name}"
          end

          cached(:nvlsm_installation_commands) do
            if rpm_based
              " set -e\n yum install -y #{nvlsm_package_full_name} && yum versionlock nvlsm\n"
            else
              " set -e\n dpkg -i #{nvlsm_package_full_name} && apt-mark hold nvlsm\n"
            end
          end
          cached(:nvlsm_dependencies) do
            # NOTE(review): %(...) yields one String, not an Array of package names. It is kept
            # unchanged because it presumably mirrors what the resource's
            # nvidia_nvlsm_dependencies returns; confirm whether %w(...) is intended on both sides.
            if rpm_based
              %(infiniband-diags libibumad)
            else
              %(infiniband-diags ibutils)
            end
          end

          it 'installs dependencies of nvlsm' do
            is_expected.to install_package(nvlsm_dependencies).with(
              retries: 3,
              retry_delay: 5
            )
          end

          it 'configures infiniband kernel module to be loaded at boot time' do
            is_expected.to create_cookbook_file('/etc/modules-load.d/parallelcluster-infiniband.conf')
              .with(source: 'infiniband/infiniband.conf')
              .with(user: 'root')
              .with(group: 'root')
              .with(mode: '0644')
          end

          it 'downloads nvlsm' do
            is_expected.to create_if_missing_remote_file("#{source_dir}/#{nvlsm_package_full_name}").with(
              source: nvlsm_url,
              checksum: nvlsm_checksum,
              mode: '0644',
              retries: 3,
              retry_delay: 5
            )
          end

          it 'installs nvlsm' do
            is_expected.to run_bash("Install nvlsm").with(
              user: 'root',
              cwd: source_dir,
              retries: 3,
              retry_delay: 5,
              code: nvlsm_installation_commands
            )
          end
        end
      end
    end
  end
end

0 commit comments

Comments
 (0)