Skip to content

Commit 2b45939

Browse files
author
Himani Anil Deshpande
committed
[Fabric] Install NVIDIA Fabric manager for ARM instances
1 parent 20f863d commit 2b45939

File tree

5 files changed

+58
-37
lines changed

5 files changed

+58
-37
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3030
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
3131
- Add support for GB200 instance types.
3232
- Install nvidia-imex for all OSs except AL2.
33+
- Install nvidia-fabricmanager for ARM instances for all OSs except AL2.
3334

3435
**BUG FIXES**
3536
- Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures.

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,7 @@ def fabric_manager_version
2828
def platform
2929
'rhel7'
3030
end
31+
32+
# Amazon Linux 2 override: Fabric Manager is enabled only when the instance is
# not ARM and NVIDIA is enabled (ARM is excluded on this OS only).
def _fabric_manager_enabled
33+
!arm_instance? && _nvidia_enabled
34+
end

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,7 @@
4040
end
4141

4242
def _fabric_manager_enabled
43-
# NVIDIA Fabric Manager not present on ARM
44-
!arm_instance? && _nvidia_enabled
43+
_nvidia_enabled
4544
end
4645

4746
def _nvidia_enabled

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 51 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -118,45 +118,62 @@ def self.configure(chef_run)
118118
end
119119

120120
describe 'fabric_manager:_fabric_manager_enabled' do
121-
context 'when on arm' do
122-
cached(:chef_run) do
123-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
124-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
125-
end
126-
cached(:resource) do
127-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
128-
chef_run.find_resource('fabric_manager', 'setup')
129-
end
130-
it "is not enabled" do
131-
expect(resource._fabric_manager_enabled).to eq(false)
132-
end
133-
end
121+
for_all_oses do |platform, version|
122+
context "on #{platform}#{version}" do
123+
context 'when on arm' do
124+
# cached(:chef_run) do
125+
# runner(platform: platform, version: version, step_into: ['fabric_manager'])
126+
# end
127+
# cached(:resource) do
128+
# ConvergeFabricManager.setup(chef_run)
129+
# chef_run.find_resource('fabric_manager', 'setup')
130+
# end
131+
cached(:chef_run) do
132+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
133+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version)
134+
end
135+
cached(:resource) do
136+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
137+
chef_run.find_resource('fabric_manager', 'setup')
138+
end
139+
if platform == 'amazon' && version == '2'
140+
it "is not enabled" do
141+
expect(resource._fabric_manager_enabled).to eq(false)
142+
end
143+
else
144+
it "is enabled" do
145+
expect(resource._fabric_manager_enabled).to eq(true)
146+
end
147+
end
148+
end
134149

135-
context 'when not on arm' do
136-
cached(:chef_run) do
137-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
138-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
139-
end
150+
context 'when not on arm' do
151+
cached(:chef_run) do
152+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
153+
# Pin the runner to the OS under test; without platform/version every
# for_all_oses iteration would converge against the default platform and
# never exercise the per-OS fabric_manager resource.
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version)
154+
end
140155

141-
context 'when nvidia enabled' do
142-
cached(:resource) do
143-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
144-
chef_run.find_resource('fabric_manager', 'setup')
145-
end
156+
context 'when nvidia enabled' do
157+
cached(:resource) do
158+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
159+
chef_run.find_resource('fabric_manager', 'setup')
160+
end
146161

147-
it "is enabled" do
148-
expect(resource._fabric_manager_enabled).to eq(true)
149-
end
150-
end
162+
it "is enabled" do
163+
expect(resource._fabric_manager_enabled).to eq(true)
164+
end
165+
end
151166

152-
context 'when nvidia not enabled' do
153-
cached(:resource) do
154-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
155-
chef_run.find_resource('fabric_manager', 'setup')
156-
end
167+
context 'when nvidia not enabled' do
168+
cached(:resource) do
169+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
170+
chef_run.find_resource('fabric_manager', 'setup')
171+
end
157172

158-
it "is not enabled" do
159-
expect(resource._fabric_manager_enabled).to eq(false)
173+
it "is not enabled" do
174+
expect(resource._fabric_manager_enabled).to eq(false)
175+
end
176+
end
160177
end
161178
end
162179
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# See the License for the specific language governing permissions and limitations under the License.
1111

1212
control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do
13-
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }
13+
# Run whenever NVIDIA is enabled, except on ARM Amazon Linux 2: that is the
# only combination for which the fabric_manager resource disables installation.
only_if { ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !(os_properties.arm? && os_properties.alinux2?) }
1414

1515
describe package(node['cluster']['nvidia']['fabricmanager']['package']) do
1616
it { should be_installed }

0 commit comments

Comments
 (0)