Skip to content

Commit f516bba

Browse files
himani2411 and Himani Anil Deshpande authored
[Fabric] Install NVIDIA Fabric manager for ARM instances (aws#3014)
Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent 5866b82 commit f516bba

File tree

5 files changed

+51
-37
lines changed

5 files changed

+51
-37
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
3131
- Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management.
3232
- Add support for GB200 instance types.
3333
- Install nvidia-imex for all OSs except AL2.
34+
- Install nvidia-fabricmanager for ARM instances for all OSs except AL2.
3435

3536
**BUG FIXES**
3637
- Fix a race condition in CloudWatch Agent startup that could cause nodes bootstrap failures.

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/fabric_manager_amazon2.rb

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,3 +28,7 @@ def fabric_manager_version
2828
def platform
2929
'rhel7'
3030
end
31+
32+
def _fabric_manager_enabled
33+
!arm_instance? && _nvidia_enabled
34+
end

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,8 +40,7 @@
4040
end
4141

4242
def _fabric_manager_enabled
43-
# NVIDIA Fabric Manager not present on ARM
44-
!arm_instance? && _nvidia_enabled
43+
_nvidia_enabled
4544
end
4645

4746
def _nvidia_enabled

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 44 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -118,45 +118,55 @@ def self.configure(chef_run)
118118
end
119119

120120
describe 'fabric_manager:_fabric_manager_enabled' do
121-
context 'when on arm' do
122-
cached(:chef_run) do
123-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
124-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
125-
end
126-
cached(:resource) do
127-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
128-
chef_run.find_resource('fabric_manager', 'setup')
129-
end
130-
it "is not enabled" do
131-
expect(resource._fabric_manager_enabled).to eq(false)
132-
end
133-
end
121+
for_all_oses do |platform, version|
122+
context "on #{platform}#{version}" do
123+
context 'when on arm' do
124+
cached(:chef_run) do
125+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(true)
126+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'], platform: platform, version: version)
127+
end
128+
cached(:resource) do
129+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
130+
chef_run.find_resource('fabric_manager', 'setup')
131+
end
132+
if platform == 'amazon' && version == '2'
133+
it "is not enabled" do
134+
expect(resource._fabric_manager_enabled).to eq(false)
135+
end
136+
else
137+
it "is enabled" do
138+
expect(resource._fabric_manager_enabled).to eq(true)
139+
end
140+
end
141+
end
134142

135-
context 'when not on arm' do
136-
cached(:chef_run) do
137-
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
138-
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
139-
end
143+
context 'when not on arm' do
144+
cached(:chef_run) do
145+
allow_any_instance_of(Object).to receive(:arm_instance?).and_return(false)
146+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
147+
end
140148

141-
context 'when nvidia enabled' do
142-
cached(:resource) do
143-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
144-
chef_run.find_resource('fabric_manager', 'setup')
145-
end
149+
context 'when nvidia enabled' do
150+
cached(:resource) do
151+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: true)
152+
chef_run.find_resource('fabric_manager', 'setup')
153+
end
146154

147-
it "is enabled" do
148-
expect(resource._fabric_manager_enabled).to eq(true)
149-
end
150-
end
155+
it "is enabled" do
156+
expect(resource._fabric_manager_enabled).to eq(true)
157+
end
158+
end
151159

152-
context 'when nvidia not enabled' do
153-
cached(:resource) do
154-
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
155-
chef_run.find_resource('fabric_manager', 'setup')
156-
end
160+
context 'when nvidia not enabled' do
161+
cached(:resource) do
162+
ConvergeFabricManager.setup(chef_run, nvidia_enabled: false)
163+
chef_run.find_resource('fabric_manager', 'setup')
164+
end
157165

158-
it "is not enabled" do
159-
expect(resource._fabric_manager_enabled).to eq(false)
166+
it "is not enabled" do
167+
expect(resource._fabric_manager_enabled).to eq(false)
168+
end
169+
end
160170
end
161171
end
162172
end

cookbooks/aws-parallelcluster-platform/test/controls/nvidia_fabric_manager_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
# See the License for the specific language governing permissions and limitations under the License.
1111

1212
control 'tag:install_expected_versions_of_nvidia_fabric_manager_installed' do
13-
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) }
13+
only_if { !os_properties.arm? && ['yes', true, 'true'].include?(node['cluster']['nvidia']['enabled']) && !os_properties.alinux2? }
1414

1515
describe package(node['cluster']['nvidia']['fabricmanager']['package']) do
1616
it { should be_installed }

0 commit comments

Comments
 (0)