Skip to content

Commit 236b641

Browse files
DCGM upgrade
Signed-off-by: Hanwen <[email protected]>
1 parent faad16d commit 236b641

File tree

3 files changed

+61
-39
lines changed

3 files changed

+61
-39
lines changed

cookbooks/aws-parallelcluster-platform/attributes/platform.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
# NVidia
1818
default['cluster']['nvidia']['enabled'] = 'no'
1919
default['cluster']['nvidia']['driver_version'] = '570.172.08'
20-
default['cluster']['nvidia']['dcgm_version'] = '4-cuda12-4.2.3-2.'
20+
default['cluster']['nvidia']['dcgm_version'] = '4.2.3-2.'
2121
if platform?('amazon') && node['platform_version'] == "2"
2222
default['cluster']['nvidia']['driver_version'] = '550.127.08'
2323
default['cluster']['nvidia']['dcgm_version'] = '3.3.6-1-'

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_debian.rb

Lines changed: 30 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -13,34 +13,45 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.deb" do
17-
source "#{dcgm_url}"
18-
mode '0644'
19-
retries 3
20-
retry_delay 5
21-
action :create_if_missing
16+
if package_version.start_with?("3.")
17+
packages_urls_list = [dcgm_package]
18+
else
19+
packages_urls_list = [dcgm4_core_package, dcgm4_package]
2220
end
21+
for package in packages_urls_list
22+
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.deb" do
23+
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}_#{package_version}_#{arch_suffix}.deb"
24+
mode '0644'
25+
retries 3
26+
retry_delay 5
27+
action :create_if_missing
28+
end
2329

24-
bash "Install #{dcgm_package}" do
25-
user 'root'
26-
cwd node['cluster']['sources_dir']
27-
code <<-DCGM_INSTALL
28-
set -e
29-
dpkg -i #{dcgm_package}-#{package_version}.deb
30-
DCGM_INSTALL
31-
retries 3
32-
retry_delay 5
30+
bash "Install #{package}" do
31+
user 'root'
32+
cwd node['cluster']['sources_dir']
33+
code <<-DCGM_INSTALL
34+
set -e
35+
dpkg -i #{package}-#{package_version}.deb
36+
DCGM_INSTALL
37+
retries 3
38+
retry_delay 5
39+
end
3340
end
3441
end
3542

36-
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}_#{package_version}_#{arch_suffix}.deb"
38-
end
39-
4043
def dcgm_package
4144
'datacenter-gpu-manager'
4245
end
4346

47+
def dcgm4_package
48+
"#{dcgm_package}-4-cuda12"
49+
end
50+
51+
def dcgm4_core_package
52+
"#{dcgm_package}-4-core"
53+
end
54+
4455
def arch_suffix
4556
arm_instance? ? 'arm64' : 'amd64'
4657
end

cookbooks/aws-parallelcluster-platform/resources/nvidia_dcgm/partial/_nvidia_dcgm_rhel.rb

Lines changed: 30 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -13,34 +13,45 @@
1313
# See the License for the specific language governing permissions and limitations under the License.
1414

1515
action :install_package do
16-
remote_file "#{node['cluster']['sources_dir']}/#{dcgm_package}-#{package_version}.rpm" do
17-
source "#{dcgm_url}"
18-
mode '0644'
19-
retries 3
20-
retry_delay 5
21-
action :create_if_missing
16+
if package_version.start_with?("3.")
17+
packages_urls_list = [dcgm_package]
18+
else
19+
packages_urls_list = [dcgm4_core_package, dcgm4_package]
2220
end
21+
for package in packages_urls_list
22+
remote_file "#{node['cluster']['sources_dir']}/#{package}-#{package_version}.rpm" do
23+
source "#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{package}-#{package_version}-#{arch_suffix}.rpm"
24+
mode '0644'
25+
retries 3
26+
retry_delay 5
27+
action :create_if_missing
28+
end
2329

24-
bash "Install #{dcgm_package}" do
25-
user 'root'
26-
cwd node['cluster']['sources_dir']
27-
code <<-DCGM_INSTALL
28-
set -e
29-
yum install -y #{dcgm_package}-#{package_version}.rpm
30-
DCGM_INSTALL
31-
retries 3
32-
retry_delay 5
30+
bash "Install #{package}" do
31+
user 'root'
32+
cwd node['cluster']['sources_dir']
33+
code <<-DCGM_INSTALL
34+
set -e
35+
yum install -y #{package}-#{package_version}.rpm
36+
DCGM_INSTALL
37+
retries 3
38+
retry_delay 5
39+
end
3340
end
3441
end
3542

36-
def dcgm_url
37-
"#{node['cluster']['artifacts_s3_url']}/dependencies/nvidia_dcgm/#{platform}/#{dcgm_package}-#{package_version}#{arch_suffix}.rpm"
38-
end
39-
4043
def dcgm_package
4144
'datacenter-gpu-manager'
4245
end
4346

47+
def dcgm4_package
48+
"#{dcgm_package}-4-cuda12"
49+
end
50+
51+
def dcgm4_core_package
52+
"#{dcgm_package}-4-core"
53+
end
54+
4455
def arch_suffix
4556
arm_instance? ? 'aarch64' : 'x86_64'
4657
end

0 commit comments

Comments
 (0)