File tree: 3 files changed (+61 −39 lines changed)
cookbooks/aws-parallelcluster-platform
resources/nvidia_dcgm/partial
3 files changed (+61 −39 lines changed)
Original file line number | Diff line number | Diff line change
1717# NVidia
1818default [ 'cluster' ] [ 'nvidia' ] [ 'enabled' ] = 'no'
1919default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '570.172.08'
20- default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '4-cuda12-4 .2.3-2.'
20+ default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '4.2.3-2.'
2121if platform? ( 'amazon' ) && node [ 'platform_version' ] == "2"
2222 default [ 'cluster' ] [ 'nvidia' ] [ 'driver_version' ] = '550.127.08'
2323 default [ 'cluster' ] [ 'nvidia' ] [ 'dcgm_version' ] = '3.3.6-1-'
Original file line number Diff line number Diff line change 1313# See the License for the specific language governing permissions and limitations under the License.
1414
1515action :install_package do
16- remote_file "#{ node [ 'cluster' ] [ 'sources_dir' ] } /#{ dcgm_package } -#{ package_version } .deb" do
17- source "#{ dcgm_url } "
18- mode '0644'
19- retries 3
20- retry_delay 5
21- action :create_if_missing
16+ if package_version . start_with? ( "3." )
17+ packages_urls_list = [ dcgm_package ]
18+ else
19+ packages_urls_list = [ dcgm4_core_package , dcgm4_package ]
2220 end
21+ for package in packages_urls_list
22+ remote_file "#{ node [ 'cluster' ] [ 'sources_dir' ] } /#{ package } -#{ package_version } .deb" do
23+ source "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/#{ platform } /#{ package } _#{ package_version } _#{ arch_suffix } .deb"
24+ mode '0644'
25+ retries 3
26+ retry_delay 5
27+ action :create_if_missing
28+ end
2329
24- bash "Install #{ dcgm_package } " do
25- user 'root'
26- cwd node [ 'cluster' ] [ 'sources_dir' ]
27- code <<-DCGM_INSTALL
28- set -e
29- dpkg -i #{ dcgm_package } -#{ package_version } .deb
30- DCGM_INSTALL
31- retries 3
32- retry_delay 5
30+ bash "Install #{ package } " do
31+ user 'root'
32+ cwd node [ 'cluster' ] [ 'sources_dir' ]
33+ code <<-DCGM_INSTALL
34+ set -e
35+ dpkg -i #{ package } -#{ package_version } .deb
36+ DCGM_INSTALL
37+ retries 3
38+ retry_delay 5
39+ end
3340 end
3441end
3542
36- def dcgm_url
37- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/#{ platform } /#{ dcgm_package } _#{ package_version } _#{ arch_suffix } .deb"
38- end
39-
4043def dcgm_package
4144 'datacenter-gpu-manager'
4245end
4346
47+ def dcgm4_package
48+ "#{ dcgm_package } -4-cuda12"
49+ end
50+
51+ def dcgm4_core_package
52+ "#{ dcgm_package } -4-core"
53+ end
54+
4455def arch_suffix
4556 arm_instance? ? 'arm64' : 'amd64'
4657end
Original file line number Diff line number Diff line change 1313# See the License for the specific language governing permissions and limitations under the License.
1414
1515action :install_package do
16- remote_file "#{ node [ 'cluster' ] [ 'sources_dir' ] } /#{ dcgm_package } -#{ package_version } .rpm" do
17- source "#{ dcgm_url } "
18- mode '0644'
19- retries 3
20- retry_delay 5
21- action :create_if_missing
16+ if package_version . start_with? ( "3." )
17+ packages_urls_list = [ dcgm_package ]
18+ else
19+ packages_urls_list = [ dcgm4_core_package , dcgm4_package ]
2220 end
21+ for package in packages_urls_list
22+ remote_file "#{ node [ 'cluster' ] [ 'sources_dir' ] } /#{ package } -#{ package_version } .rpm" do
23+ source "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/#{ platform } /#{ package } -#{ package_version } #{ arch_suffix } .rpm"
24+ mode '0644'
25+ retries 3
26+ retry_delay 5
27+ action :create_if_missing
28+ end
2329
24- bash "Install #{ dcgm_package } " do
25- user 'root'
26- cwd node [ 'cluster' ] [ 'sources_dir' ]
27- code <<-DCGM_INSTALL
28- set -e
29- yum install -y #{ dcgm_package } -#{ package_version } .rpm
30- DCGM_INSTALL
31- retries 3
32- retry_delay 5
30+ bash "Install #{ package } " do
31+ user 'root'
32+ cwd node [ 'cluster' ] [ 'sources_dir' ]
33+ code <<-DCGM_INSTALL
34+ set -e
35+ yum install -y #{ package } -#{ package_version } .rpm
36+ DCGM_INSTALL
37+ retries 3
38+ retry_delay 5
39+ end
3340 end
3441end
3542
36- def dcgm_url
37- "#{ node [ 'cluster' ] [ 'artifacts_s3_url' ] } /dependencies/nvidia_dcgm/#{ platform } /#{ dcgm_package } -#{ package_version } #{ arch_suffix } .rpm"
38- end
39-
4043def dcgm_package
4144 'datacenter-gpu-manager'
4245end
4346
47+ def dcgm4_package
48+ "#{ dcgm_package } -4-cuda12"
49+ end
50+
51+ def dcgm4_core_package
52+ "#{ dcgm_package } -4-core"
53+ end
54+
4455def arch_suffix
4556 arm_instance? ? 'aarch64' : 'x86_64'
4657end
You can’t perform that action at this time.
0 commit comments