Skip to content

Commit 0ba8c68

Browse files
rsareddy0329Roja Reddy Sareddy
andauthored
release_v2: Add secondary Helm chart and installation script for HyperPod Restricted Instance Group (RIG) use (#125)
Co-authored-by: Roja Reddy Sareddy <[email protected]>
1 parent d52279d commit 0ba8c68

File tree

5 files changed

+894
-0
lines changed

5 files changed

+894
-0
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Patterns to ignore when building packages.
2+
# This supports shell glob matching, relative path matching, and
3+
# negation (prefixed with !). Only one pattern per line.
4+
.DS_Store
5+
# Common VCS dirs
6+
.git/
7+
.gitignore
8+
.bzr/
9+
.bzrignore
10+
.hg/
11+
.hgignore
12+
.svn/
13+
# Common backup files
14+
*.swp
15+
*.bak
16+
*.tmp
17+
*.orig
18+
*~
19+
# Various IDEs
20+
.project
21+
.idea/
22+
*.tmproj
23+
.vscode/
24+
# HyperPod
25+
*.nonrig.yaml
26+
*.nonrig.yml
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
apiVersion: v2
2+
name: hyperpod-helm-chart-for-rig
3+
description: A Helm chart for Kubernetes
4+
5+
# A chart can be either an 'application' or a 'library' chart.
6+
#
7+
# Application charts are a collection of templates that can be packaged into versioned archives
8+
# to be deployed.
9+
#
10+
# Library charts provide useful utilities or functions for the chart developer. They're included as
11+
# a dependency of application charts to inject those utilities and functions into the rendering
12+
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
13+
type: application
14+
15+
# This is the chart version. This version number should be incremented each time you make changes
16+
# to the chart and its templates, including the app version.
17+
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18+
version: 0.1.0
19+
20+
# This is the version number of the application being deployed. This version number should be
21+
# incremented each time you make changes to the application. Versions are not expected to
22+
# follow Semantic Versioning. They should reflect the version the application is using.
23+
# It is recommended to use it with quotes.
24+
appVersion: "1.16.0"
25+
26+
dependencies:
27+
- name: nvidia-device-plugin
28+
version: "0.16.1"
29+
repository: https://nvidia.github.io/k8s-device-plugin
30+
condition: nvidia-device-plugin.devicePlugin.enabled
31+
- name: coredns
32+
version: "0.1.0"
33+
repository: "file://charts/coredns"
Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
# OVERRIDE values for HyperPodHelmChart for RIG.
2+
# This is a YAML-formatted file.
3+
# Declare variables to be passed into your templates.
4+
5+
# value_safe_name of dependencies - see install_rig_dependencies.sh
6+
#
7+
# Note:
8+
#
9+
# The format is add-on: {keys: ...}.
10+
# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency.
11+
# In other words, what is exposed to each dependency template is
12+
# {{ .Values.tolerations }}, NOT {{ .Values.add-on.tolerations }}
13+
14+
15+
# This is a special case. We will not use values.yaml to override the values for this add-on
16+
# It will still be rendered by Helm using special logic.
17+
# See install_rig_dependencies.sh
18+
#aws-cni:
19+
20+
21+
coredns:
22+
tolerations:
23+
- effect: NoSchedule
24+
key: node-role.kubernetes.io/control-plane
25+
- key: CriticalAddonsOnly
26+
operator: Exists
27+
- effect: NoSchedule
28+
operator: Equal
29+
key: "sagemaker.amazonaws.com/RestrictedNode"
30+
value: "Worker"
31+
nodeSelector:
32+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"
33+
34+
mpi-operator:
35+
tolerations:
36+
- key: "sagemaker.amazonaws.com/node-health-status"
37+
operator: "Equal"
38+
value: "Unschedulable"
39+
effect: "NoSchedule"
40+
- effect: NoSchedule
41+
operator: Equal
42+
key: "sagemaker.amazonaws.com/RestrictedNode"
43+
value: "Worker"
44+
nodeSelector:
45+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"
46+
47+
neuron-device-plugin:
48+
tolerations:
49+
- key: CriticalAddonsOnly
50+
operator: Exists
51+
- key: "aws.amazon.com/neuron"
52+
operator: Exists
53+
effect: NoSchedule
54+
- key: "sagemaker.amazonaws.com/node-health-status"
55+
operator: Equal
56+
value: Unschedulable
57+
effect: NoSchedule
58+
- effect: NoSchedule
59+
operator: Equal
60+
key: "sagemaker.amazonaws.com/RestrictedNode"
61+
value: "Worker"
62+
nodeSelector:
63+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"
64+
65+
training-operators:
66+
tolerations:
67+
- effect: NoSchedule
68+
operator: Equal
69+
key: "sagemaker.amazonaws.com/RestrictedNode"
70+
value: "Worker"
71+
nodeSelector:
72+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"
73+
74+
# Overrides for externally-maintained Helm charts
75+
aws-efa-k8s-device-plugin:
76+
devicePlugin:
77+
enabled: true
78+
supportedInstanceLabels:
79+
values:
80+
- ml.c5n.9xlarge
81+
- ml.c5n.18xlarge
82+
- ml.g5.8xlarge
83+
- ml.g5.12xlarge
84+
- ml.g5.16xlarge
85+
- ml.g5.24xlarge
86+
- ml.g5.48xlarge
87+
- ml.g6.8xlarge
88+
- ml.g6.12xlarge
89+
- ml.g6.16xlarge
90+
- ml.g6.24xlarge
91+
- ml.g6.48xlarge
92+
- ml.g6e.8xlarge
93+
- ml.g6e.12xlarge
94+
- ml.g6e.16xlarge
95+
- ml.g6e.24xlarge
96+
- ml.g6e.48xlarge
97+
- ml.gr6.8xlarge
98+
- ml.i3en.large
99+
- ml.i3en.xlarge
100+
- ml.i3en.2xlarge
101+
- ml.i3en.3xlarge
102+
- ml.i3en.6xlarge
103+
- ml.i3en.12xlarge
104+
- ml.i3en.24xlarge
105+
- ml.m7i.large
106+
- ml.m7i.xlarge
107+
- ml.m7i.2xlarge
108+
- ml.m7i.4xlarge
109+
- ml.m7i.8xlarge
110+
- ml.m7i.12xlarge
111+
- ml.m7i.16xlarge
112+
- ml.m7i.24xlarge
113+
- ml.m7i.48xlarge
114+
- ml.p4d.24xlarge
115+
- ml.p4de.24xlarge
116+
- ml.p5.48xlarge
117+
- ml.p5e.48xlarge
118+
- ml.p5en.48xlarge
119+
- ml.r7i.large
120+
- ml.r7i.xlarge
121+
- ml.r7i.2xlarge
122+
- ml.r7i.4xlarge
123+
- ml.r7i.8xlarge
124+
- ml.r7i.12xlarge
125+
- ml.r7i.16xlarge
126+
- ml.r7i.24xlarge
127+
- ml.r7i.48xlarge
128+
- ml.trn1.32xlarge
129+
- ml.trn1n.32xlarge
130+
- ml.trn2.48xlarge
131+
tolerations:
132+
- key: CriticalAddonsOnly
133+
operator: Exists
134+
- effect: NoSchedule
135+
key: aws.amazon.com/efa
136+
operator: Exists
137+
- key: sagemaker.amazonaws.com/node-health-status
138+
operator: "Equal"
139+
value: "Unschedulable"
140+
effect: "NoSchedule"
141+
- effect: NoSchedule
142+
operator: Equal
143+
key: "sagemaker.amazonaws.com/RestrictedNode"
144+
value: "Worker"
145+
nodeSelector:
146+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"
147+
148+
nvidia-device-plugin:
149+
devicePlugin:
150+
enabled: true
151+
allowDefaultNamespace: true
152+
namespaceOverride: "kube-system"
153+
affinity:
154+
nodeAffinity:
155+
requiredDuringSchedulingIgnoredDuringExecution:
156+
nodeSelectorTerms:
157+
- matchExpressions:
158+
- key: node.kubernetes.io/instance-type
159+
operator: In
160+
values:
161+
- ml.g5.xlarge
162+
- ml.g5.2xlarge
163+
- ml.g5.4xlarge
164+
- ml.g5.8xlarge
165+
- ml.g5.12xlarge
166+
- ml.g5.16xlarge
167+
- ml.g5.24xlarge
168+
- ml.g5.48xlarge
169+
- ml.g6.xlarge
170+
- ml.g6.2xlarge
171+
- ml.g6.4xlarge
172+
- ml.g6.8xlarge
173+
- ml.g6.16xlarge
174+
- ml.g6.12xlarge
175+
- ml.g6.24xlarge
176+
- ml.g6.48xlarge
177+
- ml.g6e.xlarge
178+
- ml.g6e.2xlarge
179+
- ml.g6e.4xlarge
180+
- ml.g6e.8xlarge
181+
- ml.g6e.12xlarge
182+
- ml.g6e.16xlarge
183+
- ml.g6e.24xlarge
184+
- ml.g6e.48xlarge
185+
- ml.gr6.4xlarge
186+
- ml.gr6.8xlarge
187+
- ml.p4d.24xlarge
188+
- ml.p4de.24xlarge
189+
- ml.p5.48xlarge
190+
- ml.p5e.48xlarge
191+
- ml.p5en.48xlarge
192+
tolerations:
193+
- key: nvidia.com/gpu
194+
operator: Exists
195+
effect: NoSchedule
196+
- key: sagemaker.amazonaws.com/node-health-status
197+
operator: Equal
198+
value: Unschedulable
199+
effect: NoSchedule
200+
- effect: NoSchedule
201+
operator: Equal
202+
key: "sagemaker.amazonaws.com/RestrictedNode"
203+
value: "Worker"
204+
nodeSelector:
205+
"sagemaker.amazonaws.com/instance-group-type": "Restricted"

0 commit comments

Comments
 (0)