Skip to content

Commit e9725a2

Browse files
committed
Add installation script for RIG Worker dependencies
1 parent 30dac6a commit e9725a2

File tree

4 files changed

+333
-0
lines changed

4 files changed

+333
-0
lines changed

helm_chart/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
HyperPodHelmChartForRIG/charts/*/templates/
2+
HyperPodHelmChartForRIG/charts/*.tgz
3+
HyperPodHelmChart/charts/*.tgz
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Chart metadata for hyperpod-helm-chart-for-rig.
apiVersion: v2
name: hyperpod-helm-chart-for-rig
description: A Helm chart for Kubernetes

# Chart kind.
#   application - a set of templates packaged into a versioned, deployable archive.
#   library     - helper functions pulled in by application charts; it defines no
#                 templates of its own and therefore cannot be deployed.
type: application

# Version of the chart itself (Semantic Versioning, https://semver.org/).
# Bump it on every change to the chart or its templates, including appVersion bumps.
version: "0.1.0"

# Version of the application being deployed. It does not have to follow SemVer;
# it should mirror the version the application uses. Kept quoted so it is always
# read as a string.
appVersion: "1.16.0"

# Subcharts resolved by `helm dependency update`. Entries with a `file://`
# repository are charts stored inside this chart's own charts/ directory; the
# rest are fetched from the listed chart repository. `condition` names the
# values path that toggles the dependency.
dependencies:
  - name: training-operators
    version: "0.1.0"
    repository: "file://charts/training-operators"
  - name: nvidia-device-plugin
    version: "0.16.1"
    repository: "https://nvidia.github.io/k8s-device-plugin"
    condition: nvidia-device-plugin.devicePlugin.enabled
  - name: aws-efa-k8s-device-plugin
    version: "0.5.3"
    repository: "https://aws.github.io/eks-charts/"
    condition: aws-efa-k8s-device-plugin.devicePlugin.enabled
  - name: neuron-device-plugin
    version: "0.1.0"
    repository: "file://charts/neuron-device-plugin"
    condition: neuron-device-plugin.devicePlugin.enabled
  - name: health-monitoring-agent
    version: "0.1.0"
    repository: "file://charts/health-monitoring-agent"
    condition: health_monitoring_agent.enabled
  - name: mpi-operator
    version: "0.1.0"
    repository: "file://charts/mpi-operator"
    condition: mpi_operator.enabled
  - name: coredns
    version: "0.1.0"
    repository: "file://charts/coredns"
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
# OVERRIDE values for HyperPodHelmChart for RIG.
2+
# This is a YAML-formatted file.
3+
# Declare variables to be passed into your templates.
4+
5+
# value_safe_name of dependencies - see install_rig_dependencies.sh
6+
#
7+
# Note:
8+
#
9+
# The format is add-on: {keys:...} .
10+
# Helm will AUTOMATICALLY SCOPE the add-on values when resolving each dependency
11+
# In other words, what is exposed to each dependency template is
12+
# {{ .Values.tolerations }} , NOT {{ .Values.add-on.tolerations }}
13+
# Scheduling overrides handed to the coredns dependency chart.
coredns:
  tolerations:
    - effect: NoSchedule
      key: node-role.kubernetes.io/control-plane
    - key: CriticalAddonsOnly
      operator: Exists
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"
25+
26+
# Scheduling overrides handed to the health-monitoring-agent dependency chart.
health-monitoring-agent:
  tolerations:
    # Key-less `Exists` tolerations match every taint with the given effect.
    - effect: NoSchedule
      operator: Exists
    - effect: NoExecute
      operator: Exists
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector: {}
37+
38+
# Scheduling overrides handed to the mpi-operator dependency chart.
mpi-operator:
  tolerations:
    - key: "sagemaker.amazonaws.com/node-health-status"
      operator: "Equal"
      value: "Unschedulable"
      effect: "NoSchedule"
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"
50+
51+
# Scheduling overrides handed to the neuron-device-plugin dependency chart.
neuron-device-plugin:
  tolerations:
    - key: CriticalAddonsOnly
      operator: Exists
    - key: "aws.amazon.com/neuron"
      operator: Exists
      effect: NoSchedule
    - key: "sagemaker.amazonaws.com/node-health-status"
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector: {}
67+
68+
# Scheduling overrides handed to the training-operators dependency chart.
training-operators:
  tolerations:
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector:
    "sagemaker.amazonaws.com/instance-group-type": "Restricted"
76+
77+
# Overrides for externally-maintained Helm charts
78+
# Overrides handed to the aws-efa-k8s-device-plugin dependency chart.
# NOTE(review): the original indentation was not recoverable from this view; the
# nesting below assumes `supportedInstanceLabels`, `tolerations` and
# `nodeSelector` are siblings of `devicePlugin` - confirm against the chart.
aws-efa-k8s-device-plugin:
  devicePlugin:
    # Read by the `condition` of this dependency in Chart.yaml.
    enabled: true
  supportedInstanceLabels:
    values:
      - ml.c5n.9xlarge
      - ml.c5n.18xlarge
      - ml.g5.8xlarge
      - ml.g5.12xlarge
      - ml.g5.16xlarge
      - ml.g5.24xlarge
      - ml.g5.48xlarge
      - ml.g6.8xlarge
      - ml.g6.12xlarge
      - ml.g6.16xlarge
      - ml.g6.24xlarge
      - ml.g6.48xlarge
      - ml.g6e.8xlarge
      - ml.g6e.12xlarge
      - ml.g6e.16xlarge
      - ml.g6e.24xlarge
      - ml.g6e.48xlarge
      - ml.gr6.8xlarge
      - ml.i3en.large
      - ml.i3en.xlarge
      - ml.i3en.2xlarge
      - ml.i3en.3xlarge
      - ml.i3en.6xlarge
      - ml.i3en.12xlarge
      - ml.i3en.24xlarge
      - ml.m7i.large
      - ml.m7i.xlarge
      - ml.m7i.2xlarge
      - ml.m7i.4xlarge
      - ml.m7i.8xlarge
      - ml.m7i.12xlarge
      - ml.m7i.16xlarge
      - ml.m7i.24xlarge
      - ml.m7i.48xlarge
      - ml.p4d.24xlarge
      - ml.p4de.24xlarge
      - ml.p5.48xlarge
      - ml.p5e.48xlarge
      - ml.p5en.48xlarge
      - ml.r7i.large
      - ml.r7i.xlarge
      - ml.r7i.2xlarge
      - ml.r7i.4xlarge
      - ml.r7i.8xlarge
      - ml.r7i.12xlarge
      - ml.r7i.16xlarge
      - ml.r7i.24xlarge
      - ml.r7i.48xlarge
      - ml.trn1.32xlarge
      - ml.trn1n.32xlarge
      - ml.trn2.48xlarge
  tolerations:
    - key: CriticalAddonsOnly
      operator: Exists
    - effect: NoSchedule
      key: aws.amazon.com/efa
      operator: Exists
    - key: sagemaker.amazonaws.com/node-health-status
      operator: "Equal"
      value: "Unschedulable"
      effect: "NoSchedule"
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector: {}
149+
150+
# Overrides handed to the nvidia-device-plugin dependency chart.
# NOTE(review): the original indentation was not recoverable from this view; the
# nesting below assumes `affinity`, `tolerations` and `nodeSelector` are
# siblings of `devicePlugin` - confirm against the chart.
nvidia-device-plugin:
  devicePlugin:
    # Read by the `condition` of this dependency in Chart.yaml.
    enabled: true
  allowDefaultNamespace: true
  namespaceOverride: "kube-system"
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              # Restrict scheduling to the listed instance types.
              - key: node.kubernetes.io/instance-type
                operator: In
                values:
                  - ml.g5.xlarge
                  - ml.g5.2xlarge
                  - ml.g5.4xlarge
                  - ml.g5.8xlarge
                  - ml.g5.12xlarge
                  - ml.g5.16xlarge
                  - ml.g5.24xlarge
                  - ml.g5.48xlarge
                  - ml.g6.xlarge
                  - ml.g6.2xlarge
                  - ml.g6.4xlarge
                  - ml.g6.8xlarge
                  - ml.g6.16xlarge
                  - ml.g6.12xlarge
                  - ml.g6.24xlarge
                  - ml.g6.48xlarge
                  - ml.g6e.xlarge
                  - ml.g6e.2xlarge
                  - ml.g6e.4xlarge
                  - ml.g6e.8xlarge
                  - ml.g6e.12xlarge
                  - ml.g6e.16xlarge
                  - ml.g6e.24xlarge
                  - ml.g6e.48xlarge
                  - ml.gr6.4xlarge
                  - ml.gr6.8xlarge
                  - ml.p4d.24xlarge
                  - ml.p4de.24xlarge
                  - ml.p5.48xlarge
                  - ml.p5e.48xlarge
                  - ml.p5en.48xlarge
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
    - key: sagemaker.amazonaws.com/node-health-status
      operator: Equal
      value: Unschedulable
      effect: NoSchedule
    # Kubernetes rejects a toleration that combines `operator: Exists` with a
    # non-empty `value`, so the taint value is matched explicitly with `Equal`.
    - effect: NoSchedule
      operator: Equal
      key: "sagemaker.amazonaws.com/RestrictedNode"
      value: "Worker"
  nodeSelector: {}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/bin/bash

# Chart tree the add-on charts are copied from.
SRC_DIR="HyperPodHelmChart"
# Chart tree the generated RIG dependency charts are written to.
OUTPUT_DIR="HyperPodHelmChartForRIG"

# Add-ons to process, one "<scope>,<namespace>,<name>" entry each.
#   scope     - "eks": the manifest is read from the live cluster with kubectl;
#               "hp" : the manifest is rendered from $SRC_DIR/charts/<name> with helm.
#   namespace - namespace used for the kubectl lookup.
#   name      - Kubernetes deployment name (eks) or chart directory name (hp).
# Format: "<eks|hp>,namespace,<k8s_name|chart_dir>"
add_ons=(
    "eks,kube-system,coredns"
    "hp,kube-system,mpi-operator"
    "hp,kube-system,neuron-device-plugin"
    "hp,kube-system,health-monitoring-agent"
    "hp,kube-system,training-operators"
)
14+
15+
# Turn manifests read from stdin into a Helm template on stdout.
#
# Keeps only Deployment/DaemonSet documents, overwrites their pod-spec
# nodeSelector and tolerations with placeholder strings, then swaps each
# placeholder for a `toYaml ... | indent 8` expression so the values come from
# the chart's .Values at render time.
# NOTE(review): the `\n` in the sed replacement is presumably relied on to emit
# a newline (GNU sed behaviour) - confirm on the platforms this script targets.
_rig_templatize_scheduling() {
    yq 'select(.kind == "Deployment" or .kind == "DaemonSet")' - | yq e '
        .spec.template.spec.nodeSelector = "NODESELECTORS" |
        .spec.template.spec.tolerations = "TOLERATIONS"
    ' - | \
    sed 's/NODESELECTORS/\n{{ toYaml (index .Values "nodeSelector") | indent 8 }}/' |
    sed 's/TOLERATIONS/\n{{ toYaml (index .Values "tolerations" ) | indent 8 }}/'
}

# Rebuild $OUTPUT_DIR/charts with one chart per add-on whose single template has
# an overridable nodeSelector and tolerations.
#
# Arguments:
#   $1 - name of an array, passed as "name[@]", holding
#        "<scope>,<namespace>,<name>" entries.
#
# For scope "eks" the manifest comes from `kubectl get deployment` and a minimal
# Chart.yaml is generated; for any other scope the manifest is rendered with
# `helm template` from $SRC_DIR/charts/<name>.
fetch_yaml_and_enable_overrides() {
    local resources=("${!1}")
    local chart_dir

    rm -rf "$OUTPUT_DIR/charts"
    # Recreate the parent right away: `cp -r` below cannot create a chart
    # directory inside a parent that no longer exists.
    mkdir -p "$OUTPUT_DIR/charts"

    for resource in "${resources[@]}"; do
        IFS=',' read -r scope namespace name <<< "$resource"
        echo "Processing $scope add-on called $name in namespace $namespace..."

        value_safe_name=${name//-/_}  # Convert hyphens to underscores
        chart_dir="$OUTPUT_DIR/charts/$name"

        # Start from the source chart, then drop its templates and packaged
        # archives so only the generated template remains.
        cp -r "$SRC_DIR/charts/$name" "$chart_dir"
        rm -rf "$chart_dir/templates"
        rm -f "$chart_dir"/*.tgz
        mkdir -p "$chart_dir/templates"

        if [ "$scope" = "eks" ]; then
            kubectl get deployment "$name" -n "$namespace" -o yaml | \
                _rig_templatize_scheduling > "$chart_dir/templates/$name.yml"

            # appVersion is quoted so YAML reads it as the string "1.0" rather
            # than a float.
            cat << EOF > "$chart_dir/Chart.yaml"
apiVersion: v2
name: $name
version: 0.1.0
appVersion: "1.0"
description: A Helm chart for setting up $name in RIG Workers
EOF
        else
            helm template "$name" "$SRC_DIR/charts/$name" -f "$SRC_DIR/values.yaml" -f "$SRC_DIR/charts/$name/values.yaml" --debug | \
                _rig_templatize_scheduling > "$chart_dir/templates/$name.yml"
        fi
    done
}
62+
63+
# The script shells out to yq, helm and kubectl; stop early, with the message on
# stderr, when any of them is missing.
for required_tool in yq helm kubectl; do
    if ! command -v "$required_tool" &> /dev/null; then
        echo "Error: $required_tool is required but not installed." >&2
        exit 1
    fi
done

fetch_yaml_and_enable_overrides add_ons[@]

# This needs to be run after any dependency template change before "helm <template | install>"
helm dependencies update "./$OUTPUT_DIR" || exit 1

# Render the umbrella chart once as a check; do not report success if it fails.
helm template rig-dependencies "./$OUTPUT_DIR" --namespace kube-system -f "./$OUTPUT_DIR/values.yaml" || exit 1
echo "Templates generated in $OUTPUT_DIR"
echo ""
echo ""

0 commit comments

Comments
 (0)