Skip to content

Commit 61d18a6

Browse files
authored
Patch MPI operator installation for RIGs (#161)
1 parent 9fd8d42 commit 61d18a6

File tree

1 file changed

+81
-6
lines changed

1 file changed

+81
-6
lines changed

helm_chart/install_rig_dependencies.sh

Lines changed: 81 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@ set_script_variables() {
2020
exit 1
2121
fi
2222
TRAINING_OPERATORS=$STANDARD_HELM_RELEASE_NAME-training-operators
23+
MPI_OPERATOR=$STANDARD_HELM_RELEASE_NAME-mpi-operator
2324
EFA=$STANDARD_HELM_RELEASE_NAME-aws-efa-k8s-device-plugin
2425
PATCH_ONLY=(
2526
#
2627
# These objects do not need entirely separate YAML; we just need to patch them to make them work with RIG
2728
#
2829
"$TRAINING_OPERATORS"
30+
"$MPI_OPERATOR"
2931
"$EFA"
3032
)
3133
add_ons=(
@@ -34,7 +36,7 @@ set_script_variables() {
3436
#
3537
"eks,kube-system,aws-node,daemonset"
3638
"eks,kube-system,coredns,deployment"
37-
#"hp,kube-system,mpi-operator,deployment"
39+
"hp,kube-system,$MPI_OPERATOR,deployment"
3840
#"hp,kube-system,neuron-device-plugin,daemonset"
3941
"hp,kubeflow,$TRAINING_OPERATORS,deployment"
4042
"hp,kube-system,$EFA,daemonset"
@@ -286,6 +288,65 @@ override_training_operators() {
286288
]'
287289
}
288290

291+
override_mpi_operator() {
    #####################################################
    # Patches the mpi-operator Deployment (kube-system) for RIG.
    #
    # The mpi-operator dependency needs to be present
    # to schedule MPI jobs, but by default it only
    # tolerates non-RIG nodes.
    #
    # Therefore, it needs to tolerate the RIG node taint,
    # but should still prefer scheduling onto non-RIG nodes
    # in case the cluster consists of both non-RIG and RIG.
    #
    # NOTE: this is based on the original Helm installation
    #       of the mpi-operator Deployment.
    #       There are no affinities, but there are tolerations
    #       as of commit
    #       https://github.com/aws/sagemaker-hyperpod-cli/blob/d2130e919f3a53ad1cbacf4759edecbbbcdeda0b/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml#L20-L24
    #
    # Globals:
    #   MPI_OPERATOR (read) - name of the Deployment to patch
    #   DATETIME     (read) - timestamp recorded in the patch annotation
    # Returns:
    #   1 if MPI_OPERATOR is empty/unset, otherwise the exit status of
    #   `kubectl patch`
    #####################################################

    # Without a Deployment name kubectl would be invoked with a missing
    # resource argument; fail early with a clear message instead.
    if [ -z "${MPI_OPERATOR:-}" ]; then
        echo "MPI_OPERATOR is not set; cannot patch the mpi-operator deployment." >&2
        return 1
    fi

    # Using kubectl directly since this is a relatively simple patch
    # that does not require a new separate file/deployment specific to RIG.
    #
    # JSON Patch operations, in order:
    #   1. append the RIG taint toleration to the existing tolerations list
    #      ("/-" means "end of array", so the list must already exist)
    #   2. add a *preferred* node affinity away from Restricted instance groups
    #   3. add the "rig.hyperpod.patch/mpi-operator" annotation
    #      ("~1" is the JSON Pointer escape for "/")
    #
    # "$DATETIME" is spliced in between single-quoted segments and must be
    # double-quoted so a value containing whitespace or glob characters
    # cannot split the -p argument into multiple words.
    kubectl patch deployment "$MPI_OPERATOR" -n kube-system --type=json -p='[
        {
            "op": "add",
            "path": "/spec/template/spec/tolerations/-",
            "value": {
                "key": "sagemaker.amazonaws.com/RestrictedNode",
                "value": "Worker",
                "effect": "NoSchedule"
            }
        },
        {
            "op": "add",
            "path": "/spec/template/spec/affinity",
            "value": {
                "nodeAffinity": {
                    "preferredDuringSchedulingIgnoredDuringExecution": [
                        {
                            "weight": 100,
                            "preference": {
                                "matchExpressions": [
                                    {
                                        "key": "sagemaker.amazonaws.com/instance-group-type",
                                        "operator": "NotIn",
                                        "values": ["Restricted"]
                                    }
                                ]
                            }
                        }
                    ]
                }
            }
        },
        {
            "op": "add",
            "path": "/metadata/annotations/rig.hyperpod.patch~1mpi-operator",
            "value": "{\"timestamp\": \"'"$DATETIME"'\"}"
        }
    ]'
}
349+
289350
override_efa() {
290351
#####################################################
291352
# aws-efa-k8s-device-plugin dependency needs to be present
@@ -452,7 +513,7 @@ confirm_installation_with_user() {
452513
echo "🔧 Installing Helm chart..."
453514
helm upgrade --install $RIG_HELM_RELEASE ./HyperPodHelmChartForRIG --namespace kube-system -f ./HyperPodHelmChartForRIG/values.yaml
454515
if [ $? -ne 0 ]; then
455-
echo "RIG Helm Installation Failed. Exiting (0/4 steps completed)..."
516+
echo "RIG Helm Installation Failed. Exiting (0/5 steps completed)..."
456517
return 1
457518
fi
458519

@@ -461,7 +522,7 @@ confirm_installation_with_user() {
461522
if [ "$patched" = "false" ]; then
462523
kubectl apply -f HyperPodHelmChartForRIG/charts/aws-node/templates/daemonset.nonrig.yaml -n kube-system
463524
if [ $? -ne 0 ]; then
464-
echo "RIG Helm Installation Failed (aws-node). Exiting (only 1/4 steps completed)..."
525+
echo "RIG Helm Installation Failed (aws-node). Exiting (only 1/5 steps completed)..."
465526
return 1
466527
fi
467528
else
@@ -473,27 +534,39 @@ confirm_installation_with_user() {
473534
if [ "$patched" = "false" ]; then
474535
override_training_operators
475536
if [ $? -ne 0 ]; then
476-
echo "RIG Helm Installation Failed (training-operator). Exiting (only 2/4 steps completed)..."
537+
echo "RIG Helm Installation Failed (training-operator). Exiting (only 2/5 steps completed)..."
477538
return 1
478539
fi
479540
else
480541
echo "Found annotation 'rig.hyperpod.patch/training-operators'. Skipping patching for RIG..."
481542
fi
482543

544+
# mpi-operator needs specific patch
545+
patched=$(kubectl get deployments $MPI_OPERATOR -n kube-system -o yaml | yq e '.metadata.annotations | has("rig.hyperpod.patch/mpi-operator")' -)
546+
if [ "$patched" = "false" ]; then
547+
override_mpi_operator
548+
if [ $? -ne 0 ]; then
549+
echo "RIG Helm Installation Failed (mpi-operator). Exiting (only 3/5 steps completed)..."
550+
return 1
551+
fi
552+
else
553+
echo "Found annotation 'rig.hyperpod.patch/mpi-operator'. Skipping patching for RIG..."
554+
fi
555+
483556
# efa needs specific patch
484557
patched=$(kubectl get daemonset $EFA -n kube-system -o yaml | yq e '.metadata.annotations | has("rig.hyperpod.patch/aws-efa-k8s-device-plugin")' -)
485558
if [ "$patched" = "false" ]; then
486559
override_efa
487560
if [ $? -ne 0 ]; then
488-
echo "RIG Helm Installation Failed (aws-efa-k8s-device-plugin). Exiting (only 3/4 steps completed)..."
561+
echo "RIG Helm Installation Failed (aws-efa-k8s-device-plugin). Exiting (only 4/5 steps completed)..."
489562
return 1
490563
fi
491564
else
492565
echo "Found annotation 'rig.hyperpod.patch/aws-efa-k8s-device-plugin'. Skipping patching for RIG..."
493566
fi
494567

495568
echo ""
496-
echo "✅ RIG Helm Installation Succeeded (4/4 steps completed)."
569+
echo "✅ RIG Helm Installation Succeeded (5/5 steps completed)."
497570
echo ""
498571

499572
# Warn user about CNI start up
@@ -589,6 +662,8 @@ main() {
589662
ensure_yq_installed
590663

591664
set_script_variables
665+
666+
assert_not_already_installed
592667

593668
assert_not_already_installed
594669

0 commit comments

Comments
 (0)