@@ -20,12 +20,14 @@ set_script_variables() {
2020 exit 1
2121 fi
2222 TRAINING_OPERATORS=$STANDARD_HELM_RELEASE_NAME -training-operators
23+ MPI_OPERATOR=$STANDARD_HELM_RELEASE_NAME -mpi-operator
2324 EFA=$STANDARD_HELM_RELEASE_NAME -aws-efa-k8s-device-plugin
2425 PATCH_ONLY=(
2526 #
2627 # These objects do not need entirely separate YAML; we just need to patch them to make them work with RIG
2728 #
2829 " $TRAINING_OPERATORS "
30+ " $MPI_OPERATOR "
2931 " $EFA "
3032 )
3133 add_ons=(
@@ -34,7 +36,7 @@ set_script_variables() {
3436 #
3537 " eks,kube-system,aws-node,daemonset"
3638 " eks,kube-system,coredns,deployment"
37- # "hp,kube-system,mpi-operator ,deployment"
39+ " hp,kube-system,$MPI_OPERATOR ,deployment"
3840 # "hp,kube-system,neuron-device-plugin,daemonset"
3941 " hp,kubeflow,$TRAINING_OPERATORS ,deployment"
4042 " hp,kube-system,$EFA ,daemonset"
@@ -286,6 +288,65 @@ override_training_operators() {
286288 ]'
287289}
288290
override_mpi_operator() {
  #####################################################
  # Patch the mpi-operator Deployment so it can also run on RIG nodes.
  #
  # The mpi-operator dependency needs to be present to schedule MPI
  # jobs but, by default, only tolerates non-RIG nodes.
  #
  # Therefore it needs to tolerate the RIG node taint, but still
  # prefer scheduling onto non-RIG nodes in case the cluster
  # consists of both non-RIG and RIG nodes.
  #
  # NOTE: this is based on the original Helm installation of the
  #       mpi-operator Deployment.
  #       There are no affinities, but there are tolerations,
  #       as of commit
  #       https://github.com/aws/sagemaker-hyperpod-cli/blob/d2130e919f3a53ad1cbacf4759edecbbbcdeda0b/helm_chart/HyperPodHelmChart/charts/mpi-operator/values.yaml#L20-L24
  #       The patch relies on that layout: the toleration is appended
  #       to the existing list ("/tolerations/-") and the affinity is
  #       added as a new member.
  #
  # Globals:
  #   MPI_OPERATOR (read) - name of the Deployment in kube-system
  #   DATETIME     (read) - timestamp stored in the patch annotation
  # Arguments:
  #   None
  # Returns:
  #   Exit status of `kubectl patch`.
  #####################################################

  # Using kubectl directly since this is a relatively simple patch
  # that does not require a new separate file/deployment specific to RIG.
  #
  # The last operation records the annotation
  # "rig.hyperpod.patch/mpi-operator" ("~1" is the JSON Pointer escape
  # for "/"), which the installer checks to skip re-patching.
  #
  # $DATETIME is spliced in double-quoted so the whole payload always
  # stays a single argument, even if the value contains whitespace.
  local patch
  patch='[
    {
      "op": "add",
      "path": "/spec/template/spec/tolerations/-",
      "value": {
        "key": "sagemaker.amazonaws.com/RestrictedNode",
        "value": "Worker",
        "effect": "NoSchedule"
      }
    },
    {
      "op": "add",
      "path": "/spec/template/spec/affinity",
      "value": {
        "nodeAffinity": {
          "preferredDuringSchedulingIgnoredDuringExecution": [
            {
              "weight": 100,
              "preference": {
                "matchExpressions": [
                  {
                    "key": "sagemaker.amazonaws.com/instance-group-type",
                    "operator": "NotIn",
                    "values": ["Restricted"]
                  }
                ]
              }
            }
          ]
        }
      }
    },
    {
      "op": "add",
      "path": "/metadata/annotations/rig.hyperpod.patch~1mpi-operator",
      "value": "{\"timestamp\": \"'"$DATETIME"'\"}"
    }
  ]'

  kubectl patch deployment "$MPI_OPERATOR" -n kube-system --type=json -p="$patch"
}
349+
289350override_efa () {
290351 # ####################################################
291352 # aws-efa-k8s-device-plugin dependency needs to be present
@@ -452,7 +513,7 @@ confirm_installation_with_user() {
452513 echo " 🔧 Installing Helm chart..."
453514 helm upgrade --install $RIG_HELM_RELEASE ./HyperPodHelmChartForRIG --namespace kube-system -f ./HyperPodHelmChartForRIG/values.yaml
454515 if [ $? -ne 0 ]; then
455- echo " RIG Helm Installation Failed. Exiting (0/4 steps completed)..."
516+ echo " RIG Helm Installation Failed. Exiting (0/5 steps completed)..."
456517 return 1
457518 fi
458519
@@ -461,7 +522,7 @@ confirm_installation_with_user() {
461522 if [ " $patched " = " false" ]; then
462523 kubectl apply -f HyperPodHelmChartForRIG/charts/aws-node/templates/daemonset.nonrig.yaml -n kube-system
463524 if [ $? -ne 0 ]; then
464- echo " RIG Helm Installation Failed (aws-node). Exiting (only 1/4 steps completed)..."
525+ echo " RIG Helm Installation Failed (aws-node). Exiting (only 1/5 steps completed)..."
465526 return 1
466527 fi
467528 else
@@ -473,27 +534,39 @@ confirm_installation_with_user() {
473534 if [ " $patched " = " false" ]; then
474535 override_training_operators
475536 if [ $? -ne 0 ]; then
476- echo " RIG Helm Installation Failed (training-operator). Exiting (only 2/4 steps completed)..."
537+ echo " RIG Helm Installation Failed (training-operator). Exiting (only 2/5 steps completed)..."
477538 return 1
478539 fi
479540 else
480541 echo " Found annotation 'rig.hyperpod.patch/training-operators'. Skipping patching for RIG..."
481542 fi
482543
544+ # mpi-operator needs specific patch
545+ patched=$( kubectl get deployments $MPI_OPERATOR -n kube-system -o yaml | yq e ' .metadata.annotations | has("rig.hyperpod.patch/mpi-operator")' -)
546+ if [ " $patched " = " false" ]; then
547+ override_mpi_operator
548+ if [ $? -ne 0 ]; then
549+ echo " RIG Helm Installation Failed (mpi-operator). Exiting (only 3/5 steps completed)..."
550+ return 1
551+ fi
552+ else
553+ echo " Found annotation 'rig.hyperpod.patch/mpi-operator'. Skipping patching for RIG..."
554+ fi
555+
483556 # efa needs specific patch
484557 patched=$( kubectl get daemonset $EFA -n kube-system -o yaml | yq e ' .metadata.annotations | has("rig.hyperpod.patch/aws-efa-k8s-device-plugin")' -)
485558 if [ " $patched " = " false" ]; then
486559 override_efa
487560 if [ $? -ne 0 ]; then
488- echo " RIG Helm Installation Failed (aws-efa-k8s-device-plugin). Exiting (only 3/4 steps completed)..."
561+ echo " RIG Helm Installation Failed (aws-efa-k8s-device-plugin). Exiting (only 4/5 steps completed)..."
489562 return 1
490563 fi
491564 else
492565 echo " Found annotation 'rig.hyperpod.patch/aws-efa-k8s-device-plugin'. Skipping patching for RIG..."
493566 fi
494567
495568 echo " "
496- echo " ✅ RIG Helm Installation Succeeded (4/4 steps completed)."
569+ echo " ✅ RIG Helm Installation Succeeded (5/5 steps completed)."
497570 echo " "
498571
499572 # Warn user about CNI start up
@@ -589,6 +662,8 @@ main() {
589662 ensure_yq_installed
590663
591664 set_script_variables
665+
666+ assert_not_already_installed
592667
593668 assert_not_already_installed
594669
0 commit comments