
Commit 7840fa1

dgrove-oss and Lixiang "Eric" Luo authored
Sakkara support (#118)

* Add support for Sakkara

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>

* drop bypassCoscheduler; add test for setting schedulerName

---------

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>

1 parent 20dd9eb commit 7840fa1

File tree

7 files changed: +205 -9 lines changed

tools/pytorchjob-generator/chart/README.md

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
 | initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container; see the Kubernetes entry on initContainers for reference. |
 | autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
 | hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
-| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
+| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
 | serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

 ### Fault Tolerance

tools/pytorchjob-generator/chart/templates/_helpers.tpl

Lines changed: 13 additions & 3 deletions

@@ -34,8 +34,8 @@ metadata:
 {{- if ne .Values.terminationGracePeriodSeconds nil }}
 terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
 {{- end }}
-{{- if .Values.bypassCoscheduler }}
-schedulerName: default-scheduler
+{{- if .Values.schedulerName }}
+schedulerName: {{ .Values.schedulerName }}
 {{- end }}
 priorityClassName: {{ .Values.priority }}
 affinity:
@@ -81,8 +81,14 @@ envFrom:
 - configMapRef:
     name: {{ .Values.ncclGdrEnvConfigMap }}
 {{- end }}
-{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
+{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
 env:
+{{- if eq .Values.schedulerName "sakkara" }}
+- name: SAKKARA_RANK
+  valueFrom:
+    fieldRef:
+      fieldPath: metadata.labels['sakkara.member.rank']
+{{- end }}
 {{- if .Values.topologyFileConfigMap }}
 - name: NCCL_TOPO_FILE
   value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -146,6 +152,10 @@ command:
 #
 # User commands
 #
+{{- if eq .Values.schedulerName "sakkara" }}
+echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+export RANK=$SAKKARA_RANK
+{{- end }}
 {{- range $command := .Values.setupCommands }}
 {{ $command }}
 {{- end }}
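
The two Sakkara-gated additions above work together: the downward API exposes the pod label sakkara.member.rank (set by the Sakkara scheduler when it places the pod) as the SAKKARA_RANK environment variable, and the command prologue then overrides the training operator's RANK with it. A minimal standalone sketch of the same pattern, assuming the Sakkara scheduler is installed on the cluster (the pod name, image, and hard-coded label value are illustrative only, not part of this commit):

apiVersion: v1
kind: Pod
metadata:
  name: sakkara-rank-demo        # hypothetical name
  labels:
    sakkara.member.rank: "3"     # in a real cluster, Sakkara sets this label
spec:
  schedulerName: sakkara         # hand the pod to the Sakkara scheduler
  restartPolicy: Never
  containers:
  - name: main
    image: busybox               # placeholder image
    command: ["sh", "-c", "export RANK=$SAKKARA_RANK; echo rank is $RANK"]
    env:
    - name: SAKKARA_RANK         # downward API: label value becomes an env var
      valueFrom:
        fieldRef:
          fieldPath: metadata.labels['sakkara.member.rank']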

tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap

Lines changed: 150 additions & 0 deletions

@@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
                     - emptyDir:
                         medium: Memory
                       name: dshm
+scheduler can be set:
+  1: |
+    apiVersion: workload.codeflare.dev/v1beta2
+    kind: AppWrapper
+    metadata:
+      annotations:
+        workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+      name: my-job
+      namespace: my-namespace
+    spec:
+      components:
+      - template:
+          apiVersion: kubeflow.org/v1
+          kind: PyTorchJob
+          metadata:
+            name: my-job
+          spec:
+            pytorchReplicaSpecs:
+              Master:
+                replicas: 1
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
+              Worker:
+                replicas: 3
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm

tools/pytorchjob-generator/chart/tests/helloworld_test.yaml

Lines changed: 7 additions & 0 deletions

@@ -86,6 +86,13 @@ tests:
       path: metadata.namespace
       value: testing-ns
 
+  - it: scheduler can be set
+    set:
+      schedulerName: sakkara
+    asserts:
+      - matchSnapshot:
+          path: spec.components[0].template
+
   - it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
     set:
       sshGitCloneConfig.secretName: my-git-secret
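
Because the new test asserts with matchSnapshot, the expected manifests live in the snapshot file shown above. As a hedged alternative sketch (not part of this commit), helm-unittest's equal assertion could pin just the rendered scheduler; the path assumes the AppWrapper wrapping seen in the snapshot:

  - it: schedulerName is rendered on the pod spec
    set:
      schedulerName: sakkara
    asserts:
      - equal:
          path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.schedulerName
          value: sakkara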

tools/pytorchjob-generator/chart/values.schema.json

Lines changed: 4 additions & 1 deletion

@@ -114,7 +114,10 @@
     { "type": "null" },
     { "type": "array" }
   ]},
-  "bypassCoscheduler": { "type": "boolean" },
+  "schedulerName": { "oneOf": [
+    { "type": "null" },
+    { "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
+  ]},
   "serviceAccountName": { "oneOf" : [
     { "type": "null" },
     { "$ref": "#/$defs/rfc1123Label" }

tools/pytorchjob-generator/chart/values.yaml

Lines changed: 4 additions & 4 deletions

@@ -211,11 +211,11 @@ hostIgnoreList:
 #  - a100-large-drlfv-worker-3-with-secondary-nw5qh
 #  - a100-large-drlfv-worker-3-with-secondary-lb7ch
 
-# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
-# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
-# to true when explicitly directed to do so by a cluster admin!***
+# -- (string) If non-nil, use the specified Kubernetes scheduler.
+# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
+# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
 # @section -- Advanced Options
-bypassCoscheduler: false
+schedulerName:
 
 # -- (string) Service account to be used for running the Job
 # @section -- Advanced Options

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
+queueName: default-queue # local queue to submit to (default: default-queue)
+
+schedulerName: sakkara
+# If additional constraints are used, specify the configmap here:
+#customLabels:
+#  - key: sakkara.group.name
+#    value: my-topogrp-0
+
+numPods: 4 # total pod count including master and worker pods (default: 1)
+numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
+numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
+totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)
+
+priority: default-priority # default-priority (default), low-priority, or high-priority
+
+# container image for the pods (required)
+containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+
+# setup commands to run in each pod (optional)
+setupCommands:
+- git clone https://github.com/dbarnett/python-helloworld
+- cd python-helloworld
+
+# main program to invoke via torchrun (optional)
+mainProgram: helloworld.py
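
A settings file like this drives the generator in the usual way, e.g. helm template -f <settings file> tools/pytorchjob-generator/chart. To attach the job to a Sakkara topology group, the commented-out block above is uncommented; a hedged sketch (my-topogrp-0 is the example's placeholder group name):

customLabels:
  - key: sakkara.group.name   # label Sakkara uses to associate members with a group
    value: my-topogrp-0       # placeholder group name from the comment above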
