Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tools/pytorchjob-generator/chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
| initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container, see the Kubernetes entry on initContainers for reference. |
| autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
| hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
| serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

### Fault Tolerance
Expand Down
16 changes: 13 additions & 3 deletions tools/pytorchjob-generator/chart/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ metadata:
{{- if ne .Values.terminationGracePeriodSeconds nil }}
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
{{- end }}
{{- if .Values.bypassCoscheduler }}
schedulerName: default-scheduler
{{- if .Values.schedulerName }}
schedulerName: {{ .Values.schedulerName }}
{{- end }}
priorityClassName: {{ .Values.priority }}
affinity:
Expand Down Expand Up @@ -81,8 +81,14 @@ envFrom:
- configMapRef:
name: {{ .Values.ncclGdrEnvConfigMap }}
{{- end }}
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
env:
{{- if eq .Values.schedulerName "sakkara" }}
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
{{- end }}
{{- if .Values.topologyFileConfigMap }}
- name: NCCL_TOPO_FILE
value: /var/run/nvidia-topologyd/virtualTopology.xml
Expand Down Expand Up @@ -146,6 +152,10 @@ command:
#
# User commands
#
{{- if eq .Values.schedulerName "sakkara" }}
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
{{- end }}
{{- range $command := .Values.setupCommands }}
{{ $command }}
{{- end }}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
- emptyDir:
medium: Memory
name: dshm
scheduler can be set:
1: |
apiVersion: workload.codeflare.dev/v1beta2
kind: AppWrapper
metadata:
annotations:
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
labels:
kueue.x-k8s.io/queue-name: default-queue
name: my-job
namespace: my-namespace
spec:
components:
- template:
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
name: my-job
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
Worker:
replicas: 3
restartPolicy: Never
template:
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: autopilot.ibm.com/gpuhealth
operator: NotIn
values:
- ERR
- TESTING
- EVICT
containers:
- command:
- sh
- -c
- |
echo "Environment variables set by the kubeflow training operator:"
echo ${MASTER_ADDR}:${MASTER_PORT}
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
echo My global rank is ${RANK} / ${WORLD_SIZE}
echo "Other injected environment variables:"
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
#
# User commands
#
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
export RANK=$SAKKARA_RANK
git clone https://github.com/dbarnett/python-helloworld
cd python-helloworld
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
env:
- name: SAKKARA_RANK
valueFrom:
fieldRef:
fieldPath: metadata.labels['sakkara.member.rank']
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
imagePullPolicy: IfNotPresent
name: pytorch
resources:
limits:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
requests:
cpu: 500m
memory: 1Gi
nvidia.com/gpu: 8
nvidia.com/roce_gdr: 0
volumeMounts:
- mountPath: /dev/shm
name: dshm
imagePullSecrets: []
priorityClassName: default-priority
schedulerName: sakkara
volumes:
- emptyDir:
medium: Memory
name: dshm
7 changes: 7 additions & 0 deletions tools/pytorchjob-generator/chart/tests/helloworld_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ tests:
path: metadata.namespace
value: testing-ns

- it: scheduler can be set
set:
schedulerName: sakkara
asserts:
- matchSnapshot:
path: spec.components[0].template

- it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
set:
sshGitCloneConfig.secretName: my-git-secret
Expand Down
5 changes: 4 additions & 1 deletion tools/pytorchjob-generator/chart/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@
{ "type": "null" },
{ "type": "array" }
]},
"bypassCoscheduler": { "type": "boolean" },
"schedulerName": { "oneOf": [
{ "type": "null" },
{ "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
]},
"serviceAccountName": { "oneOf" : [
{ "type": "null" },
{ "$ref": "#/$defs/rfc1123Label" }
Expand Down
8 changes: 4 additions & 4 deletions tools/pytorchjob-generator/chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,11 +211,11 @@ hostIgnoreList:
# - a100-large-drlfv-worker-3-with-secondary-nw5qh
# - a100-large-drlfv-worker-3-with-secondary-lb7ch

# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
# to true when explicitly directed to do so by a cluster admin!***
# -- (string) If non-nil, use the specified Kubernetes scheduler.
# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
# @section -- Advanced Options
bypassCoscheduler: false
schedulerName:

# -- (string) Service account to be used for running the Job
# @section -- Advanced Options
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
queueName: default-queue # local queue to submit to (default: default-queue)

schedulerName: sakkara
# If additional constraints are used, specify the configmap here:
#customLabels:
# - key: sakkara.group.name
# value: my-topogrp-0

numPods: 4 # total pod count including master and worker pods (default: 1)
numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)

priority: default-priority # default-priority (default), low-priority, or high-priority

# container image for the pods (required)
containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126

# setup commands to run in each pod (optional)
setupCommands:
- git clone https://github.com/dbarnett/python-helloworld
- cd python-helloworld

# main program to invoke via torchrun (optional)
mainProgram: helloworld.py
Loading