
Commit 7840fa1

dgrove-oss and Lixiang "Eric" Luo authored
Sakkara support (#118)

* Add support for Sakkara

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>

* drop bypassCoscheduler; add test for setting schedulerName

---------

Co-authored-by: Lixiang "Eric" Luo <lgl@users.noreply.github.com>

1 parent 20dd9eb commit 7840fa1

File tree

7 files changed: +205 -9 lines changed

tools/pytorchjob-generator/chart/README.md

Lines changed: 1 addition & 1 deletion

@@ -59,7 +59,7 @@ customize the Jobs generated by the tool.
 | initContainers | array | `nil` | List of "(name, image, command[])" specifying init containers to be run before the main job. The 'command' field is a list of commands to run in the container; see the Kubernetes entry on initContainers for reference. |
 | autopilotHealthChecks | array | No pre-flight checks are enabled. | Autopilot health checks. List of labels enabling one or more system health pre-flight checks. |
 | hostIgnoreList | array | `nil` | List of host names on which the Job must not be scheduled (to avoid faulty nodes). |
-| bypassCoscheduler | boolean | `false` | If true, use the default Kubernetes scheduler instead of the co-scheduler. ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set to true when explicitly directed to do so by a cluster admin!*** |
+| schedulerName | string | `nil` | If non-nil, use the specified Kubernetes scheduler. ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this to any non-nil value should only be done when explicitly directed to do so by a cluster admin!*** |
 | serviceAccountName | string | the default service account for the namespace will be used. | Service account to be used for running the Job |

 ### Fault Tolerance

tools/pytorchjob-generator/chart/templates/_helpers.tpl

Lines changed: 13 additions & 3 deletions

@@ -34,8 +34,8 @@ metadata:
 {{- if ne .Values.terminationGracePeriodSeconds nil }}
 terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
 {{- end }}
-{{- if .Values.bypassCoscheduler }}
-schedulerName: default-scheduler
+{{- if .Values.schedulerName }}
+schedulerName: {{ .Values.schedulerName }}
 {{- end }}
 priorityClassName: {{ .Values.priority }}
 affinity:
@@ -81,8 +81,14 @@ envFrom:
 - configMapRef:
     name: {{ .Values.ncclGdrEnvConfigMap }}
 {{- end }}
-{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
+{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
 env:
+{{- if eq .Values.schedulerName "sakkara" }}
+- name: SAKKARA_RANK
+  valueFrom:
+    fieldRef:
+      fieldPath: metadata.labels['sakkara.member.rank']
+{{- end }}
 {{- if .Values.topologyFileConfigMap }}
 - name: NCCL_TOPO_FILE
   value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -146,6 +152,10 @@ command:
 #
 # User commands
 #
+{{- if eq .Values.schedulerName "sakkara" }}
+echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+export RANK=$SAKKARA_RANK
+{{- end }}
 {{- range $command := .Values.setupCommands }}
 {{ $command }}
 {{- end }}
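
The two Sakkara-gated additions above work together: the downward API exposes the pod label sakkara.member.rank (set by the Sakkara scheduler when it places the pod) as the SAKKARA_RANK environment variable, and the command prologue then overrides the training operator's RANK with it. A minimal standalone sketch of the same pattern, assuming the Sakkara scheduler is installed on the cluster (the pod name, image, and hard-coded label value are illustrative only, not part of this commit):

apiVersion: v1
kind: Pod
metadata:
  name: sakkara-rank-demo        # hypothetical name
  labels:
    sakkara.member.rank: "3"     # in a real cluster, Sakkara sets this label
spec:
  schedulerName: sakkara         # hand the pod to the Sakkara scheduler
  restartPolicy: Never
  containers:
  - name: main
    image: busybox               # placeholder image
    command: ["sh", "-c", "export RANK=$SAKKARA_RANK; echo rank is $RANK"]
    env:
    - name: SAKKARA_RANK         # downward API: label value becomes an env var
      valueFrom:
        fieldRef:
          fieldPath: metadata.labels['sakkara.member.rank']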

tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap

Lines changed: 150 additions & 0 deletions

@@ -1362,3 +1362,153 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
                     - emptyDir:
                         medium: Memory
                       name: dshm
+scheduler can be set:
+  1: |
+    apiVersion: workload.codeflare.dev/v1beta2
+    kind: AppWrapper
+    metadata:
+      annotations:
+        workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.6
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+      name: my-job
+      namespace: my-namespace
+    spec:
+      components:
+      - template:
+          apiVersion: kubeflow.org/v1
+          kind: PyTorchJob
+          metadata:
+            name: my-job
+          spec:
+            pytorchReplicaSpecs:
+              Master:
+                replicas: 1
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
+              Worker:
+                replicas: 3
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
+                        export RANK=$SAKKARA_RANK
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: SAKKARA_RANK
+                        valueFrom:
+                          fieldRef:
+                            fieldPath: metadata.labels['sakkara.member.rank']
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    schedulerName: sakkara
+                    volumes:
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm

tools/pytorchjob-generator/chart/tests/helloworld_test.yaml

Lines changed: 7 additions & 0 deletions

@@ -86,6 +86,13 @@ tests:
       path: metadata.namespace
       value: testing-ns
 
+  - it: scheduler can be set
+    set:
+      schedulerName: sakkara
+    asserts:
+      - matchSnapshot:
+          path: spec.components[0].template
+
   - it: Enabling sshGitConfig injects the envvars, volumes, and volumeMounts
     set:
       sshGitCloneConfig.secretName: my-git-secret
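
Because the new test asserts with matchSnapshot, the expected manifests live in the snapshot file shown above. As a hedged alternative sketch (not part of this commit), helm-unittest's equal assertion could pin just the rendered scheduler; the path assumes the AppWrapper wrapping seen in the snapshot:

  - it: schedulerName is rendered on the pod spec
    set:
      schedulerName: sakkara
    asserts:
      - equal:
          path: spec.components[0].template.spec.pytorchReplicaSpecs.Master.template.spec.schedulerName
          value: sakkara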

tools/pytorchjob-generator/chart/values.schema.json

Lines changed: 4 additions & 1 deletion

@@ -114,7 +114,10 @@
     { "type": "null" },
     { "type": "array" }
   ]},
-  "bypassCoscheduler": { "type": "boolean" },
+  "schedulerName": { "oneOf": [
+    { "type": "null" },
+    { "type": "string", "enum": ["sakkara", "scheduler-plugins-scheduler", "default-scheduler" ] }
+  ]},
   "serviceAccountName": { "oneOf" : [
     { "type": "null" },
     { "$ref": "#/$defs/rfc1123Label" }

tools/pytorchjob-generator/chart/values.yaml

Lines changed: 4 additions & 4 deletions

@@ -211,11 +211,11 @@ hostIgnoreList:
 #  - a100-large-drlfv-worker-3-with-secondary-nw5qh
 #  - a100-large-drlfv-worker-3-with-secondary-lb7ch
 
-# -- (boolean) If true, use the default Kubernetes scheduler instead of the co-scheduler.
-# ***Setting this to true will result in GPU fragmentation on the cluster. It should only be set
-# to true when explicitly directed to do so by a cluster admin!***
+# -- (string) If non-nil, use the specified Kubernetes scheduler.
+# ***Setting this to the default-scheduler may result in GPU fragmentation on the cluster. Setting this
+# to any non-nil value should only be done when explicitly directed to do so by a cluster admin!***
 # @section -- Advanced Options
-bypassCoscheduler: false
+schedulerName:
 
 # -- (string) Service account to be used for running the Job
 # @section -- Advanced Options

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
+queueName: default-queue # local queue to submit to (default: default-queue)
+
+schedulerName: sakkara
+# If additional constraints are used, specify the configmap here:
+#customLabels:
+#  - key: sakkara.group.name
+#    value: my-topogrp-0
+
+numPods: 4 # total pod count including master and worker pods (default: 1)
+numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
+numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
+totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)
+
+priority: default-priority # default-priority (default), low-priority, or high-priority
+
+# container image for the pods (required)
+containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+
+# setup commands to run in each pod (optional)
+setupCommands:
+- git clone https://github.com/dbarnett/python-helloworld
+- cd python-helloworld
+
+# main program to invoke via torchrun (optional)
+mainProgram: helloworld.py
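
A settings file like this drives the generator in the usual way, e.g. helm template -f <settings file> tools/pytorchjob-generator/chart. To attach the job to a Sakkara topology group, the commented-out block above is uncommented; a hedged sketch (my-topogrp-0 is the example's placeholder group name):

customLabels:
  - key: sakkara.group.name   # label Sakkara uses to associate members with a group
    value: my-topogrp-0       # placeholder group name from the comment above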
