Skip to content

Commit 02213ef

Browse files
dgrove-oss and Lixiang "Eric" Luo committed
Add support for Sakkara
Co-authored-by: Lixiang "Eric" Luo <lgl@@users.noreply.github.com>
1 parent 20dd9eb commit 02213ef

File tree

3 files changed

+44
-2
lines changed

3 files changed

+44
-2
lines changed

tools/pytorchjob-generator/chart/templates/_helpers.tpl

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ metadata:
3535
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
3636
{{- end }}
3737
{{- if .Values.bypassCoscheduler }}
38-
schedulerName: default-scheduler
38+
schedulerName: {{ .Values.schedulerName }}
3939
{{- end }}
4040
priorityClassName: {{ .Values.priority }}
4141
affinity:
@@ -81,8 +81,14 @@ envFrom:
8181
- configMapRef:
8282
name: {{ .Values.ncclGdrEnvConfigMap }}
8383
{{- end }}
84-
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
84+
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
8585
env:
86+
{{- if eq .Values.schedulerName "sakkara" }}
87+
- name: SAKKARA_RANK
88+
valueFrom:
89+
fieldRef:
90+
fieldPath: metadata.labels['sakkara.member.rank']
91+
{{- end }}
8692
{{- if .Values.topologyFileConfigMap }}
8793
- name: NCCL_TOPO_FILE
8894
value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -146,6 +152,10 @@ command:
146152
#
147153
# User commands
148154
#
155+
{{- if eq .Values.schedulerName "sakkara" }}
156+
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
157+
export RANK=$SAKKARA_RANK
158+
{{- end }}
149159
{{- range $command := .Values.setupCommands }}
150160
{{ $command }}
151161
{{- end }}

tools/pytorchjob-generator/chart/values.schema.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,10 @@
115115
{ "type": "array" }
116116
]},
117117
"bypassCoscheduler": { "type": "boolean" },
118+
"schedulerName": { "oneOf": [
119+
{ "type": "null" },
120+
{ "type": "string", "enum": ["sakkara", "default-scheduler" ] }
121+
]},
118122
"serviceAccountName": { "oneOf" : [
119123
{ "type": "null" },
120124
{ "$ref": "#/$defs/rfc1123Label" }
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
namespace: my-namespace # namespace to deploy to (required)
2+
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
3+
queueName: default-queue # local queue to submit to (default: default-queue)
4+
5+
bypassCoscheduler: true
6+
schedulerName: sakkara
7+
# If additional constraints are used, specify the configmap here:
8+
#customLabels:
9+
# - key: sakkara.group.name
10+
# value: my-topogrp-0
11+
12+
numPods: 4 # total pod count including master and worker pods (default: 1)
13+
numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
14+
numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
15+
totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)
16+
17+
priority: default-priority # default-priority (default), low-priority, or high-priority
18+
19+
# container image for the pods (required)
20+
containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
21+
22+
# setup commands to run in each pod (optional)
23+
setupCommands:
24+
- git clone https://github.com/dbarnett/python-helloworld
25+
- cd python-helloworld
26+
27+
# main program to invoke via torchrun (optional)
28+
mainProgram: helloworld.py

0 commit comments

Comments
 (0)