Skip to content

Commit 397ea3b

Browse files
committed
Add Sakkara support to MLBatch
1 parent 58b9291 commit 397ea3b

File tree

3 files changed

+44
-2
lines changed

3 files changed

+44
-2
lines changed

tools/pytorchjob-generator/chart/templates/_helpers.tpl

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ annotations:
3333
terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
3434
{{- end }}
3535
{{- if .Values.bypassCoscheduler }}
36-
schedulerName: default-scheduler
36+
schedulerName: {{ .Values.schedulerName }}
3737
{{- end }}
3838
priorityClassName: {{ .Values.priority }}
3939
affinity:
@@ -79,8 +79,14 @@ envFrom:
7979
- configMapRef:
8080
name: {{ .Values.ncclGdrEnvConfigMap }}
8181
{{- end }}
82-
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap }}
82+
{{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) }}
8383
env:
84+
{{- if eq .Values.schedulerName "sakkara" }}
85+
- name: SAKKARA_RANK
86+
valueFrom:
87+
fieldRef:
88+
fieldPath: metadata.labels['sakkara.member.rank']
89+
{{- end }}
8490
{{- if .Values.topologyFileConfigMap }}
8591
- name: NCCL_TOPO_FILE
8692
value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -144,6 +150,10 @@ command:
144150
#
145151
# User commands
146152
#
153+
{{- if eq .Values.schedulerName "sakkara" }}
154+
echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
155+
export RANK=$SAKKARA_RANK
156+
{{- end }}
147157
{{- range $command := .Values.setupCommands }}
148158
{{ $command }}
149159
{{- end }}

tools/pytorchjob-generator/chart/values.schema.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@
113113
{ "type": "array" }
114114
]},
115115
"bypassCoscheduler": { "type": "boolean" },
116+
"schedulerName": { "oneOf": [
117+
{ "type": "null" },
118+
{ "type": "string", "enum": ["sakkara", "default-scheduler" ] }
119+
]},
116120
"serviceAccountName": { "oneOf" : [
117121
{ "type": "null" },
118122
{ "$ref": "#/$defs/rfc1123Label" }
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
namespace: my-namespace # namespace to deploy to (required)
2+
jobName: my-job # name of the generated AppWrapper and PyTorchJob objects (required)
3+
queueName: default-queue # local queue to submit to (default: default-queue)
4+
5+
bypassCoscheduler: true
6+
schedulerName: sakkara
7+
# If additional constraints are used, specify the configmap by uncommenting the customLabels entry below:
8+
#customLabels:
9+
# - key: sakkara.group.name
10+
# value: my-topogrp-0
11+
12+
numPods: 4 # total pod count including master and worker pods (default: 1)
13+
numCpusPerPod: 500m # requested number of cpus per pod (default: 1)
14+
numGpusPerPod: 8 # requested number of gpus per pod (default: 0)
15+
totalMemoryPerPod: 1Gi # requested amount of memory per pod (default: 1Gi)
16+
17+
priority: default-priority # default-priority (default), low-priority, or high-priority
18+
19+
# container image for the pods (required)
20+
containerImage: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
21+
22+
# setup commands to run in each pod (optional)
23+
setupCommands:
24+
- git clone https://github.com/dbarnett/python-helloworld
25+
- cd python-helloworld
26+
27+
# main program to invoke via torchrun (optional)
28+
mainProgram: helloworld.py

0 commit comments

Comments
 (0)