File tree Expand file tree Collapse file tree 3 files changed +44
-2
lines changed
tools/pytorchjob-generator Expand file tree Collapse file tree 3 files changed +44
-2
lines changed Original file line number Diff line number Diff line change @@ -35,7 +35,7 @@ metadata:
3535terminationGracePeriodSeconds: { { .Values.terminationGracePeriodSeconds } }
3636{ {- end } }
3737{ {- if .Values.bypassCoscheduler } }
38- schedulerName: default-scheduler
38+ schedulerName: { { .Values.schedulerName } }
3939{ {- end } }
4040priorityClassName: { { .Values.priority } }
4141affinity:
@@ -81,8 +81,14 @@ envFrom:
8181 - configMapRef:
8282 name: { { .Values.ncclGdrEnvConfigMap } }
8383{ {- end } }
84- { {- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap } }
84+ {{- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap (eq .Values.schedulerName "sakkara") }}
8585env:
86+ {{- if eq .Values.schedulerName "sakkara" }}
87+ - name: SAKKARA_RANK
88+ valueFrom:
89+ fieldRef:
90+ fieldPath: metadata.labels['sakkara.member.rank']
91+ { {- end } }
8692 { {- if .Values.topologyFileConfigMap } }
8793 - name: NCCL_TOPO_FILE
8894 value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -146,6 +152,10 @@ command:
146152 #
147153 # User commands
148154 #
155+ {{- if eq .Values.schedulerName "sakkara" }}
156+ echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
157+ export RANK=$SAKKARA_RANK
158+ { {- end } }
149159 { {- range $command := .Values.setupCommands } }
150160 { { $command } }
151161 { {- end } }
Original file line number Diff line number Diff line change 115115 { "type" : " array" }
116116 ]},
117117 "bypassCoscheduler" : { "type" : " boolean" },
118+ "schedulerName" : { "oneOf" : [
119+ { "type" : " null" },
120+ { "type": "string", "enum": ["sakkara", "default-scheduler"] }
121+ ]},
118122 "serviceAccountName" : { "oneOf" : [
119123 { "type" : " null" },
120124 { "$ref" : " #/$defs/rfc1123Label" }
Original file line number Diff line number Diff line change 1+ namespace : my-namespace # namespace to deploy to (required)
2+ jobName : my-job # name of the generated AppWrapper and PyTorchJob objects (required)
3+ queueName : default-queue # local queue to submit to (default: default-queue)
4+
5+ bypassCoscheduler : true
6+ schedulerName : sakkara
7+ # If additional constraints are used, specify the configmap here:
8+ # customLabels:
9+ # - key: sakkara.group.name
10+ # value: my-topogrp-0
11+
12+ numPods : 4 # total pod count including master and worker pods (default: 1)
13+ numCpusPerPod : 500m # requested number of cpus per pod (default: 1)
14+ numGpusPerPod : 8 # requested number of gpus per pod (default: 0)
15+ totalMemoryPerPod : 1Gi # requested amount of memory per pod (default: 1Gi)
16+
17+ priority : default-priority # default-priority (default), low-priority, or high-priority
18+
19+ # container image for the pods (required)
20+ containerImage : ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
21+
22+ # setup commands to run in each pod (optional)
23+ setupCommands :
24+ - git clone https://github.com/dbarnett/python-helloworld
25+ - cd python-helloworld
26+
27+ # main program to invoke via torchrun (optional)
28+ mainProgram : helloworld.py
You can’t perform that action at this time.
0 commit comments