File tree Expand file tree Collapse file tree 3 files changed +44
-2
lines changed
tools/pytorchjob-generator Expand file tree Collapse file tree 3 files changed +44
-2
lines changed Original file line number Diff line number Diff line change @@ -33,7 +33,7 @@ annotations:
3333terminationGracePeriodSeconds: { { .Values.terminationGracePeriodSeconds } }
3434{ {- end } }
3535{ {- if .Values.bypassCoscheduler } }
36- schedulerName: default-scheduler
36+ schedulerName: { { .Values.schedulerName } }
3737{ {- end } }
3838priorityClassName: { { .Values.priority } }
3939affinity:
@@ -79,8 +79,14 @@ envFrom:
7979 - configMapRef:
8080 name: { { .Values.ncclGdrEnvConfigMap } }
8181{ {- end } }
82- { {- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap } }
82+ { {- if or .Values.environmentVariables .Values.sshGitCloneConfig .Values.mountNVMe .Values.topologyFileConfigMap ( eq .Values.schedulerName "sakkara" ) } }
8383env:
84+ { {- if eq .Values.schedulerName "sakkara" } }
85+ - name: SAKKARA_RANK
86+ valueFrom:
87+ fieldRef:
88+ fieldPath: metadata.labels['sakkara.member.rank']
89+ { {- end } }
8490 { {- if .Values.topologyFileConfigMap } }
8591 - name: NCCL_TOPO_FILE
8692 value: /var/run/nvidia-topologyd/virtualTopology.xml
@@ -144,6 +150,10 @@ command:
144150 #
145151 # User commands
146152 #
153+ { {- if eq .Values.schedulerName "sakkara" } }
154+ echo "Sakkara is enabled: using Sakkara-assigned rank instead of the default PyTorchJob rank"
155+ export RANK=$SAKKARA_RANK
156+ { {- end } }
147157 { {- range $command := .Values.setupCommands } }
148158 { { $command } }
149159 { {- end } }
Original file line number Diff line number Diff line change 113113 { "type" : " array" }
114114 ]},
115115 "bypassCoscheduler" : { "type" : " boolean" },
116+ "schedulerName" : { "oneOf" : [
117+ { "type" : "null" },
118+ { "type" : "string" , "enum" : ["sakkara" , "default-scheduler"] }
119+ ]},
116120 "serviceAccountName" : { "oneOf" : [
117121 { "type" : " null" },
118122 { "$ref" : " #/$defs/rfc1123Label" }
Original file line number Diff line number Diff line change 1+ namespace : my-namespace # namespace to deploy to (required)
2+ jobName : my-job # name of the generated AppWrapper and PyTorchJob objects (required)
3+ queueName : default-queue # local queue to submit to (default: default-queue)
4+
5+ bypassCoscheduler : true
6+ schedulerName : sakkara
7+ # If additional constraints are used, specify the configmap here:
8+ # customLabels:
9+ # - key: sakkara.group.name
10+ # value: my-topogrp-0
11+
12+ numPods : 4 # total pod count including master and worker pods (default: 1)
13+ numCpusPerPod : 500m # requested number of cpus per pod (default: 1)
14+ numGpusPerPod : 8 # requested number of gpus per pod (default: 0)
15+ totalMemoryPerPod : 1Gi # requested amount of memory per pod (default: 1Gi)
16+
17+ priority : default-priority # default-priority (default), low-priority, or high-priority
18+
19+ # container image for the pods (required)
20+ containerImage : ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
21+
22+ # setup commands to run in each pod (optional)
23+ setupCommands :
24+ - git clone https://github.com/dbarnett/python-helloworld
25+ - cd python-helloworld
26+
27+ # main program to invoke via torchrun (optional)
28+ mainProgram : helloworld.py
You can’t perform that action at this time.
0 commit comments