@@ -85,6 +85,14 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+# Kubernetes reserves a small amount of resources per host for the system. For
+# TorchX we always assume the entire host is being requested so we adjust the
+# requested numbers to account for the node reserved resources.
+#
+# https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/
+RESERVED_MILLICPU = 100
+RESERVED_MEMMB = 1024
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
@@ -152,6 +160,8 @@
 
 ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
+LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -176,21 +186,35 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
         V1EmptyDirVolumeSource,
     )
 
+    # limits puts an upper cap on the resources a pod may consume.
+    # requests is how much the scheduler allocates. We assume that the jobs will
+    # be allocated the whole machine so requests is slightly lower than the
+    # requested resources to account for the Kubernetes node reserved resources.
+    limits = {}
     requests = {}
 
     resource = role.resource
-    if resource.cpu >= 0:
-        requests["cpu"] = f"{int(resource.cpu * 1000)}m"
-    if resource.memMB >= 0:
-        requests["memory"] = f"{int(resource.memMB)}M"
-    if resource.gpu >= 0:
-        requests["nvidia.com/gpu"] = str(resource.gpu)
+    if resource.cpu > 0:
+        mcpu = int(resource.cpu * 1000)
+        limits["cpu"] = f"{mcpu}m"
+        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        requests["cpu"] = f"{request_mcpu}m"
+    if resource.memMB > 0:
+        limits["memory"] = f"{int(resource.memMB)}M"
+        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        requests["memory"] = f"{request_memMB}M"
+    if resource.gpu > 0:
+        requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
     resources = V1ResourceRequirements(
-        limits=requests,
+        limits=limits,
         requests=requests,
     )
 
+    node_selector: Dict[str, str] = {}
+    if LABEL_INSTANCE_TYPE in resource.capabilities:
+        node_selector[LABEL_INSTANCE_TYPE] = resource.capabilities[LABEL_INSTANCE_TYPE]
+
     # To support PyTorch dataloaders we need to set /dev/shm to larger than the
     # 64M default so we mount an unlimited sized tmpfs directory on it.
     SHM_VOL = "dshm"
@@ -264,6 +288,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
             restart_policy="Never",
             service_account_name=service_account,
             volumes=volumes,
+            node_selector=node_selector,
         ),
         metadata=V1ObjectMeta(
             annotations={
@@ -416,6 +441,29 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
 
     External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
 
+    **Resources / Allocation**
+
+    To select a specific machine type you can add a capability to your resources
+    with ``node.kubernetes.io/instance-type`` which will constrain the launched
+    jobs to nodes of that instance type.
+
+    >>> from torchx import specs
+    >>> specs.Resource(
+    ...     cpu=4,
+    ...     memMB=16000,
+    ...     gpu=2,
+    ...     capabilities={
+    ...         "node.kubernetes.io/instance-type": "<cloud instance type>",
+    ...     },
+    ... )
+    Resource(...)
+
+    Kubernetes may reserve some CPU and memory for the host. TorchX assumes you're
+    scheduling on whole hosts and thus will automatically reduce the resource
+    request by a small amount to account for the node reserved CPU and memory.
+    If you run into scheduling issues you may need to reduce the requested CPU
+    and memory from the host values.
+
     **Compatibility**
 
     .. compatibility::
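
To make the adjustment above concrete, here is a minimal sketch (an illustration only, not part of the commit) of the limits/requests arithmetic that role_to_pod now performs, applied to a hypothetical whole-host resource of 4 CPUs and 16000 MB of memory and using the RESERVED_MILLICPU / RESERVED_MEMMB values introduced above:

    # Sketch of the reserved-resource adjustment for a hypothetical 4 CPU / 16000 MB host.
    RESERVED_MILLICPU = 100  # per-node CPU reservation assumed by this change
    RESERVED_MEMMB = 1024    # per-node memory reservation assumed by this change

    cpu, memMB = 4, 16000    # hypothetical resource spec (whole host)

    mcpu = int(cpu * 1000)
    limits = {"cpu": f"{mcpu}m", "memory": f"{int(memMB)}M"}
    requests = {
        "cpu": f"{max(mcpu - RESERVED_MILLICPU, 0)}m",        # "3900m"
        "memory": f"{max(int(memMB) - RESERVED_MEMMB, 0)}M",  # "14976M"
    }

    print(limits)    # {'cpu': '4000m', 'memory': '16000M'}
    print(requests)  # {'cpu': '3900m', 'memory': '14976M'}

The pod's limits still reflect the full host while the scheduler-facing requests sit just below it, which is what lets a job that asks for an entire node fit once the kubelet's system reservations are subtracted.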