Skip to content

Commit de768b4

Browse files
committed
Add efa support for training jobs
1 parent 9521c41 commit de768b4

File tree

6 files changed

+209
-128
lines changed

6 files changed

+209
-128
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -195,6 +195,16 @@ class PyTorchJobConfig(BaseModel):
195195
default=None,
196196
description="Limit for the amount of memory in GiB",
197197
)
198+
efa: Optional[int] = Field(
199+
default=None,
200+
description="Number of EFA interfaces for the instance",
201+
ge=0
202+
)
203+
efa_limit: Optional[int] = Field(
204+
default=None,
205+
description="Limit for the number of EFA interfaces",
206+
ge=0
207+
)
198208
accelerator_partition_type: Optional[str] = Field(
199209
default=None,
200210
description="Type of accelerator partition"
@@ -453,23 +463,27 @@ def build_dict(**kwargs):
453463
requests_value = build_dict(
454464
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
455465
vcpu=str(self.vcpu) if self.vcpu else None,
456-
memory=str(self.memory) if self.memory else None
466+
memory=str(self.memory) if self.memory else None,
467+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
457468
)
458469
limits_value = build_dict(
459470
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
460471
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
461-
memory=str(self.memory_limit) if self.memory_limit else None
472+
memory=str(self.memory_limit) if self.memory_limit else None,
473+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
462474
)
463475
else:
464476
requests_value = build_dict(
465477
accelerators=str(self.accelerators) if self.accelerators else None,
466478
vcpu=str(self.vcpu) if self.vcpu else None,
467-
memory=str(self.memory) if self.memory else None
479+
memory=str(self.memory) if self.memory else None,
480+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
468481
)
469482
limits_value = build_dict(
470483
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
471484
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
472-
memory=str(self.memory_limit) if self.memory_limit else None
485+
memory=str(self.memory_limit) if self.memory_limit else None,
486+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
473487
)
474488

475489
# Build container

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -305,6 +305,16 @@
305305
"minimum": 0,
306306
"description": "Limit for the amount of memory in GiB"
307307
},
308+
"efa": {
309+
"type": "integer",
310+
"minimum": 0,
311+
"description": "Number of EFA interfaces for the instance"
312+
},
313+
"efa_limit": {
314+
"type": "integer",
315+
"minimum": 0,
316+
"description": "Limit for the number of EFA interfaces"
317+
},
308318
"accelerator_partition_type": {
309319
"type": "string",
310320
"enum": [

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -97,14 +97,14 @@
9797
{%- if memory %}
9898
memory: {{ memory }}Gi
9999
{%- endif %}
100-
{%- if (node_count and node_count > 1) %}
101-
vpc.amazonaws.com/efa: 1
100+
{%- if efa and efa > 0 %}
101+
vpc.amazonaws.com/efa: {{ efa }}
102102
{%- endif %}
103103
{%- else %}
104104
requests:
105105
nvidia.com/gpu: "0"
106106
{%- endif %}
107-
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %}
107+
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit%}
108108
limits:
109109
{%- if accelerator_partition_type and accelerator_partition_limit %}
110110
nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }}
@@ -117,8 +117,8 @@
117117
{%- if memory_limit %}
118118
memory: {{ memory_limit }}Gi
119119
{%- endif %}
120-
{%- if (node_count and node_count > 1) %}
121-
vpc.amazonaws.com/efa: 1
120+
{%- if efa_limit and efa_limit > 0 %}
121+
vpc.amazonaws.com/efa: {{ efa_limit }}
122122
{%- endif %}
123123
{%- else %}
124124
limits:

0 commit comments

Comments
 (0)