Skip to content

Commit 85bf6a4

Browse files
authored
Add efa support in manifest for training jobs (#345)
* Update documentation for elastic training arguments
* nit: Add detailed descriptions for array type
* Add EFA support for training jobs
* Address comments and add unit test for EFA support
* fix: add EFA check in quota allocation test
* Modify EFA arg name and fix GPU integ test
1 parent a824151 commit 85bf6a4

File tree

10 files changed

+285
-154
lines changed

10 files changed

+285
-154
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,16 @@ class PyTorchJobConfig(BaseModel):
195195
default=None,
196196
description="Limit for the amount of memory in GiB",
197197
)
198+
efa_interfaces: Optional[int] = Field(
199+
default=None,
200+
description="Number of EFA interfaces for the instance",
201+
ge=0
202+
)
203+
efa_interfaces_limit: Optional[int] = Field(
204+
default=None,
205+
description="Limit for the number of EFA interfaces",
206+
ge=0
207+
)
198208
accelerator_partition_type: Optional[str] = Field(
199209
default=None,
200210
description="Type of accelerator partition"
@@ -453,23 +463,27 @@ def build_dict(**kwargs):
453463
requests_value = build_dict(
454464
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
455465
vcpu=str(self.vcpu) if self.vcpu else None,
456-
memory=str(self.memory) if self.memory else None
466+
memory=str(self.memory) if self.memory else None,
467+
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
457468
)
458469
limits_value = build_dict(
459470
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
460471
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
461-
memory=str(self.memory_limit) if self.memory_limit else None
472+
memory=str(self.memory_limit) if self.memory_limit else None,
473+
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
462474
)
463475
else:
464476
requests_value = build_dict(
465477
accelerators=str(self.accelerators) if self.accelerators else None,
466478
vcpu=str(self.vcpu) if self.vcpu else None,
467-
memory=str(self.memory) if self.memory else None
479+
memory=str(self.memory) if self.memory else None,
480+
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
468481
)
469482
limits_value = build_dict(
470483
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
471484
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
472-
memory=str(self.memory_limit) if self.memory_limit else None
485+
memory=str(self.memory_limit) if self.memory_limit else None,
486+
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
473487
)
474488

475489
# Build container

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,16 @@
305305
"minimum": 0,
306306
"description": "Limit for the amount of memory in GiB"
307307
},
308+
"efa_interfaces": {
309+
"type": "integer",
310+
"minimum": 0,
311+
"description": "Number of EFA interfaces for the instance"
312+
},
313+
"efa_interfaces_limit": {
314+
"type": "integer",
315+
"minimum": 0,
316+
"description": "Limit for the number of EFA interfaces"
317+
},
308318
"accelerator_partition_type": {
309319
"type": "string",
310320
"enum": [

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,14 @@
9797
{%- if memory %}
9898
memory: {{ memory }}Gi
9999
{%- endif %}
100-
{%- if (node_count and node_count > 1) %}
101-
vpc.amazonaws.com/efa: 1
100+
{%- if efa and efa > 0 %}
101+
vpc.amazonaws.com/efa: {{ efa }}
102102
{%- endif %}
103103
{%- else %}
104104
requests:
105105
nvidia.com/gpu: "0"
106106
{%- endif %}
107-
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit %}
107+
{%- if accelerator_partition_limit or accelerators_limit or vcpu_limit or memory_limit or efa_limit %}
108108
limits:
109109
{%- if accelerator_partition_type and accelerator_partition_limit %}
110110
nvidia.com/{{ accelerator_partition_type }}: {{ accelerator_partition_limit }}
@@ -117,8 +117,8 @@
117117
{%- if memory_limit %}
118118
memory: {{ memory_limit }}Gi
119119
{%- endif %}
120-
{%- if (node_count and node_count > 1) %}
121-
vpc.amazonaws.com/efa: 1
120+
{%- if efa_limit and efa_limit > 0 %}
121+
vpc.amazonaws.com/efa: {{ efa_limit }}
122122
{%- endif %}
123123
{%- else %}
124124
limits:

src/sagemaker/hyperpod/cli/constants/command_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
4646
NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
4747
NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
48+
EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
4849
AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
4950
TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
5051
USER_NAME_LABEL_KEY = "sagemaker.user/created-by"

0 commit comments

Comments (0)