Commit 8d60407

Modify efa arg name and fix gpu integ test
1 parent 930eded commit 8d60407

File tree: 6 files changed, +40 additions, -40 deletions


hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py (6 additions, 6 deletions)

@@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel):
         default=None,
         description="Limit for the amount of memory in GiB",
     )
-    efa: Optional[int] = Field(
+    efa_interfaces: Optional[int] = Field(
         default=None,
         description="Number of EFA interfaces for the instance",
         ge=0
     )
-    efa_limit: Optional[int] = Field(
+    efa_interfaces_limit: Optional[int] = Field(
         default=None,
         description="Limit for the number of EFA interfaces",
         ge=0
     )
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
                 **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )
         else:
             requests_value = build_dict(
                 accelerators=str(self.accelerators) if self.accelerators else None,
                 vcpu=str(self.vcpu) if self.vcpu else None,
                 memory=str(self.memory) if self.memory else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
             )
             limits_value = build_dict(
                 accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                 vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                 memory=str(self.memory_limit) if self.memory_limit else None,
-                **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
+                **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
             )

         # Build container
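
For context, here is a minimal sketch of how the renamed fields surface on the model and feed the "vpc.amazonaws.com/efa" resource key. The simplified JobResourcesSketch class and its to_requests_and_limits helper are illustrative stand-ins, not the actual PyTorchJobConfig implementation.

# Hedged sketch (assumed simplification of PyTorchJobConfig): only the two
# renamed EFA fields plus the resource-dict construction pattern from the diff.
from typing import Optional, Tuple
from pydantic import BaseModel, Field


class JobResourcesSketch(BaseModel):
    """Illustrative stand-in for the EFA-related part of PyTorchJobConfig."""
    efa_interfaces: Optional[int] = Field(
        default=None,
        description="Number of EFA interfaces for the instance",
        ge=0,
    )
    efa_interfaces_limit: Optional[int] = Field(
        default=None,
        description="Limit for the number of EFA interfaces",
        ge=0,
    )

    def to_requests_and_limits(self) -> Tuple[dict, dict]:
        # Mirrors the **{"vpc.amazonaws.com/efa": ...} pattern in build_dict:
        # the key is only emitted when a value was actually provided.
        requests = {
            **({"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {}),
        }
        limits = {
            **({"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {}),
        }
        return requests, limits


if __name__ == "__main__":
    cfg = JobResourcesSketch(efa_interfaces=1, efa_interfaces_limit=1)
    print(cfg.to_requests_and_limits())
    # ({'vpc.amazonaws.com/efa': '1'}, {'vpc.amazonaws.com/efa': '1'})

Note that the keyword rename is breaking for callers that still pass efa= or efa_limit=; depending on the model's extra-field configuration, the old names would either be rejected or silently dropped.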

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json (2 additions, 2 deletions)

@@ -305,12 +305,12 @@
       "minimum": 0,
       "description": "Limit for the amount of memory in GiB"
     },
-    "efa": {
+    "efa_interfaces": {
       "type": "integer",
       "minimum": 0,
       "description": "Number of EFA interfaces for the instance"
     },
-    "efa_limit": {
+    "efa_interfaces_limit": {
       "type": "integer",
       "minimum": 0,
       "description": "Limit for the number of EFA interfaces"

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py (6 additions, 6 deletions)

@@ -152,15 +152,15 @@ def _process_replica_resources(cls, data):
         acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
         _validate_accelerators_inputs(instance_type, acc_req, acc_lim)

-        efa = None
+        efa_interfaces = None
         if requests.get(EFA_RESOURCE_KEY):
-            efa = int(requests.get(EFA_RESOURCE_KEY))
+            efa_interfaces = int(requests.get(EFA_RESOURCE_KEY))

-        efa_limit = None
+        efa_interfaces_limit = None
         if limits.get(EFA_RESOURCE_KEY):
-            efa_limit = int(limits.get(EFA_RESOURCE_KEY))
+            efa_interfaces_limit = int(limits.get(EFA_RESOURCE_KEY))

-        _validate_efa_inputs(instance_type, efa, efa_limit)
+        _validate_efa_inputs(instance_type, efa_interfaces, efa_interfaces_limit)

         accelerator_partition_type, accelerator_partition_count, accelerator_partition_limit = (
             _get_accelerator_partition(requests, limits)
@@ -174,7 +174,7 @@ def _process_replica_resources(cls, data):

         acc_partition_req, acc_partition_lim = _set_default_accelerator_partition_val(accelerator_partition_count, accelerator_partition_limit)

-        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa)
+        requests_values = _get_resources_from_compute_quotas(instance_type, vcpu, memory, acc_req, accelerator_partition_type, acc_partition_req, efa_interfaces)
         if requests_values is None:
             requests_values = _get_resources_from_instance(instance_type, node_count=1)
         _trim_resource_requests(instance_type, requests_values)
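
As a rough illustration of the renamed parsing step above: EFA counts are read from the Kubernetes-style requests/limits maps and handed to the validator. EFA_RESOURCE_KEY is assumed here to be "vpc.amazonaws.com/efa", consistent with the other files in this commit; the helper below is a standalone sketch, not the SDK code.

# Standalone sketch of the extraction logic shown in the hunk above.
# Assumption: EFA_RESOURCE_KEY == "vpc.amazonaws.com/efa" (matches the keys used
# in model.py and quota_allocation_util.py in this same commit).
from typing import Optional, Tuple

EFA_RESOURCE_KEY = "vpc.amazonaws.com/efa"


def extract_efa_interfaces(requests: dict, limits: dict) -> Tuple[Optional[int], Optional[int]]:
    """Pull the requested/limited EFA interface counts out of resource maps."""
    efa_interfaces = None
    if requests.get(EFA_RESOURCE_KEY):
        efa_interfaces = int(requests.get(EFA_RESOURCE_KEY))

    efa_interfaces_limit = None
    if limits.get(EFA_RESOURCE_KEY):
        efa_interfaces_limit = int(limits.get(EFA_RESOURCE_KEY))

    return efa_interfaces, efa_interfaces_limit


if __name__ == "__main__":
    reqs = {"cpu": "3", "memory": "1Gi", "vpc.amazonaws.com/efa": "1"}
    lims = {"cpu": "4", "memory": "2Gi", "vpc.amazonaws.com/efa": "1"}
    print(extract_efa_interfaces(reqs, lims))  # (1, 1)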

src/sagemaker/hyperpod/training/quota_allocation_util.py (17 additions, 17 deletions)

@@ -34,7 +34,7 @@ def _get_resources_from_compute_quotas(instance_type: str,
                                        accelerators: Optional[int] = 0,
                                        accelerator_partition_type: Optional[str] = None,
                                        accelerator_partition_count: Optional[int] = None,
-                                       efa: Optional[int] = None) -> Optional[dict]:
+                                       efa_interfaces: Optional[int] = None) -> Optional[dict]:
     has_accelerator_partition = accelerator_partition_type is not None and accelerator_partition_count is not None
     has_compute_resources = _has_compute_resource_quota_allocation_resources(memory_in_gib, vcpu, accelerators)

@@ -73,7 +73,7 @@ def _get_resources_from_compute_quotas(instance_type: str,
         result["memory"] = memory_value
         result[type_of_accelerator] = accelerators

-        efa_count = efa or instance.get("efa", 0)
+        efa_count = efa_interfaces or instance.get("efa", 0)
         if efa_count > 0:
             result["vpc.amazonaws.com/efa"] = efa_count

@@ -135,7 +135,7 @@ def _trim_resource_requests(instance_type: str, requests_values: dict) -> dict:
     return requests_values


-def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_limit: Optional[int] = None) -> dict:
+def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_limit: Optional[float], accelerators_limit: Optional[int], accelerator_partition_type: Optional[str], accelerator_partition_limit: Optional[int], efa_interfaces_limit: Optional[int] = None) -> dict:

     result = {}
     type_of_accelerator, _max_accelerator_per_instance = _get_accelerator_type_and_count(instance_type)
@@ -154,8 +154,8 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l
     if memory_in_gib_limit is not None:
         result["memory"] = str(memory_in_gib_limit) + "Gi"

-    if efa_limit is not None and efa_limit > 0:
-        result["vpc.amazonaws.com/efa"] = efa_limit
+    if efa_interfaces_limit is not None and efa_interfaces_limit > 0:
+        result["vpc.amazonaws.com/efa"] = efa_interfaces_limit

     return result

@@ -226,29 +226,29 @@ def _validate_accelerators_inputs(instance_type: str, accelerators_request: int,
         raise ValueError('Requested accelerators exceeds capacity')


-def _validate_efa_inputs(instance_type: str, efa_request: Optional[int], efa_limit: Optional[int]) -> None:
+def _validate_efa_inputs(instance_type: str, efa_interfaces: Optional[int], efa_interfaces_limit: Optional[int]) -> None:
     """Validate EFA inputs similar to accelerator validation."""
     instance = INSTANCE_RESOURCES.get(instance_type, {})
     max_efa_per_instance = instance.get("efa", 0)

     # Check if user provided EFA values but instance doesn't support EFA
-    if max_efa_per_instance == 0 and (efa_request is not None or efa_limit is not None):
+    if max_efa_per_instance == 0 and (efa_interfaces is not None or efa_interfaces_limit is not None):
         raise ValueError(
             f"Instance type {instance_type} does not support EFA, but EFA values were provided.")

     # Validate EFA values if instance supports EFA
     if max_efa_per_instance > 0:
-        if efa_request is not None and efa_limit is not None:
-            if efa_request != efa_limit:
+        if efa_interfaces is not None and efa_interfaces_limit is not None:
+            if efa_interfaces != efa_interfaces_limit:
                 raise ValueError('EFA request must equal EFA limit')
-            if efa_limit > max_efa_per_instance:
-                raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})')
-            if efa_request > max_efa_per_instance:
-                raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})')
-        elif efa_request is not None and efa_request > max_efa_per_instance:
-            raise ValueError(f'Requested EFA ({efa_request}) exceeds instance capacity ({max_efa_per_instance})')
-        elif efa_limit is not None and efa_limit > max_efa_per_instance:
-            raise ValueError(f'Requested EFA limit ({efa_limit}) exceeds instance capacity ({max_efa_per_instance})')
+            if efa_interfaces_limit > max_efa_per_instance:
+                raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')
+            if efa_interfaces > max_efa_per_instance:
+                raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces is not None and efa_interfaces > max_efa_per_instance:
+            raise ValueError(f'Requested EFA ({efa_interfaces}) exceeds instance capacity ({max_efa_per_instance})')
+        elif efa_interfaces_limit is not None and efa_interfaces_limit > max_efa_per_instance:
+            raise ValueError(f'Requested EFA limit ({efa_interfaces_limit}) exceeds instance capacity ({max_efa_per_instance})')


 def _set_default_accelerators_val(instance_type: Optional[str], accelerators_request: Optional[int], accelerators_limit: Optional[int]) -> Tuple[Optional[int], Optional[int]]:
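
To make the renamed validation rules concrete, here is a hedged, self-contained sketch of the checks _validate_efa_inputs performs: the EFA request must equal the EFA limit, and neither may exceed the instance's EFA capacity. The tiny INSTANCE_RESOURCES table below is a made-up stand-in; the real module ships its own instance catalog.

# Self-contained sketch of the EFA validation rules, not the library function itself.
# INSTANCE_RESOURCES below is a fabricated two-entry stand-in for illustration.
from typing import Optional

INSTANCE_RESOURCES = {
    "ml.p4d.24xlarge": {"efa": 4},   # assumption: an EFA-capable instance
    "ml.g5.xlarge": {"efa": 0},      # assumption: no EFA support
}


def validate_efa(instance_type: str,
                 efa_interfaces: Optional[int],
                 efa_interfaces_limit: Optional[int]) -> None:
    max_efa = INSTANCE_RESOURCES.get(instance_type, {}).get("efa", 0)

    # EFA values on a non-EFA instance are rejected outright.
    if max_efa == 0 and (efa_interfaces is not None or efa_interfaces_limit is not None):
        raise ValueError(f"Instance type {instance_type} does not support EFA, but EFA values were provided.")

    if max_efa > 0:
        if efa_interfaces is not None and efa_interfaces_limit is not None:
            if efa_interfaces != efa_interfaces_limit:
                raise ValueError("EFA request must equal EFA limit")
        # Neither the request nor the limit may exceed the instance's capacity.
        for label, value in (("EFA", efa_interfaces), ("EFA limit", efa_interfaces_limit)):
            if value is not None and value > max_efa:
                raise ValueError(f"Requested {label} ({value}) exceeds instance capacity ({max_efa})")


validate_efa("ml.p4d.24xlarge", 2, 2)          # OK
# validate_efa("ml.p4d.24xlarge", 2, 3)        # raises: request must equal limit
# validate_efa("ml.g5.xlarge", 1, None)        # raises: instance does not support EFA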

test/integration_tests/training/cli/test_gpu_quota_allocation.py (8 additions, 8 deletions)

@@ -53,8 +53,8 @@ def test_create_job_with_integer_quota_parameters(self, test_job_name):
         result = execute_command(describe_cmd)
         logger.info(f"describe result: {result}")
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4', 'memory': '2Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout

         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -103,8 +103,8 @@ def test_create_job_with_float_quota_parameters(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'cpu': '4800m', 'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '3600m', 'memory': '1Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout

         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -149,8 +149,8 @@ def test_create_job_with_only_accelerators_parameter(self, test_job_name):
         ]
         result = execute_command(describe_cmd)
         assert result.returncode == 0
-        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout

         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",
@@ -196,8 +196,8 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
         time.sleep(5)

         assert result.returncode == 0
-        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1'}" in result.stdout
-        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1'}" in result.stdout
+        assert " Limits: {'memory': '2899102924800m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout
+        assert " Requests: {'cpu': '29', 'memory': '2040109465600m', 'nvidia.com/gpu': '1', 'vpc.amazonaws.com/efa': '1'}" in result.stdout

         delete_cmd = [
             "hyp", "delete", "hyp-pytorch-job",

test/unit_tests/training/test_pytorch_job_template_model.py (1 addition, 1 deletion)

@@ -97,7 +97,7 @@ def test_user_specified_efa_overrides_default(self):
            job_name="test-custom-efa",
            image="pytorch:latest",
            accelerators=4,
-            efa=2,
+            efa_interfaces=2,
            instance_type="ml.p4d.24xlarge"
        )
