Skip to content

Commit 98e7703

Browse files
committed
address comment and add unit test for efa support
1 parent de768b4 commit 98e7703

File tree

5 files changed

+76
-27
lines changed

5 files changed

+76
-27
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
464464
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
465465
vcpu=str(self.vcpu) if self.vcpu else None,
466466
memory=str(self.memory) if self.memory else None,
467-
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
467+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
468468
)
469469
limits_value = build_dict(
470470
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
471471
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
472472
memory=str(self.memory_limit) if self.memory_limit else None,
473-
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
473+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
474474
)
475475
else:
476476
requests_value = build_dict(
477477
accelerators=str(self.accelerators) if self.accelerators else None,
478478
vcpu=str(self.vcpu) if self.vcpu else None,
479479
memory=str(self.memory) if self.memory else None,
480-
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
480+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
481481
)
482482
limits_value = build_dict(
483483
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
484484
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
485485
memory=str(self.memory_limit) if self.memory_limit else None,
486-
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
486+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
487487
)
488488

489489
# Build container

src/sagemaker/hyperpod/cli/constants/command_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
4646
NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
4747
NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
48+
EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
4849
AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
4950
TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
5051
USER_NAME_LABEL_KEY = "sagemaker.user/created-by"

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
from pydantic import ConfigDict, Field
22

3-
from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
4-
NVIDIA_GPU_RESOURCE_LIMIT_KEY
3+
from sagemaker.hyperpod.cli.constants.command_constants import (
4+
INSTANCE_TYPE_LABEL,
5+
NEURON_RESOURCE_LIMIT_KEY,
6+
NVIDIA_GPU_RESOURCE_LIMIT_KEY,
7+
EFA_RESOURCE_LIMIT_KEY,
8+
)
59
from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
610
_HyperPodPytorchJob, HyperPodPytorchJobStatus
711
)
@@ -47,6 +51,7 @@
4751
TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
4852
NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
4953
NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
54+
EFA_RESOURCE_KEY = EFA_RESOURCE_LIMIT_KEY
5055

5156
class HyperPodPytorchJob(_HyperPodPytorchJob):
5257
"""HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.
@@ -148,12 +153,12 @@ def _process_replica_resources(cls, data):
148153
_validate_accelerators_inputs(instance_type, acc_req, acc_lim)
149154

150155
efa = None
151-
if requests.get('vpc.amazonaws.com/efa'):
152-
efa = int(requests.get('vpc.amazonaws.com/efa'))
156+
if requests.get(EFA_RESOURCE_KEY):
157+
efa = int(requests.get(EFA_RESOURCE_KEY))
153158

154159
efa_limit = None
155-
if limits.get('vpc.amazonaws.com/efa'):
156-
efa_limit = int(limits.get('vpc.amazonaws.com/efa'))
160+
if limits.get(EFA_RESOURCE_KEY):
161+
efa_limit = int(limits.get(EFA_RESOURCE_KEY))
157162

158163
_validate_efa_inputs(instance_type, efa, efa_limit)
159164

@@ -178,10 +183,7 @@ def _process_replica_resources(cls, data):
178183
elif NEURON_RESOURCE_KEY in requests_values:
179184
acc_lim = requests_values[NEURON_RESOURCE_KEY]
180185

181-
if efa is not None:
182-
requests_values["vpc.amazonaws.com/efa"] = efa
183-
184-
efa_lim = requests_values.get("vpc.amazonaws.com/efa")
186+
efa_lim = requests_values.get(EFA_RESOURCE_KEY)
185187
if efa_lim is not None:
186188
efa_lim = int(efa_lim)
187189

src/sagemaker/hyperpod/training/quota_allocation_util.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,9 @@ def _get_resources_from_compute_quotas(instance_type: str,
7373
result["memory"] = memory_value
7474
result[type_of_accelerator] = accelerators
7575

76-
if efa is not None:
77-
result["vpc.amazonaws.com/efa"] = efa
78-
else:
79-
efa_count = instance.get("efa", 0)
80-
if efa_count > 0:
81-
result["vpc.amazonaws.com/efa"] = efa_count
76+
efa_count = efa or instance.get("efa", 0)
77+
if efa_count > 0:
78+
result["vpc.amazonaws.com/efa"] = efa_count
8279

8380
else:
8481
result["cpu"] = vcpu or 0

test/unit_tests/training/test_pytorch_job_template_model.py

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,22 +45,71 @@ class TestPyTorchJobConfigEFA(unittest.TestCase):
4545
# # Should also have GPU resources
4646
# self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8")
4747

48-
def test_no_node_count_no_efa(self):
49-
"""Test that jobs without node_count don't get EFA resources"""
48+
def test_instance_without_efa_support_no_efa(self):
49+
"""Test that instances without EFA support don't get EFA (ml.g5.xlarge doesn't support EFA)"""
50+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
51+
5052
config = PyTorchJobConfig(
51-
job_name="test-no-node-count",
53+
job_name="test-no-efa-support",
5254
image="pytorch:latest",
5355
accelerators=1,
5456
instance_type="ml.g5.xlarge"
5557
)
56-
58+
5759
job = config.to_domain()
58-
container = job.replicaSpecs[0].template.spec.containers[0]
59-
60-
# Should not have EFA resources
60+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
61+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
62+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
63+
64+
# Should not have EFA resources (instance doesn't support it)
6165
self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
6266
self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
6367

68+
# Should have GPU resources
69+
self.assertIn("nvidia.com/gpu", container.resources.requests)
70+
71+
def test_accelerators_with_efa_support_gets_default_efa(self):
72+
"""Test that specifying accelerators on EFA-capable instance gets EFA from constants"""
73+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
74+
75+
config = PyTorchJobConfig(
76+
job_name="test-accelerators-default-efa",
77+
image="pytorch:latest",
78+
accelerators=4,
79+
instance_type="ml.p4d.24xlarge"
80+
)
81+
82+
job = config.to_domain()
83+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
84+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
85+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
86+
87+
# Should have EFA from constants
88+
self.assertIn("vpc.amazonaws.com/efa", container.resources.requests)
89+
self.assertIn("vpc.amazonaws.com/efa", container.resources.limits)
90+
self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 4)
91+
92+
def test_user_specified_efa_overrides_default(self):
93+
"""Test that user-specified EFA value overrides the default from constants"""
94+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
95+
96+
config = PyTorchJobConfig(
97+
job_name="test-custom-efa",
98+
image="pytorch:latest",
99+
accelerators=4,
100+
efa=2,
101+
instance_type="ml.p4d.24xlarge"
102+
)
103+
104+
job = config.to_domain()
105+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
106+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
107+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
108+
109+
# Should use user-specified EFA value
110+
self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 2)
111+
self.assertEqual(int(container.resources.limits["vpc.amazonaws.com/efa"]), 2)
112+
64113
# def test_multi_node_with_memory_and_cpu(self):
65114
# """Test EFA with other resource types"""
66115
# config = PyTorchJobConfig(

0 commit comments

Comments (0)