Skip to content

Commit 98e7703

Browse files
committed
address comment and add unit test for efa support
1 parent de768b4 commit 98e7703

File tree

5 files changed

+76
-27
lines changed

5 files changed

+76
-27
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
464464
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
465465
vcpu=str(self.vcpu) if self.vcpu else None,
466466
memory=str(self.memory) if self.memory else None,
467-
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
467+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
468468
)
469469
limits_value = build_dict(
470470
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
471471
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
472472
memory=str(self.memory_limit) if self.memory_limit else None,
473-
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
473+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
474474
)
475475
else:
476476
requests_value = build_dict(
477477
accelerators=str(self.accelerators) if self.accelerators else None,
478478
vcpu=str(self.vcpu) if self.vcpu else None,
479479
memory=str(self.memory) if self.memory else None,
480-
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}
480+
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
481481
)
482482
limits_value = build_dict(
483483
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
484484
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
485485
memory=str(self.memory_limit) if self.memory_limit else None,
486-
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}
486+
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
487487
)
488488

489489
# Build container

src/sagemaker/hyperpod/cli/constants/command_constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
SAGEMAKER_TRAINING_LAUNCHER_DIR = str(Path(__file__).parent.parent / "sagemaker_hyperpod_recipes")
4646
NVIDIA_GPU_RESOURCE_LIMIT_KEY = "nvidia.com/gpu"
4747
NEURON_RESOURCE_LIMIT_KEY = "aws.amazon.com/neurondevice"
48+
EFA_RESOURCE_LIMIT_KEY = "vpc.amazonaws.com/efa"
4849
AVAILABLE_ACCELERATOR_DEVICES_KEY = "AvailableAcceleratorDevices"
4950
TOTAL_ACCELERATOR_DEVICES_KEY = "TotalAcceleratorDevices"
5051
USER_NAME_LABEL_KEY = "sagemaker.user/created-by"

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
from pydantic import ConfigDict, Field
22

3-
from sagemaker.hyperpod.cli.constants.command_constants import INSTANCE_TYPE_LABEL, NEURON_RESOURCE_LIMIT_KEY, \
4-
NVIDIA_GPU_RESOURCE_LIMIT_KEY
3+
from sagemaker.hyperpod.cli.constants.command_constants import (
4+
INSTANCE_TYPE_LABEL,
5+
NEURON_RESOURCE_LIMIT_KEY,
6+
NVIDIA_GPU_RESOURCE_LIMIT_KEY,
7+
EFA_RESOURCE_LIMIT_KEY,
8+
)
59
from sagemaker.hyperpod.training.config.hyperpod_pytorch_job_unified_config import (
610
_HyperPodPytorchJob, HyperPodPytorchJobStatus
711
)
@@ -47,6 +51,7 @@
4751
TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
4852
NVIDIA_RESOURCE_KEY = NVIDIA_GPU_RESOURCE_LIMIT_KEY
4953
NEURON_RESOURCE_KEY = NEURON_RESOURCE_LIMIT_KEY
54+
EFA_RESOURCE_KEY = EFA_RESOURCE_LIMIT_KEY
5055

5156
class HyperPodPytorchJob(_HyperPodPytorchJob):
5257
"""HyperPod PyTorch job for distributed training on Amazon SageMaker HyperPod clusters.
@@ -148,12 +153,12 @@ def _process_replica_resources(cls, data):
148153
_validate_accelerators_inputs(instance_type, acc_req, acc_lim)
149154

150155
efa = None
151-
if requests.get('vpc.amazonaws.com/efa'):
152-
efa = int(requests.get('vpc.amazonaws.com/efa'))
156+
if requests.get(EFA_RESOURCE_KEY):
157+
efa = int(requests.get(EFA_RESOURCE_KEY))
153158

154159
efa_limit = None
155-
if limits.get('vpc.amazonaws.com/efa'):
156-
efa_limit = int(limits.get('vpc.amazonaws.com/efa'))
160+
if limits.get(EFA_RESOURCE_KEY):
161+
efa_limit = int(limits.get(EFA_RESOURCE_KEY))
157162

158163
_validate_efa_inputs(instance_type, efa, efa_limit)
159164

@@ -178,10 +183,7 @@ def _process_replica_resources(cls, data):
178183
elif NEURON_RESOURCE_KEY in requests_values:
179184
acc_lim = requests_values[NEURON_RESOURCE_KEY]
180185

181-
if efa is not None:
182-
requests_values["vpc.amazonaws.com/efa"] = efa
183-
184-
efa_lim = requests_values.get("vpc.amazonaws.com/efa")
186+
efa_lim = requests_values.get(EFA_RESOURCE_KEY)
185187
if efa_lim is not None:
186188
efa_lim = int(efa_lim)
187189

src/sagemaker/hyperpod/training/quota_allocation_util.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,9 @@ def _get_resources_from_compute_quotas(instance_type: str,
7373
result["memory"] = memory_value
7474
result[type_of_accelerator] = accelerators
7575

76-
if efa is not None:
77-
result["vpc.amazonaws.com/efa"] = efa
78-
else:
79-
efa_count = instance.get("efa", 0)
80-
if efa_count > 0:
81-
result["vpc.amazonaws.com/efa"] = efa_count
76+
efa_count = efa or instance.get("efa", 0)
77+
if efa_count > 0:
78+
result["vpc.amazonaws.com/efa"] = efa_count
8279

8380
else:
8481
result["cpu"] = vcpu or 0

test/unit_tests/training/test_pytorch_job_template_model.py

Lines changed: 56 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,22 +45,71 @@ class TestPyTorchJobConfigEFA(unittest.TestCase):
4545
# # Should also have GPU resources
4646
# self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8")
4747

48-
def test_no_node_count_no_efa(self):
49-
"""Test that jobs without node_count don't get EFA resources"""
48+
def test_instance_without_efa_support_no_efa(self):
49+
"""Test that instances without EFA support don't get EFA (ml.g5.xlarge doesn't support EFA)"""
50+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
51+
5052
config = PyTorchJobConfig(
51-
job_name="test-no-node-count",
53+
job_name="test-no-efa-support",
5254
image="pytorch:latest",
5355
accelerators=1,
5456
instance_type="ml.g5.xlarge"
5557
)
56-
58+
5759
job = config.to_domain()
58-
container = job.replicaSpecs[0].template.spec.containers[0]
59-
60-
# Should not have EFA resources
60+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
61+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
62+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
63+
64+
# Should not have EFA resources (instance doesn't support it)
6165
self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
6266
self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
6367

68+
# Should have GPU resources
69+
self.assertIn("nvidia.com/gpu", container.resources.requests)
70+
71+
def test_accelerators_with_efa_support_gets_default_efa(self):
72+
"""Test that specifying accelerators on EFA-capable instance gets EFA from constants"""
73+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
74+
75+
config = PyTorchJobConfig(
76+
job_name="test-accelerators-default-efa",
77+
image="pytorch:latest",
78+
accelerators=4,
79+
instance_type="ml.p4d.24xlarge"
80+
)
81+
82+
job = config.to_domain()
83+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
84+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
85+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
86+
87+
# Should have EFA from constants
88+
self.assertIn("vpc.amazonaws.com/efa", container.resources.requests)
89+
self.assertIn("vpc.amazonaws.com/efa", container.resources.limits)
90+
self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 4)
91+
92+
def test_user_specified_efa_overrides_default(self):
93+
"""Test that user-specified EFA value overrides the default from constants"""
94+
from sagemaker.hyperpod.training.hyperpod_pytorch_job import HyperPodPytorchJob
95+
96+
config = PyTorchJobConfig(
97+
job_name="test-custom-efa",
98+
image="pytorch:latest",
99+
accelerators=4,
100+
efa=2,
101+
instance_type="ml.p4d.24xlarge"
102+
)
103+
104+
job = config.to_domain()
105+
# Call allocate_quotas_if_applicable to convert generic keys to actual resource keys
106+
job_with_resources = HyperPodPytorchJob.allocate_quotas_if_applicable(job)
107+
container = job_with_resources.replicaSpecs[0].template.spec.containers[0]
108+
109+
# Should use user-specified EFA value
110+
self.assertEqual(int(container.resources.requests["vpc.amazonaws.com/efa"]), 2)
111+
self.assertEqual(int(container.resources.limits["vpc.amazonaws.com/efa"]), 2)
112+
64113
# def test_multi_node_with_memory_and_cpu(self):
65114
# """Test EFA with other resource types"""
66115
# config = PyTorchJobConfig(

0 commit comments

Comments (0)