@@ -110,18 +110,18 @@ def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_
110110
111111 def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_half (self ):
112112 result = _get_resources_from_compute_quotas ("ml.g6e.48xlarge" , None , None , 4 )
113- # ml.g5.xlarge has 8 GPU, 192 CPUs, 1536GiB memory
114- assert result == {"cpu" : "96.0" , "memory" : "768.0Gi" , "nvidia.com/gpu" : 4 }
113+ # ml.g6e.48xlarge has 8 GPUs, 192 CPUs, 1536GiB memory, 4 EFA
114+ assert result == {"cpu" : "96.0" , "memory" : "768.0Gi" , "nvidia.com/gpu" : 4 , "vpc.amazonaws.com/efa" : 4 }
115115
116116 def test_get_resources_from_compute_quotas_gpu_instance_all_params (self ):
117117 result = _get_resources_from_compute_quotas ("ml.g5.xlarge" , 2.0 , 8.0 , 1 )
118118 assert result == {"cpu" : "2.0" , "memory" : "8.0Gi" , "nvidia.com/gpu" : 1 }
119119
120120 def test_get_resources_from_compute_quotas_trainium_instance (self ):
121121 result = _get_resources_from_compute_quotas ("ml.trn1.32xlarge" , None , None , 8 )
122- # ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory
122+ # ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GiB memory, 8 EFA
123123 # 8 trainium is half, so we should get half of CPU and memory
124- assert result == {"cpu" : "64.0" , "memory" : "256.0Gi" , "aws.amazon.com/neurondevice" : 8 }
124+ assert result == {"cpu" : "64.0" , "memory" : "256.0Gi" , "aws.amazon.com/neurondevice" : 8 , "vpc.amazonaws.com/efa" : 8 }
125125
126126 def test_get_resources_from_compute_quotas_cpu_only_instance (self ):
127127 result = _get_resources_from_compute_quotas ("ml.c5.large" , 1.0 , 2.0 , 1 )
@@ -142,14 +142,15 @@ def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self):
142142 @pytest .mark .parametrize (
143143 "instance_type,node_count,expected" ,
144144 [
145- # GPU instances
146- ("ml.p4d.24xlarge" , 1 , {"cpu" : "96" , "memory" : "1152Gi" , "nvidia.com/gpu" : 8 }),
147- ("ml.p4d.24xlarge" , 2 , {"cpu" : "192" , "memory" : "2304Gi" , "nvidia.com/gpu" : 16 }),
145+ # GPU instances with EFA support
146+ ("ml.p4d.24xlarge" , 1 , {"cpu" : "96" , "memory" : "1152Gi" , "nvidia.com/gpu" : 8 , "vpc.amazonaws.com/efa" : 4 }),
147+ ("ml.p4d.24xlarge" , 2 , {"cpu" : "192" , "memory" : "2304Gi" , "nvidia.com/gpu" : 16 , "vpc.amazonaws.com/efa" : 4 }),
148+ # GPU instances without EFA support
148149 ("ml.g5.xlarge" , 1 , {"cpu" : "4" , "memory" : "16Gi" , "nvidia.com/gpu" : 1 }),
149150 ("ml.g5.xlarge" , 3 , {"cpu" : "12" , "memory" : "48Gi" , "nvidia.com/gpu" : 3 }),
150151 # Trainium instances
151- ("ml.trn1.32xlarge" , 1 , {"cpu" : "128" , "memory" : "512Gi" , "aws.amazon.com/neurondevice" : 16 }),
152- ("ml.trn1.32xlarge" , 2 , {"cpu" : "256" , "memory" : "1024Gi" , "aws.amazon.com/neurondevice" : 32 }),
152+ ("ml.trn1.32xlarge" , 1 , {"cpu" : "128" , "memory" : "512Gi" , "aws.amazon.com/neurondevice" : 16 , "vpc.amazonaws.com/efa" : 8 }),
153+ ("ml.trn1.32xlarge" , 2 , {"cpu" : "256" , "memory" : "1024Gi" , "aws.amazon.com/neurondevice" : 32 , "vpc.amazonaws.com/efa" : 8 }),
153154 # CPU-only instances
154155 ("ml.c5.large" , 1 , {"cpu" : "2" , "memory" : "4Gi" }),
155156 ("ml.c5.large" , 5 , {"cpu" : "10" , "memory" : "20Gi" }),
0 commit comments