Skip to content

Commit 930eded

Browse files
committed
fix: add EFA check in quota allocation test
1 parent 98e7703 commit 930eded

File tree

1 file changed

+10
-9
lines changed

1 file changed

+10
-9
lines changed

test/unit_tests/cli/test_quota_allocation_util.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,18 +110,18 @@ def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_
110110

111111
def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_half(self):
112112
result = _get_resources_from_compute_quotas("ml.g6e.48xlarge", None, None, 4)
113-
# ml.g5.xlarge has 8 GPU, 192 CPUs, 1536GiB memory
114-
assert result == {"cpu": "96.0", "memory": "768.0Gi", "nvidia.com/gpu": 4}
113+
# ml.g6e.48xlarge has 8 GPU, 192 CPUs, 1536GiB memory, 4 EFA
114+
assert result == {"cpu": "96.0", "memory": "768.0Gi", "nvidia.com/gpu": 4, "vpc.amazonaws.com/efa": 4}
115115

116116
def test_get_resources_from_compute_quotas_gpu_instance_all_params(self):
117117
result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, 8.0, 1)
118118
assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 1}
119119

120120
def test_get_resources_from_compute_quotas_trainium_instance(self):
121121
result = _get_resources_from_compute_quotas("ml.trn1.32xlarge", None, None, 8)
122-
# ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory
122+
# ml.trn1.32xlarge has 16 trainium, 128 CPUs, 512GB memory, 8 EFA
123123
# 8 trainium is half, so we should get half of CPU and memory
124-
assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8}
124+
assert result == {"cpu": "64.0", "memory": "256.0Gi", "aws.amazon.com/neurondevice": 8, "vpc.amazonaws.com/efa": 8}
125125

126126
def test_get_resources_from_compute_quotas_cpu_only_instance(self):
127127
result = _get_resources_from_compute_quotas("ml.c5.large", 1.0, 2.0, 1)
@@ -142,14 +142,15 @@ def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self):
142142
@pytest.mark.parametrize(
143143
"instance_type,node_count,expected",
144144
[
145-
# GPU instances
146-
("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8}),
147-
("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16}),
145+
# GPU instances with EFA support
146+
("ml.p4d.24xlarge", 1, {"cpu": "96", "memory": "1152Gi", "nvidia.com/gpu": 8, "vpc.amazonaws.com/efa": 4}),
147+
("ml.p4d.24xlarge", 2, {"cpu": "192", "memory": "2304Gi", "nvidia.com/gpu": 16, "vpc.amazonaws.com/efa": 4}),
148+
# GPU instances without EFA support
148149
("ml.g5.xlarge", 1, {"cpu": "4", "memory": "16Gi", "nvidia.com/gpu": 1}),
149150
("ml.g5.xlarge", 3, {"cpu": "12", "memory": "48Gi", "nvidia.com/gpu": 3}),
150151
# Trainium instances
151-
("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16}),
152-
("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32}),
152+
("ml.trn1.32xlarge", 1, {"cpu": "128", "memory": "512Gi", "aws.amazon.com/neurondevice": 16, "vpc.amazonaws.com/efa": 8}),
153+
("ml.trn1.32xlarge", 2, {"cpu": "256", "memory": "1024Gi", "aws.amazon.com/neurondevice": 32, "vpc.amazonaws.com/efa": 8}),
153154
# CPU-only instances
154155
("ml.c5.large", 1, {"cpu": "2", "memory": "4Gi"}),
155156
("ml.c5.large", 5, {"cpu": "10", "memory": "20Gi"}),

0 commit comments

Comments
 (0)