Skip to content

Commit 6ed98af

Browse files
author
Sean Archer
committed
Increased reserved resource amounts
1 parent 54954c7 commit 6ed98af

File tree

4 files changed

+40
-41
lines changed

4 files changed

+40
-41
lines changed

.gitignore

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@ doc/_build/
2323
/sagemaker-hyperpod/.coverage
2424
/sagemaker-hyperpod/.coverage.*
2525

26+
/hyperpod-cluster-stack-template/build
27+
/hyperpod-pytorch-job-template/build
28+
/hyperpod-custom-inference-template/build
29+
/hyperpod-jumpstart-inference-template/build
30+
2631
# Ignore all contents of result and results directories
2732
/result/
2833
/results/
@@ -31,5 +36,3 @@ doc/_build/
3136

3237
.venv*
3338
venv
34-
35-
/hyperpod-cluster-stack-template/build

src/sagemaker/hyperpod/training/quota_allocation_util.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,6 @@
137137
"ml.i3en.24xlarge": {"cpu": 96, "gpu": 0, "trainium": 0, "memory": 768}
138138
}
139139

140-
MAX_MEMORY_PROPORTION = 0.85
141-
MAX_CPU_PROPORTION = 0.92
142-
143140
def _has_compute_resource_quota_allocation_resources(memory_in_gib: Optional[float], vcpu: Optional[float], accelerators: Optional[int]) -> bool:
144141
return (
145142
(memory_in_gib is not None) or
@@ -269,7 +266,7 @@ def _resolve_default_cpu_values(instance_type: str, requests_values: dict) -> No
269266
f"Maximum available CPU for {instance_type} is {total_available_cpu}."
270267
)
271268

272-
max_allocatable_cpu = int(total_available_cpu * MAX_CPU_PROPORTION)
269+
max_allocatable_cpu = int(total_available_cpu - _calculate_cpu_reservation(total_available_cpu))
273270
cpu_request = min(cpu_request, max_allocatable_cpu)
274271
requests_values["cpu"] = str(cpu_request)
275272

@@ -297,9 +294,11 @@ def _resolve_default_memory_values(instance_type: str, requests_values: dict, li
297294
f"Maximum available memory for {instance_type} is {total_available_memory}Gi."
298295
)
299296

300-
max_allocatable_memory = int(total_available_memory * MAX_MEMORY_PROPORTION)
297+
max_allocatable_memory = int(total_available_memory - _calculate_memory_reservation(total_available_memory))
298+
301299
if not user_set_limit:
302300
memory_limit = min(memory_limit, max_allocatable_memory)
301+
303302
memory_request = min(memory_request, max_allocatable_memory)
304303
limits_values["memory"] = str(memory_limit) + "Gi"
305304
requests_values["memory"] = str(memory_request) + "Gi"
@@ -387,32 +386,32 @@ def _calculate_memory_reservation(memory_gb):
387386
reserved_memory = static_memory_overhead
388387
remaining = memory_gb
389388

390-
# First 4 GB (25%)
389+
# First 4 GB (30%)
391390
first_4gb = min(4, remaining)
392-
reserved_memory += first_4gb * 0.25
391+
reserved_memory += first_4gb * 0.3
393392
remaining -= first_4gb
394393

395-
# Next 4 GB (20%)
394+
# Next 4 GB (25%)
396395
if remaining > 0:
397396
next_4gb = min(4, remaining)
398-
reserved_memory += next_4gb * 0.20
397+
reserved_memory += next_4gb * 0.25
399398
remaining -= next_4gb
400399

401-
# Next 8 GB (10%)
400+
# Next 8 GB (20%)
402401
if remaining > 0:
403402
next_8gb = min(8, remaining)
404-
reserved_memory += next_8gb * 0.10
403+
reserved_memory += next_8gb * 0.2
405404
remaining -= next_8gb
406405

407-
# Next 112 GB (6%)
406+
# Next 112 GB (17%)
408407
if remaining > 0:
409408
next_112gb = min(112, remaining)
410-
reserved_memory += next_112gb * 0.06
409+
reserved_memory += next_112gb * 0.17
411410
remaining -= next_112gb
412411

413-
# Remaining memory (2%)
412+
# Remaining memory (7%)
414413
if remaining > 0:
415-
reserved_memory += remaining * 0.02
414+
reserved_memory += remaining * 0.07
416415

417416
return reserved_memory
418417

@@ -424,21 +423,21 @@ def _calculate_cpu_reservation(cpu_count):
424423

425424
reserved_cpu = static_cpu_overhead
426425

427-
# First core (6%)
426+
# First core (30%)
428427
if cpu_count >= 1:
429-
reserved_cpu += 0.06
428+
reserved_cpu += 0.3
430429

431-
# Second core (1%)
430+
# Second core (15%)
432431
if cpu_count >= 2:
433-
reserved_cpu += 0.01
432+
reserved_cpu += 0.15
434433

435-
# Cores 3-4 (0.5% each)
434+
# Cores 3-4 (10% each)
436435
for _ in range(min(2, max(0, cpu_count - 2))):
437-
reserved_cpu += 0.005
436+
reserved_cpu += 0.1
438437

439-
# Remaining cores (0.25% each)
438+
# Remaining cores (6% each)
440439
if cpu_count > 4:
441-
reserved_cpu += (cpu_count - 4) * 0.0025
440+
reserved_cpu += (cpu_count - 4) * 0.06
442441

443442
return reserved_cpu
444443

test/integration_tests/training/cli/test_gpu_quota_allocation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,8 @@ def test_create_job_with_only_accelerators_parameter(self, test_job_name):
149149
]
150150
result = execute_command(describe_cmd)
151151
assert result.returncode == 0
152-
assert " Limits: {'memory': '108Gi', 'nvidia.com/gpu': '1'}" in result.stdout
153-
assert " Requests: {'cpu': '29', 'memory': '108Gi', 'nvidia.com/gpu': '1'}" in result.stdout
152+
assert " Limits: {'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
153+
assert " Requests: {'cpu': '29', 'memory': '104Gi', 'nvidia.com/gpu': '1'}" in result.stdout
154154

155155
delete_cmd = [
156156
"hyp", "delete", "hyp-pytorch-job",

test/unit_tests/cli/test_quota_allocation_util.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,6 @@
3030
INSTANCE_RESOURCES
3131
)
3232

33-
MAX_MEMORY_PROPORTION = 0.85
34-
MAX_CPU_PROPORTION = 0.92
35-
3633
def float_equals(a, b, tolerance=0.0001):
3734
return abs(a - b) <= tolerance
3835

@@ -107,7 +104,7 @@ def test_get_resources_from_compute_quotas_memory_only(self):
107104
def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_1(self):
108105
result = _get_resources_from_compute_quotas("ml.g5.xlarge", None, None, 1)
109106
# ml.g5.xlarge has 1 GPU, 4 CPUs, 16GiB memory
110-
assert result == {"cpu": "3.82", "memory": "12.9Gi", "nvidia.com/gpu": 1}
107+
assert result == {"cpu": "3.25", "memory": "11.7Gi", "nvidia.com/gpu": 1}
111108

112109
def test_get_resources_from_compute_quotas_gpu_instance_with_accelerators_ratio_half(self):
113110
result = _get_resources_from_compute_quotas("ml.g6e.48xlarge", None, None, 4)
@@ -137,7 +134,7 @@ def test_get_resources_from_compute_quotas_vcpu_only(self):
137134
def test_get_resources_from_compute_quotas_accelerators_and_cpu_only(self):
138135
result = _get_resources_from_compute_quotas("ml.g5.xlarge", 2.0, None, 1)
139136
# ml.g5.xlarge has 1 gpu, 4 CPUs and 16GB memory, and memory calculated as accelerator ratio
140-
assert result == {'cpu': '2.0', 'memory': '12.9Gi', 'nvidia.com/gpu': 1}
137+
assert result == {'cpu': '2.0', 'memory': '11.7Gi', 'nvidia.com/gpu': 1}
141138

142139
# Tests for _get_resources_from_instance method
143140
@pytest.mark.parametrize(
@@ -312,8 +309,8 @@ def test_resolve_default_memory_values_set_to_allocatable(self):
312309
requests = {"memory": "16Gi"}
313310
limits = {}
314311
_resolve_default_memory_values("ml.g5.xlarge", requests, limits)
315-
assert requests["memory"] == "13Gi"
316-
assert limits["memory"] == "13Gi"
312+
assert requests["memory"] == "11Gi"
313+
assert limits["memory"] == "11Gi"
317314

318315
# Tests for _validate_accelerators_inputs
319316
def test_validate_accelerators_inputs_valid_equal_values(self):
@@ -419,17 +416,17 @@ def test_request_modification(self):
419416
def test_memory_reservation_small_instance(self):
420417
memory_gb = 4
421418
reserved = _calculate_memory_reservation(memory_gb)
422-
assert float_equals(reserved, 1.5)
419+
assert float_equals(reserved, 1.7)
423420

424421
def test_memory_reservation_medium_instance(self):
425422
memory_gb = 16
426423
reserved = _calculate_memory_reservation(memory_gb)
427-
assert (float_equals(reserved, 3.1))
424+
assert (float_equals(reserved, 4.3))
428425

429426
def test_memory_reservation_large_instance(self):
430427
memory_gb = 2048
431428
reserved = _calculate_memory_reservation(memory_gb)
432-
assert (float_equals(reserved, 48.22))
429+
assert (float_equals(reserved, 157.74))
433430

434431
def test_memory_reservation_zero(self):
435432
memory_gb = 0
@@ -440,23 +437,23 @@ def test_cpu_reservation_single_core(self):
440437
"""Test CPU reservation for single core"""
441438
cpu_count = 1
442439
reserved = _calculate_cpu_reservation(cpu_count)
443-
assert (float_equals(reserved, 0.16))
440+
assert (float_equals(reserved, 0.4))
444441

445442
def test_cpu_reservation_dual_core(self):
446443
cpu_count = 2
447444
reserved = _calculate_cpu_reservation(cpu_count)
448-
assert (float_equals(reserved, 0.17))
445+
assert (float_equals(reserved, 0.55))
449446

450447
def test_cpu_reservation_quad_core(self):
451448
cpu_count = 4
452449
reserved = _calculate_cpu_reservation(cpu_count)
453-
assert (float_equals(reserved, 0.18))
450+
assert (float_equals(reserved, 0.75))
454451

455452
def test_cpu_reservation_many_cores(self):
456453
"""Test CPU reservation for 96 cores"""
457454
cpu_count = 96
458455
reserved = _calculate_cpu_reservation(cpu_count)
459-
assert (float_equals(reserved, 0.41))
456+
assert (float_equals(reserved, 6.27))
460457

461458
def test_cpu_reservation_zero(self):
462459
"""Test CPU reservation with 0 cores"""

0 commit comments

Comments
 (0)