Commit 6049212

set template-version flag to optional for cluster create, add support for efa for pytorch job, remove default request and limits when instance type is none
1 parent 0e32919 commit 6049212

5 files changed: +166 −16 lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 31 additions & 13 deletions

@@ -333,19 +333,37 @@ def build_dict(**kwargs):
             return {k: v for k, v in kwargs.items() if v is not None}

         # Build resources
-        if self.instance_type is None:
-            requests_value = limits_value = {"nvidia.com/gpu": "0"}
-        else:
-            requests_value = build_dict(
-                accelerators=str(self.accelerators) if self.accelerators else None,
-                vcpu=str(self.vcpu) if self.vcpu else None,
-                memory=str(self.memory) if self.memory else None
-            )
-            limits_value = build_dict(
-                accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
-                vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
-                memory=str(self.memory_limit) if self.memory_limit else None
-            )
+        requests_value = {}
+        limits_value = {}
+
+        # Add GPU resources (respect accelerators regardless of instance_type)
+        if self.accelerators:
+            requests_value["nvidia.com/gpu"] = str(self.accelerators)
+        if self.accelerators_limit:
+            limits_value["nvidia.com/gpu"] = str(self.accelerators_limit)
+
+        # Add CPU resources
+        if self.vcpu:
+            requests_value["cpu"] = str(self.vcpu)
+        if self.vcpu_limit:
+            limits_value["cpu"] = str(self.vcpu_limit)
+
+        # Add memory resources
+        if self.memory:
+            requests_value["memory"] = f"{self.memory}Gi"
+        if self.memory_limit:
+            limits_value["memory"] = f"{self.memory_limit}Gi"
+
+        # Add EFA for multi-node jobs
+        if self.node_count and self.node_count > 1:
+            requests_value["vpc.amazonaws.com/efa"] = "1"
+            limits_value["vpc.amazonaws.com/efa"] = "1"
+
+        # Set default GPU to "0" only if no resources specified at all
+        if not requests_value:
+            requests_value = {"nvidia.com/gpu": "0"}
+        if not limits_value:
+            limits_value = {"nvidia.com/gpu": "0"}

         # Build container
         container_kwargs = build_dict(
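With this change the model builds requests/limits from whatever resource fields are set, adds vpc.amazonaws.com/efa when node_count > 1, and only falls back to the "0"-GPU default when nothing is specified. A minimal sketch of the resulting behavior (it mirrors the new tests further down and assumes the v1_1 package is importable):

    from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig

    # Hypothetical config: 2 nodes, 8 GPUs, and no instance_type given.
    config = PyTorchJobConfig(
        job_name="efa-demo",
        image="pytorch:latest",
        node_count=2,
        accelerators=8,
    )
    container = config.to_domain().replicaSpecs[0].template.spec.containers[0]

    # Per the logic above, requests now include both GPU and EFA entries:
    # {"nvidia.com/gpu": "8", "vpc.amazonaws.com/efa": "1"}
    print(container.resources.requests)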

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/template.py

Lines changed: 8 additions & 2 deletions

@@ -84,7 +84,7 @@
 {%- endfor %}
 {%- endif %}
 resources:
-{%- if accelerators or vcpu or memory %}
+{%- if accelerators or vcpu or memory or (node_count and node_count > 1) %}
 requests:
 {%- if accelerators %}
 nvidia.com/gpu: {{ accelerators }}
@@ -95,11 +95,14 @@
 {%- if memory %}
 memory: {{ memory }}Gi
 {%- endif %}
+{%- if (node_count and node_count > 1) %}
+vpc.amazonaws.com/efa: 1
+{%- endif %}
 {%- else %}
 requests:
 nvidia.com/gpu: "0"
 {%- endif %}
-{%- if accelerators_limit or vcpu_limit or memory_limit %}
+{%- if accelerators_limit or vcpu_limit or memory_limit or (node_count and node_count > 1) %}
 limits:
 {%- if accelerators_limit %}
 nvidia.com/gpu: {{ accelerators_limit }}
@@ -110,6 +113,9 @@
 {%- if memory_limit %}
 memory: {{ memory_limit }}Gi
 {%- endif %}
+{%- if (node_count and node_count > 1) %}
+vpc.amazonaws.com/efa: 1
+{%- endif %}
 {%- else %}
 limits:
 nvidia.com/gpu: "0"
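Rendered through the template above, a multi-node job now also requests the EFA device. A minimal sketch using a stripped-down copy of the requests block rather than the real template file (jinja2 assumed to be installed):

    from jinja2 import Template

    # Hypothetical miniature of the requests section above, not the packaged template.
    snippet = Template(
        "requests:\n"
        "{%- if accelerators %}\n"
        "  nvidia.com/gpu: {{ accelerators }}\n"
        "{%- endif %}\n"
        "{%- if node_count and node_count > 1 %}\n"
        "  vpc.amazonaws.com/efa: 1\n"
        "{%- endif %}\n"
    )

    print(snippet.render(accelerators=8, node_count=4))
    # requests:
    #   nvidia.com/gpu: 8
    #   vpc.amazonaws.com/efa: 1

    print(snippet.render(accelerators=8, node_count=1))
    # requests:
    #   nvidia.com/gpu: 8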

src/sagemaker/hyperpod/cli/commands/cluster_stack.py

Lines changed: 18 additions & 1 deletion

@@ -30,6 +30,19 @@
 logger = logging.getLogger(__name__)


+def get_newest_template_version() -> int:
+    """Get the newest available template version.
+
+    Returns:
+        int: The newest template version number
+
+    TODO: Implement logic to fetch the actual newest template version
+    from the template registry or remote source.
+    """
+    # Placeholder implementation - currently returns 1 as the latest version
+    return 1
+
+
 def parse_status_list(ctx, param, value):
     """Parse status list from string format like "['CREATE_COMPLETE', 'UPDATE_COMPLETE']" """
     if not value:
@@ -79,7 +92,6 @@ def create_cluster_stack(config_file, region, template_version, debug):
         return

     # Load config to get template and version
-
     config_dir = Path(config_file).parent
     data, template, version = load_config(config_dir)

@@ -95,6 +107,11 @@ def create_cluster_stack(config_file, region, template_version, debug):
     model_instance = model_class(**filtered_config)
     config = model_instance.to_config(region=region)

+    # Use newest template version if not provided
+    if template_version is None:
+        template_version = get_newest_template_version()
+        logger.info(f"No template version specified, using newest version: {template_version}")
+
     # Create the cluster stack
     stack_id = HpClusterStack(**config).create(region, template_version)

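The net effect is that the template-version flag becomes optional for cluster create: when it is omitted, the newest version is resolved via get_newest_template_version(). A minimal sketch of that fallback; resolve_template_version is a hypothetical helper for illustration, not part of the CLI:

    from sagemaker.hyperpod.cli.commands.cluster_stack import get_newest_template_version

    def resolve_template_version(template_version=None):
        # Mirror of the fallback added above: use the newest version when none is given.
        if template_version is None:
            template_version = get_newest_template_version()  # placeholder, currently 1
        return template_version

    print(resolve_template_version())   # -> 1 (newest known version)
    print(resolve_template_version(2))  # -> 2 (an explicit value wins)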

src/sagemaker/hyperpod/cli/commands/init.py

Lines changed: 5 additions & 0 deletions

@@ -11,6 +11,7 @@
     CFN
 )
 from sagemaker.hyperpod.cluster_management.hp_cluster_stack import HpClusterStack
+from sagemaker.hyperpod.cli.commands.cluster_stack import get_newest_template_version
 from sagemaker.hyperpod.cli.init_utils import (
     generate_click_command,
     save_config_yaml,
@@ -375,6 +376,10 @@ def _default_create(region, template_version):
     # Pass region to to_domain for cluster stack template
     if template == "cluster-stack":
         config = template_model.to_config(region=region)
+        # Use newest template version if not provided
+        if template_version is None:
+            template_version = get_newest_template_version()
+            click.secho(f"No template version specified, using newest version: {template_version}", fg="yellow")
         HpClusterStack(**config).create(region, template_version)
     else:
         # Create from k8s.yaml
Lines changed: 104 additions & 0 deletions

@@ -0,0 +1,104 @@
+import unittest
+from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig
+
+
+class TestPyTorchJobConfigEFA(unittest.TestCase):
+    """Test EFA resource allocation in PyTorchJobConfig"""
+
+    def test_single_node_no_efa(self):
+        """Test that single-node jobs don't get EFA resources"""
+        config = PyTorchJobConfig(
+            job_name="test-single-node",
+            image="pytorch:latest",
+            node_count=1,
+            accelerators=2,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        container = job.replicaSpecs[0].template.spec.containers[0]
+
+        # Should not have EFA resources
+        self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
+        self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
+
+        # Should have GPU resources
+        self.assertEqual(container.resources.requests["nvidia.com/gpu"], "2")
+
+    def test_multi_node_with_efa(self):
+        """Test that multi-node jobs automatically get EFA resources"""
+        config = PyTorchJobConfig(
+            job_name="test-multi-node",
+            image="pytorch:latest",
+            node_count=4,
+            accelerators=8,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        container = job.replicaSpecs[0].template.spec.containers[0]
+
+        # Should have EFA resources
+        self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1")
+        self.assertEqual(container.resources.limits["vpc.amazonaws.com/efa"], "1")
+
+        # Should also have GPU resources
+        self.assertEqual(container.resources.requests["nvidia.com/gpu"], "8")
+
+    def test_no_node_count_no_efa(self):
+        """Test that jobs without node_count don't get EFA resources"""
+        config = PyTorchJobConfig(
+            job_name="test-no-node-count",
+            image="pytorch:latest",
+            accelerators=1,
+            instance_type="ml.g5.xlarge"
+        )
+
+        job = config.to_domain()
+        container = job.replicaSpecs[0].template.spec.containers[0]
+
+        # Should not have EFA resources
+        self.assertNotIn("vpc.amazonaws.com/efa", container.resources.requests)
+        self.assertNotIn("vpc.amazonaws.com/efa", container.resources.limits)
+
+    def test_multi_node_with_memory_and_cpu(self):
+        """Test EFA with other resource types"""
+        config = PyTorchJobConfig(
+            job_name="test-multi-resources",
+            image="pytorch:latest",
+            node_count=2,
+            accelerators=4,
+            vcpu=16.0,
+            memory=64.0,
+            instance_type="ml.p4d.24xlarge"
+        )
+
+        job = config.to_domain()
+        container = job.replicaSpecs[0].template.spec.containers[0]
+
+        # Should have all resources including EFA
+        self.assertEqual(container.resources.requests["vpc.amazonaws.com/efa"], "1")
+        self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4")
+        self.assertEqual(container.resources.requests["cpu"], "16.0")
+        self.assertEqual(container.resources.requests["memory"], "64.0Gi")
+
+    def test_accelerators_without_instance_type(self):
+        """Test that accelerators work without instance_type (fixes the main issue)"""
+        config = PyTorchJobConfig(
+            job_name="test-no-instance-type",
+            image="pytorch:latest",
+            accelerators=4
+            # No instance_type specified
+        )
+
+        job = config.to_domain()
+        container = job.replicaSpecs[0].template.spec.containers[0]
+
+        # Should respect accelerators value even without instance_type
+        self.assertEqual(container.resources.requests["nvidia.com/gpu"], "4")
+        # Limits should default to "0" since accelerators_limit not specified
+        self.assertEqual(container.resources.limits["nvidia.com/gpu"], "0")
+
+
+if __name__ == '__main__':
+    unittest.main()
