Skip to content

Commit 2b26f75

Browse files
Consolidate debug flag to show Kubernetes exception (#335)
* set template-version flag to optional for cluster create, add support for EFA for PyTorch job, remove default request and limits when instance type is none * fix GPU allocation validation error * remove redundant * fix unit test and expand logic to memory and vCPU field * Follow up on merge conflict in release * consolidate all debug flags to show Kubernetes exception * Revert "Follow up on merge conflict in release" This reverts commit c816838. * fix unit and integ test for space * fix more unit test for space * change dependency for delete in init integ test
1 parent 530792a commit 2b26f75

File tree

14 files changed

+50
-32
lines changed

14 files changed

+50
-32
lines changed

hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -480,16 +480,6 @@ def build_dict(**kwargs):
480480

481481
result = HyperPodPytorchJob(**job_kwargs)
482482
return result
483-
484-
def create_from_k8s_yaml(self, yaml_file_path: str) -> None:
485-
"""Create HyperPodPytorchJob from k8s YAML file."""
486-
with open(yaml_file_path, 'r') as f:
487-
yaml_data = yaml.safe_load(f)
488-
489-
# Combine metadata and spec for full validation
490-
full_data = {**yaml_data['spec'], 'metadata': yaml_data['metadata']}
491-
job = HyperPodPytorchJob.model_validate(full_data, by_name=True)
492-
job.create()
493483

494484

495485
# Volume-specific type handlers - only override what's needed

src/sagemaker/hyperpod/cli/commands/inference.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
# CREATE
2222
@click.command("hyp-jumpstart-endpoint")
2323
@click.option("--version", default="1.1", help="Schema version to use")
24-
@click.option("--debug", default=False, help="Enable debug mode")
24+
@click.option("--debug", is_flag=True, help="Enable debug mode")
2525
@generate_click_command(
2626
schema_pkg="hyperpod_jumpstart_inference_template",
2727
registry=JS_REG,
@@ -38,7 +38,7 @@ def js_create(version, debug, js_endpoint):
3838

3939
@click.command("hyp-custom-endpoint")
4040
@click.option("--version", default="1.1", help="Schema version to use")
41-
@click.option("--debug", default=False, help="Enable debug mode")
41+
@click.option("--debug", is_flag=True, help="Enable debug mode")
4242
@generate_click_command(
4343
schema_pkg="hyperpod_custom_inference_template",
4444
registry=C_REG,

src/sagemaker/hyperpod/cli/commands/init.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,9 @@ def validate():
274274
@click.command(name="_default_create")
275275
@click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.")
276276
@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.")
277+
@click.option("--debug", is_flag=True, help="Enable debug logging")
277278
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli")
278-
def _default_create(region, template_version):
279+
def _default_create(region, template_version, debug):
279280
"""
280281
Validate configuration and render template files for deployment.
281282
@@ -384,7 +385,7 @@ def _default_create(region, template_version):
384385
else:
385386
# Create from k8s.yaml
386387
k8s_file = out_dir / 'k8s.yaml'
387-
create_from_k8s_yaml(str(k8s_file))
388+
create_from_k8s_yaml(str(k8s_file), debug=debug)
388389

389390

390391
except Exception as e:

src/sagemaker/hyperpod/cli/commands/space.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,16 @@
1313

1414

1515
@click.command("hyp-space")
16+
@click.option("--debug", is_flag=True, help="Enable debug mode")
1617
@generate_click_command(
1718
schema_pkg="hyperpod_space_template",
1819
registry=SCHEMA_REGISTRY,
1920
)
20-
def space_create(version, config):
21+
def space_create(version, debug, config):
2122
"""Create a space resource."""
2223
space_config = SpaceConfig(**config)
2324
space = HPSpace(config=space_config)
24-
space.create()
25+
space.create(debug=debug)
2526
click.echo(f"Space '{space_config.name}' created successfully in namespace '{space_config.namespace}'")
2627

2728

src/sagemaker/hyperpod/cli/commands/training.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
@click.command("hyp-pytorch-job")
1515
@click.option("--version", default="1.0", help="Schema version to use")
16-
@click.option("--debug", default=False, help="Enable debug mode")
16+
@click.option("--debug", is_flag=True, help="Enable debug mode")
1717
@generate_click_command(
1818
schema_pkg="hyperpod_pytorch_job_template",
1919
registry=SCHEMA_REGISTRY,

src/sagemaker/hyperpod/cli/init_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def build_config_from_schema(template: str, version: str, model_config=None, exi
536536
return full_cfg, comment_map
537537

538538

539-
def create_from_k8s_yaml(yaml_file_path: str) -> None:
539+
def create_from_k8s_yaml(yaml_file_path: str, debug: bool = False) -> None:
540540
"""Create HyperPod resource from K8s YAML file based on kind mapping."""
541541
from sagemaker.hyperpod.cli.constants.init_constants import K8S_KIND_MAPPING
542542

@@ -563,4 +563,4 @@ def create_from_k8s_yaml(yaml_file_path: str) -> None:
563563
resource = resource_class.model_validate(yaml_data['spec'], by_name=True)
564564
resource.metadata = Metadata.model_validate(yaml_data['metadata'], by_name=True)
565565

566-
resource.create()
566+
resource.create(debug=debug)

src/sagemaker/hyperpod/cli/space_utils.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ def _parse_idle_shutdown_param(ctx, param, value):
200200
# 1) the wrapper click will call
201201
def wrapped_func(*args, **kwargs):
202202
version = version_key or kwargs.pop("version", "1.0")
203+
debug = kwargs.pop("debug", False)
203204

204205
Model = registry.get(version)
205206
if Model is None:
@@ -241,6 +242,10 @@ def wrapped_func(*args, **kwargs):
241242
# filter out None/empty values so Pydantic model defaults apply
242243
filtered_kwargs = {}
243244
for key, value in kwargs.items():
245+
# Skip debug parameter as it's not part of the model
246+
if key == "debug":
247+
continue
248+
244249
if value is not None:
245250
# Parse JSON for object/array type parameters
246251
spec = props.get(key, {})
@@ -285,7 +290,13 @@ def wrapped_func(*args, **kwargs):
285290
f"Configuration validation errors:\n" + "\n".join(error_messages)
286291
)
287292

288-
return func(version, config_dict)
293+
# Call the original function with appropriate parameters
294+
import inspect
295+
sig = inspect.signature(func)
296+
if 'debug' in sig.parameters:
297+
return func(version, debug, config_dict)
298+
else:
299+
return func(version, config_dict)
289300

290301
# 2) inject click options from JSON Schema
291302
wrapped_func = click.option(
@@ -387,6 +398,7 @@ def wrapped_func(*args, **kwargs):
387398
"container_config",
388399
"template_ref",
389400
"idle_shutdown",
401+
"debug", # Exclude debug from validation
390402
]
391403
)
392404

src/sagemaker/hyperpod/common/utils.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def get_default_namespace():
3939
)
4040

4141
def handle_exception(e: Exception, name: str, namespace: Optional[str],
42-
operation_type: str = 'unknown', resource_type: str = 'unknown'):
42+
operation_type: str = 'unknown', resource_type: str = 'unknown', debug: bool = False):
4343
"""
4444
Handle various Kubernetes API exceptions for SDK usage (non-CLI).
4545
@@ -52,13 +52,17 @@ def handle_exception(e: Exception, name: str, namespace: Optional[str],
5252
namespace: Kubernetes namespace
5353
operation_type: Operation type (legacy parameter, kept for backward compatibility)
5454
resource_type: Resource type (legacy parameter, kept for backward compatibility)
55+
debug: If True, show full Kubernetes exception details
5556
"""
5657

5758
if isinstance(e, ApiException):
5859
if e.status == 401:
5960
raise Exception(f"Credentials unauthorized.") from e
6061
elif e.status == 403:
61-
if namespace:
62+
if debug and e.body:
63+
# Show full Kubernetes error details in debug mode
64+
raise Exception(f"Kubernetes API error: {e.body}") from e
65+
elif namespace:
6266
raise Exception(
6367
f"Access denied to resource '{name}' in namespace '{namespace}'."
6468
) from e
@@ -67,7 +71,9 @@ def handle_exception(e: Exception, name: str, namespace: Optional[str],
6771
f"Access denied to resource '{name}'."
6872
) from e
6973
elif e.status == 404:
70-
if namespace:
74+
if debug and e.body:
75+
raise Exception(f"Kubernetes API error: {e.body}") from e
76+
elif namespace:
7177
# Basic 404 for SDK usage - CLI commands get enhanced 404 via decorator
7278
raise Exception(
7379
f"Resource '{name}' not found in namespace '{namespace}'. "
@@ -78,7 +84,9 @@ def handle_exception(e: Exception, name: str, namespace: Optional[str],
7884
f"Resource '{name}' not found. Please check the resource name."
7985
) from e
8086
elif e.status == 409:
81-
if namespace:
87+
if debug and e.body:
88+
raise Exception(f"Kubernetes API error: {e.body}") from e
89+
elif namespace:
8290
raise Exception(
8391
f"Resource '{name}' already exists in namespace '{namespace}'."
8492
) from e
@@ -87,9 +95,15 @@ def handle_exception(e: Exception, name: str, namespace: Optional[str],
8795
f"Resource '{name}' already exists."
8896
) from e
8997
elif 500 <= e.status < 600:
90-
raise Exception("Kubernetes API internal server error.") from e
98+
if debug and e.body:
99+
raise Exception(f"Kubernetes API error: {e.body}") from e
100+
else:
101+
raise Exception("Kubernetes API internal server error.") from e
91102
else:
92-
raise Exception(f"Unhandled Kubernetes error: {e.status} {e.reason}") from e
103+
if debug and e.body:
104+
raise Exception(f"Kubernetes API error: {e.body}") from e
105+
else:
106+
raise Exception(f"Unhandled Kubernetes error: {e.status} {e.reason}") from e
93107

94108
if isinstance(e, ValidationError):
95109
raise Exception("Response did not match expected schema.") from e

src/sagemaker/hyperpod/inference/hp_endpoint_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ def call_create_api(
127127
)
128128
except Exception as e:
129129
logger.error(f"Failed to create endpoint in namespace {metadata.namespace}!")
130-
handle_exception(e, metadata.name, metadata.namespace)
130+
handle_exception(e, metadata.name, metadata.namespace, debug=debug)
131131

132132
@classmethod
133133
def call_list_api(

src/sagemaker/hyperpod/space/hyperpod_space.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ def create(self, debug: bool = False):
301301
logger.debug(f"Successfully created HyperPod Space '{self.config.name}'!")
302302
except Exception as e:
303303
logger.error(f"Failed to create HyperPod Space {self.config.name}!")
304-
handle_exception(e, self.config.name, self.config.namespace)
304+
handle_exception(e, self.config.name, self.config.namespace, debug=debug)
305305

306306
@classmethod
307307
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "list_spaces")

0 commit comments

Comments
 (0)