Skip to content

Commit e51fd95

Browse files
author
Roja Reddy Sareddy
committed
feat: add get_operator_logs to pytorch job
1 parent 96c5b2b commit e51fd95

File tree

7 files changed

+91
-2
lines changed

7 files changed

+91
-2
lines changed

src/sagemaker/hyperpod/cli/commands/training.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,21 @@ def pytorch_get_logs(job_name: str, pod_name: str, namespace: str):
354354

355355
except Exception as e:
356356
raise click.UsageError(f"Failed to list jobs: {str(e)}")
357+
358+
359+
@click.command("hyp-pytorch-job")
@click.option(
    "--since-hours",
    type=click.FLOAT,
    required=True,
    help="Required. The time frame to get logs for.",
)
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "get_pytorch_operator_logs")
def pytorch_get_operator_logs(since_hours: float):
    """
    Get operator logs for pytorch training jobs.
    """
    # The docstring above doubles as the command's --help text, so it is kept
    # short and user-facing; implementation notes live in comments instead.
    #
    # Delegate to the SDK classmethod and print whatever it returns; the
    # look-back window is forwarded unchanged as a (possibly fractional)
    # number of hours.
    click.echo(HyperPodPytorchJob.get_operator_logs(since_hours=since_hours))

src/sagemaker/hyperpod/cli/hyp_cli.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
pytorch_delete,
1616
pytorch_list_pods,
1717
pytorch_get_logs,
18+
pytorch_get_operator_logs,
1819
)
1920
from sagemaker.hyperpod.cli.commands.inference import (
2021
js_create,
@@ -116,6 +117,7 @@ def get_operator_logs():
116117
get_logs.add_command(js_get_logs)
117118
get_logs.add_command(custom_get_logs)
118119

120+
get_operator_logs.add_command(pytorch_get_operator_logs)
119121
get_operator_logs.add_command(js_get_operator_logs)
120122
get_operator_logs.add_command(custom_get_operator_logs)
121123

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
API_VERSION = "v1"
2424
PLURAL = "hyperpodpytorchjobs"
2525
KIND = "HyperPodPyTorchJob"
26+
TRAINING_OPERATOR_NAMESPACE = "aws-hyperpod"
2627

2728

2829
class HyperPodPytorchJob(_HyperPodPytorchJob):
@@ -233,6 +234,36 @@ def get_logs_from_pod(self, pod_name: str, container: Optional[str] = None) -> s
233234
logger.error(f"Failed to get logs from pod {pod_name}!")
234235
handle_exception(e, self.metadata.name, self.metadata.namespace)
235236

237+
@classmethod
@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "get_operator_logs_pytorchjob")
def get_operator_logs(cls, since_hours: float):
    """Return recent logs from a pod in the training operator namespace.

    Lists the pods in ``TRAINING_OPERATOR_NAMESPACE`` and reads the log of
    the first one returned, with timestamps, limited to the last
    ``since_hours`` hours.

    Args:
        since_hours: Look-back window in hours. Fractional values are
            accepted; the window is truncated to whole seconds and must
            come to at least one second.

    Returns:
        The value returned by ``CoreV1Api.read_namespaced_pod_log`` for the
        selected pod.

    Raises:
        ValueError: If ``since_hours`` converts to less than one second.
        Exception: If the namespace contains no pods, or if reading the
            log fails (the error is routed through ``handle_exception``).
    """
    # Kubernetes rejects sinceSeconds values below 1, so fail early with a
    # clear message rather than surfacing an opaque API error later.
    since_seconds = int(3600 * since_hours)
    if since_seconds < 1:
        raise ValueError(
            f"since_hours must cover at least one second, got {since_hours}"
        )

    cls.verify_kube_config()

    v1 = client.CoreV1Api()

    pods = v1.list_namespaced_pod(namespace=TRAINING_OPERATOR_NAMESPACE)

    if not pods.items:
        raise Exception(
            f"No pod found in namespace {TRAINING_OPERATOR_NAMESPACE}"
        )

    # Get logs from first pod
    # NOTE(review): no label selector is applied, so this assumes the first
    # pod listed in the namespace is the operator pod - confirm.
    pod_name = pods.items[0].metadata.name

    try:
        # Returning from inside the try means there is no code path that
        # reaches a `return` with the result variable unbound.
        return v1.read_namespaced_pod_log(
            name=pod_name,
            namespace=TRAINING_OPERATOR_NAMESPACE,
            timestamps=True,
            since_seconds=since_seconds,
        )
    except Exception as e:
        handle_exception(e, pod_name, TRAINING_OPERATOR_NAMESPACE)
        # handle_exception is expected to raise; if it ever returns, re-raise
        # the original error instead of silently returning None.
        raise
266+
236267

237268
def _load_hp_job(response: dict) -> HyperPodPytorchJob:
238269

test/integration_tests/training/cli/test_cli_training.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,4 +239,9 @@ def test_delete_job(self, test_job_name):
239239
assert list_result.returncode == 0
240240

241241
# The job name should no longer be in the output
242-
assert test_job_name not in list_result.stdout
242+
assert test_job_name not in list_result.stdout
243+
244+
def test_pytorch_get_operator_logs():
    """Test getting operator logs via CLI"""
    # Ask the CLI for the last hour of PyTorch operator logs and only check
    # that the command exits cleanly.
    command = ["hyp", "get-operator-logs", "hyp-pytorch-job", "--since-hours", "1"]
    completed = execute_command(command)
    assert completed.returncode == 0

test/integration_tests/training/sdk/test_sdk_training.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,8 @@ def test_delete_job(self, pytorch_job):
112112
jobs = HyperPodPytorchJob.list()
113113
job_names = [job.metadata.name for job in jobs]
114114
assert pytorch_job.metadata.name not in job_names
115+
116+
def test_get_operator_logs():
    """Test getting operator logs"""
    # A truthy result means the SDK call returned some log content for the
    # last hour.
    operator_logs = HyperPodPytorchJob.get_operator_logs(since_hours=1)
    assert operator_logs

test/unit_tests/cli/test_training.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
pytorch_create,
77
list_jobs,
88
pytorch_describe,
9+
pytorch_get_operator_logs,
910
)
1011
from hyperpod_pytorch_job_template.v1_1.model import ALLOWED_TOPOLOGY_LABELS
1112
import sys
@@ -827,3 +828,12 @@ def test_none_topology_labels(self):
827828
)
828829
self.assertIsNone(config.preferred_topology)
829830
self.assertIsNone(config.required_topology)
831+
832+
@patch('sagemaker.hyperpod.cli.commands.training.HyperPodPytorchJob')
def test_pytorch_get_operator_logs(mock_hp):
    """The CLI command echoes the SDK result and forwards --since-hours as a float."""
    expected_output = "operator logs"
    mock_hp.get_operator_logs.return_value = expected_output

    outcome = CliRunner().invoke(pytorch_get_operator_logs, ['--since-hours', '2'])

    assert outcome.exit_code == 0
    assert expected_output in outcome.output
    # click.FLOAT converts the "2" argument, so the SDK sees 2.0.
    mock_hp.get_operator_logs.assert_called_once_with(since_hours=2.0)

test/unit_tests/training/test_hyperpod_pytorch_job.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,24 @@ def test_get_logs_from_pod_with_container_name(
283283
)
284284
self.assertEqual(result, "test logs")
285285

286+
@patch("kubernetes.client.CoreV1Api")
@patch.object(HyperPodPytorchJob, "verify_kube_config")
def test_get_operator_logs(self, mock_verify_config, mock_core_api):
    """get_operator_logs reads the first listed pod's log with the window in seconds."""
    api = mock_core_api.return_value

    # A single fake pod stands in for the operator pod in the namespace.
    operator_pod = MagicMock()
    operator_pod.metadata.name = "training-operator-pod"
    api.list_namespaced_pod.return_value.items = [operator_pod]
    api.read_namespaced_pod_log.return_value = "training operator logs"

    result = HyperPodPytorchJob.get_operator_logs(2.5)

    self.assertEqual(result, "training operator logs")
    # 2.5 hours * 3600 seconds/hour = 9000 seconds.
    api.read_namespaced_pod_log.assert_called_once_with(
        name="training-operator-pod",
        namespace="aws-hyperpod",
        timestamps=True,
        since_seconds=9000,
    )
303+
286304

287305
class TestLoadHpJob(unittest.TestCase):
288306
"""Test the _load_hp_job function"""
@@ -350,4 +368,4 @@ def test_load_hp_job_list_empty(self):
350368
result = _load_hp_job_list(response)
351369

352370
self.assertEqual(len(result), 0)
353-
self.assertEqual(result, [])
371+
self.assertEqual(result, [])

0 commit comments

Comments
 (0)