Skip to content

Commit 6ea09ac

Browse files
author
Roja Reddy Sareddy
committed
feat: add get_operator_logs to pytorch job
1 parent e716b54 commit 6ea09ac

File tree

2 files changed

+9
-7
lines changed

2 files changed

+9
-7
lines changed

src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
2424
PLURAL = "hyperpodpytorchjobs"
2525
KIND = "HyperPodPyTorchJob"
2626
TRAINING_OPERATOR_NAMESPACE = "aws-hyperpod"
27-
TRAINING_OPERATOR_POD_PREFIX = "hp-training-operator-hp-training-controller-manager-"
27+
TRAINING_OPERATOR_LABEL = "hp-training-control-plane"
2828

2929

3030
class HyperPodPytorchJob(_HyperPodPytorchJob):
@@ -249,16 +249,16 @@ def get_operator_logs(cls, since_hours: float):
249249
f"No pod found in namespace {TRAINING_OPERATOR_NAMESPACE}"
250250
)
251251

252-
# Find the training operator pod
252+
# Find the training operator pod by label
253253
operator_pod = None
254254
for pod in pods.items:
255-
if pod.metadata.name.startswith(TRAINING_OPERATOR_POD_PREFIX):
255+
if pod.metadata.labels and TRAINING_OPERATOR_LABEL in pod.metadata.labels:
256256
operator_pod = pod
257257
break
258258

259259
if not operator_pod:
260260
raise Exception(
261-
f"No training operator pod found with prefix {TRAINING_OPERATOR_POD_PREFIX}"
261+
f"No training operator pod found with label {TRAINING_OPERATOR_LABEL}"
262262
)
263263

264264
pod_name = operator_pod.metadata.name

test/unit_tests/training/test_hyperpod_pytorch_job.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -286,12 +286,14 @@ def test_get_logs_from_pod_with_container_name(
286286
@patch("kubernetes.client.CoreV1Api")
287287
@patch.object(HyperPodPytorchJob, "verify_kube_config")
288288
def test_get_operator_logs(self, mock_verify_config, mock_core_api):
289-
# Mock multiple pods, including the training operator pod
289+
# Mock multiple pods, including the training operator pod with correct label
290290
mock_other_pod = MagicMock()
291291
mock_other_pod.metadata.name = "other-pod-123"
292+
mock_other_pod.metadata.labels = {"app": "other"}
292293

293294
mock_operator_pod = MagicMock()
294-
mock_operator_pod.metadata.name = "hp-training-operator-hp-training-controller-manager-abc123"
295+
mock_operator_pod.metadata.name = "training-operator-pod-abc123"
296+
mock_operator_pod.metadata.labels = {"hp-training-control-plane": "true"}
295297

296298
mock_core_api.return_value.list_namespaced_pod.return_value.items = [mock_other_pod, mock_operator_pod]
297299
mock_core_api.return_value.read_namespaced_pod_log.return_value = "training operator logs"
@@ -300,7 +302,7 @@ def test_get_operator_logs(self, mock_verify_config, mock_core_api):
300302

301303
self.assertEqual(result, "training operator logs")
302304
mock_core_api.return_value.read_namespaced_pod_log.assert_called_once_with(
303-
name="hp-training-operator-hp-training-controller-manager-abc123",
305+
name="training-operator-pod-abc123",
304306
namespace="aws-hyperpod",
305307
timestamps=True,
306308
since_seconds=9000,

0 commit comments

Comments
 (0)