Skip to content

Commit 26c5071

Browse files
committed
2 parents acd9bdd + 447038f commit 26c5071

File tree

8 files changed

+165
-39
lines changed

8 files changed

+165
-39
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[submodule "src/hyperpod_cli/sagemaker_hyperpod_recipes"]
22
path = src/hyperpod_cli/sagemaker_hyperpod_recipes
33
url = https://github.com/aws/sagemaker-hyperpod-recipes.git
4-
branch = main
4+
branch = 1.3.1

helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ spec:
7272
- key: node.kubernetes.io/instance-type
7373
operator: In
7474
values:
75+
- ml.p5en.48xlarge
76+
- ml.p5e.48xlarge
7577
- ml.p5.48xlarge
7678
- ml.p4d.24xlarge
7779
- ml.p4de.24xlarge
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
namespace: "aws-hyperpod"
2-
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.277.0_1.0.27.0"
2+
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0"

helm_chart/HyperPodHelmChart/charts/job-auto-restart/templates/job-auto-restart-rbac.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ rules:
4646
- get
4747
- list
4848
- watch
49+
- create
50+
- delete
4951
- patch
5052
- update
5153
- describe
@@ -76,4 +78,4 @@ roleRef:
7678
subjects:
7779
- kind: User
7880
name: hyperpod-service-linked-role
79-
namespace: {{ .Values.namespace }}
81+
namespace: {{ .Values.namespace }}

helm_chart/readme.md

Lines changed: 16 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -93,39 +93,34 @@ Notes:
9393
```
9494
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set computeQuotaTarget.targetId=<target_id>
9595
```
96-
### Step Five (only required for changing the health monitoring agent installation on your cluster- version upgrade):
97-
* This command is required to change the version of the Health Monitoring Agent running on your Hyperpod cluster.
96+
### Step Four (whenever you want to upgrade the installation of helm charts):
97+
* Run this command to upgrade the Helm chart installation on your cluster. Upgrading also pulls in the latest releases of service components such as the Health Monitoring Agent.
9898
```
99-
helm upgrade dependencies helm_chart/HyperPodHelmChart/charts/health-monitoring-agent --namespace kube-system -f helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml
99+
helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
100100
```
101101
102102
* To separately install the sub-chart that contains only roles and role bindings:
103103
```
104104
helm install dependencies helm_chart/HyperPodHelmChart/charts/team-role-and-bindings --set computeQuotaTarget.targetId=<target_id>
105105
```
106-
### Step Five (only required for changing the health monitoring agent installation on your cluster- version upgrade):
107-
* This command is required to change the version of the Health Monitoring Agent running on your Hyperpod cluster.
108-
```
109-
helm upgrade dependencies helm_chart/HyperPodHelmChart/charts/health-monitoring-agent --namespace kube-system -f helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/values.yaml
110-
```
111106
112107
## 6. Notes
113108
- Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases
114109
- If you intend to use the Health Monitoring Agent container image from another region, please see the list below to find the relevant region's URI.
115110
```
116-
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
117-
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
118-
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
119-
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
120-
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
121-
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
122-
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
123-
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
124-
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
125-
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
126-
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
127-
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
128-
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.318.0_1.0.35.0
111+
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
112+
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
113+
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
114+
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
115+
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
116+
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
117+
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
118+
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
119+
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
120+
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
121+
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
122+
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
123+
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
129124
```
130125
131126
## 7. Troubleshooting

src/hyperpod_cli/commands/job.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import sys
2020
import subprocess
2121
from typing import Any, Dict, Optional, List
22+
from tabulate import tabulate
2223

2324
import click
2425
import yaml
@@ -40,6 +41,7 @@
4041
SAGEMAKER_QUOTA_ALLOCATION_LABEL,
4142
JobPatchType,
4243
SAGEMAKER_TRAINING_LAUNCHER_DIR,
44+
OutputFormat,
4345
PullPolicy,
4446
RestartPolicy,
4547
PersistentVolumeClaim,
@@ -170,6 +172,13 @@ def get_job(
170172
required=False,
171173
help="Optional. A label selector to filter the listed jobs. The selector supports the '=', '==', and '!=' operators (e.g., `-l key1=value1,key2=value2`).",
172174
)
175+
@click.option(
176+
"--output",
177+
type=click.Choice([c.value for c in OutputFormat]),
178+
required=False,
179+
default=OutputFormat.JSON.value,
180+
help="Optional. The output format. Available values are `TABLE` and `JSON`. The default value is `JSON`.",
181+
)
173182
@click.option(
174183
"--debug",
175184
is_flag=True,
@@ -179,6 +188,7 @@ def list_jobs(
179188
namespace: Optional[str],
180189
all_namespaces: Optional[bool],
181190
selector: Optional[str],
191+
output: Optional[str],
182192
debug: bool,
183193
):
184194
if debug:
@@ -189,7 +199,7 @@ def list_jobs(
189199
try:
190200
logger.debug("Listing training jobs")
191201
result = list_training_job_service.list_training_jobs(
192-
namespace, all_namespaces, selector
202+
namespace, all_namespaces, selector, output
193203
)
194204
click.echo(result)
195205
except Exception as e:

src/hyperpod_cli/service/list_training_jobs.py

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,10 @@
2323
V1ResourceAttributes
2424
)
2525

26-
from hyperpod_cli.constants.command_constants import KUEUE_WORKLOAD_PRIORITY_CLASS_LABEL_KEY
26+
from hyperpod_cli.constants.command_constants import KUEUE_WORKLOAD_PRIORITY_CLASS_LABEL_KEY, OutputFormat
2727
from hyperpod_cli.constants.pytorch_constants import PYTORCH_CUSTOM_OBJECT_GROUP, PYTORCH_CUSTOM_OBJECT_PLURAL
2828
from hyperpod_cli.service.discover_namespaces import DiscoverNamespaces
29+
from tabulate import tabulate
2930

3031

3132
class ListTrainingJobs:
@@ -37,14 +38,14 @@ def list_training_jobs(
3738
namespace: Optional[str],
3839
all_namespaces: Optional[bool],
3940
selector: Optional[str],
41+
output: Optional[str],
4042
) -> str:
4143
"""
4244
List training job provided by the user in the specified namespace.
4345
If namespace is not provided, jobs are listed in the default namespace of the user context.
4446
If all_namespaces is true, training jobs are listed from all namespaces that the user has access to.
4547
When a selector is specified, the list of jobs is additionally filtered by the provided label selector.
4648
"""
47-
4849
k8s_client = KubernetesClient()
4950

5051
jobs: List = []
@@ -80,10 +81,12 @@ def list_training_jobs(
8081
except ApiException as e:
8182
raise RuntimeError(f"Unexpected API error: {e.reason} ({e.status})")
8283

83-
return self._generate_list_training_job_output(jobs)
84+
return self._generate_list_training_job_output(jobs, output)
8485

85-
def _generate_list_training_job_output(self, jobs: List):
86+
def _generate_list_training_job_output(self, jobs: List, output: Optional[str]):
8687
output_jobs = {"jobs": []}
88+
priority_header_required = False
89+
8790
for job in jobs:
8891
if job.get("metadata"):
8992
name = job.get("metadata").get("name")
@@ -104,10 +107,34 @@ def _generate_list_training_job_output(self, jobs: List):
104107

105108
if priority is not None:
106109
job_summary["priority"] = priority
110+
priority_header_required = True
107111

108112
output_jobs["jobs"].append(job_summary)
109113

110-
return json.dumps(output_jobs, indent=1, sort_keys=False)
114+
if output == OutputFormat.TABLE.value:
115+
return self._generate_table(output_jobs, priority_header_required)
116+
return json.dumps(output_jobs, indent=4, sort_keys=False)
117+
118+
def _generate_table(self, output_jobs, priority_header_required):
119+
headers = [
120+
"Name",
121+
"Namespace",
122+
"CreationTime",
123+
"State"
124+
]
125+
126+
if priority_header_required:
127+
headers.append("Priority")
128+
129+
jobs = []
130+
if "jobs" in output_jobs and isinstance(output_jobs["jobs"], list):
131+
for job in output_jobs["jobs"]:
132+
job_values = list(job.values())
133+
if priority_header_required and len(job_values) == 4:
134+
job_values.append("NA")
135+
jobs.append(job_values)
136+
137+
return tabulate(jobs, headers=headers, tablefmt="presto")
111138

112139
def _get_job_status(self, status: List) -> Optional[str]:
113140
current_status = None

test/unit_tests/service/test_list_training_jobs_service.py

Lines changed: 99 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
)
2424

2525
from kubernetes.client.rest import ApiException
26+
from tabulate import tabulate
2627

2728
SAMPLE_OUTPUT = {
2829
"items": [
@@ -152,7 +153,7 @@ def test_list_training_jobs_with_namespace(
152153
mock_kubernetes_client.return_value = self.mock_k8s_client
153154
self.mock_k8s_client.list_training_jobs.return_value = SAMPLE_OUTPUT
154155
result = self.mock_list_training_jobs.list_training_jobs(
155-
"namespace", None, None
156+
"namespace", None, None, None
156157
)
157158
self.assertIn("test-name", result)
158159
self.assertIn("test-name1", result)
@@ -166,7 +167,7 @@ def test_list_training_jobs_without_namespace(
166167
mock_kubernetes_client.return_value = self.mock_k8s_client
167168
self.mock_k8s_client.get_current_context_namespace.return_value = "namespace"
168169
self.mock_k8s_client.list_training_jobs.return_value = SAMPLE_OUTPUT
169-
result = self.mock_list_training_jobs.list_training_jobs(None, None, None)
170+
result = self.mock_list_training_jobs.list_training_jobs(None, None, None, None)
170171
self.assertIn("test-name", result)
171172
self.assertIn("test-name1", result)
172173
self.assertIn("Running", result)
@@ -182,7 +183,7 @@ def test_list_training_jobs_with_namespace_auto_discover(
182183
mock_discover_accessible_namespace.return_value = "discovered-namespace"
183184
self.mock_k8s_client.get_current_context_namespace.return_value = None
184185
self.mock_k8s_client.list_training_jobs.return_value = SAMPLE_OUTPUT
185-
result = self.mock_list_training_jobs.list_training_jobs(None, None, None)
186+
result = self.mock_list_training_jobs.list_training_jobs(None, None, None, None)
186187
mock_discover_accessible_namespace.assert_called_once_with(
187188
V1ResourceAttributes(
188189
verb="list",
@@ -209,7 +210,7 @@ def test_list_training_jobs_without_namespace_no_jobs(
209210
mock_kubernetes_client.return_value = self.mock_k8s_client
210211
self.mock_k8s_client.get_current_context_namespace.return_value = "namespace"
211212
self.mock_k8s_client.list_training_jobs.return_value = {"items": []}
212-
result = self.mock_list_training_jobs.list_training_jobs(None, None, None)
213+
result = self.mock_list_training_jobs.list_training_jobs(None, None, None, None)
213214
self.assertNotIn("test-name", result)
214215
self.assertNotIn("test-name1", result)
215216

@@ -221,7 +222,7 @@ def test_list_training_jobs_all_namespace(
221222
mock_kubernetes_client.return_value = self.mock_k8s_client
222223
self.mock_k8s_client.list_namespaces.return_value = ["namespace"]
223224
self.mock_k8s_client.list_training_jobs.return_value = SAMPLE_OUTPUT
224-
result = self.mock_list_training_jobs.list_training_jobs(None, True, None)
225+
result = self.mock_list_training_jobs.list_training_jobs(None, True, None, None)
225226
self.assertIn("test-name", result)
226227
self.assertIn("test-name1", result)
227228

@@ -236,7 +237,7 @@ def test_list_training_jobs_all_namespace_api_exception(
236237
status="Failed", reason="unexpected"
237238
)
238239
with self.assertRaises(RuntimeError):
239-
self.mock_list_training_jobs.list_training_jobs(None, True, None)
240+
self.mock_list_training_jobs.list_training_jobs(None, True, None, None)
240241

241242
@mock.patch("hyperpod_cli.clients.kubernetes_client.KubernetesClient.__new__")
242243
def test_list_training_jobs_all_namespace_no_jobs(
@@ -246,7 +247,7 @@ def test_list_training_jobs_all_namespace_no_jobs(
246247
mock_kubernetes_client.return_value = self.mock_k8s_client
247248
self.mock_k8s_client.list_namespaces.return_value = ["namespace"]
248249
self.mock_k8s_client.list_training_jobs.return_value = {"items": []}
249-
result = self.mock_list_training_jobs.list_training_jobs(None, True, None)
250+
result = self.mock_list_training_jobs.list_training_jobs(None, True, None, None)
250251
self.assertNotIn("test-name", result)
251252
self.assertNotIn("test-name1", result)
252253

@@ -258,7 +259,7 @@ def test_list_training_jobs_all_namespace_missing_metadata(
258259
mock_kubernetes_client.return_value = self.mock_k8s_client
259260
self.mock_k8s_client.list_namespaces.return_value = ["namespace"]
260261
self.mock_k8s_client.list_training_jobs.return_value = INVALID_OUTPUT
261-
result = self.mock_list_training_jobs.list_training_jobs(None, True, None)
262+
result = self.mock_list_training_jobs.list_training_jobs(None, True, None, None)
262263
self.assertNotIn("name", result)
263264

264265
@mock.patch("hyperpod_cli.clients.kubernetes_client.KubernetesClient.__new__")
@@ -269,5 +270,94 @@ def test_list_training_jobs_all_namespace_missing_status(
269270
mock_kubernetes_client.return_value = self.mock_k8s_client
270271
self.mock_k8s_client.list_namespaces.return_value = ["namespace"]
271272
self.mock_k8s_client.list_training_jobs.return_value = OUTPUT_WITHOUT_STATUS
272-
result = self.mock_list_training_jobs.list_training_jobs(None, True, None)
273+
result = self.mock_list_training_jobs.list_training_jobs(None, True, None, None)
273274
self.assertNotIn("State: null", result)
275+
276+
def test_generate_table_with_no_priority_header_and_values(self):
277+
list_training_jobs = ListTrainingJobs()
278+
output_jobs = {
279+
"jobs": [
280+
{
281+
"Name": "job1",
282+
"Namespace": "namespace1",
283+
"CreationTime": "2023-01-01T00:00:00Z",
284+
"State": "Running"
285+
}
286+
]
287+
}
288+
priority_header_required = False
289+
290+
result = list_training_jobs._generate_table(output_jobs, priority_header_required)
291+
292+
expected_headers = ["Name", "Namespace", "CreationTime", "State"]
293+
expected_jobs = [["job1", "namespace1", "2023-01-01T00:00:00Z", "Running"]]
294+
expected_result = tabulate(expected_jobs, headers=expected_headers, tablefmt="presto")
295+
296+
assert result == expected_result
297+
298+
def test_generate_table_with_priority_header_and_priority_values(self):
299+
list_training_jobs = ListTrainingJobs()
300+
output_jobs = {
301+
"jobs": [
302+
{
303+
"Name": "job1",
304+
"Namespace": "namespace1",
305+
"CreationTime": "2023-01-01T00:00:00Z",
306+
"State": "Running",
307+
"priority": "high"
308+
},
309+
{
310+
"Name": "job2",
311+
"Namespace": "namespace2",
312+
"CreationTime": "2023-01-02T00:00:00Z",
313+
"State": "Completed",
314+
"priority": "low"
315+
}
316+
]
317+
}
318+
priority_header_required = True
319+
320+
result = list_training_jobs._generate_table(output_jobs, priority_header_required)
321+
322+
expected_headers = ["Name", "Namespace", "CreationTime", "State", "Priority"]
323+
expected_jobs = [
324+
["job1", "namespace1", "2023-01-01T00:00:00Z", "Running", "high"],
325+
["job2", "namespace2", "2023-01-02T00:00:00Z", "Completed", "low"]
326+
]
327+
expected_result = tabulate(expected_jobs, headers=expected_headers, tablefmt="presto")
328+
329+
assert result == expected_result
330+
331+
def test_generate_table_with_priority_header_but_no_priority_value(self):
332+
list_training_jobs = ListTrainingJobs()
333+
output_jobs = {
334+
"jobs": [
335+
{
336+
"Name": "job1",
337+
"Namespace": "namespace1",
338+
"CreationTime": "2023-01-01T00:00:00Z",
339+
"State": "Running"
340+
}
341+
]
342+
}
343+
priority_header_required = True
344+
345+
result = list_training_jobs._generate_table(output_jobs, priority_header_required)
346+
347+
expected_headers = ["Name", "Namespace", "CreationTime", "State", "Priority"]
348+
expected_jobs = [["job1", "namespace1", "2023-01-01T00:00:00Z", "Running", "NA"]]
349+
expected_result = tabulate(expected_jobs, headers=expected_headers, tablefmt="presto")
350+
351+
assert result == expected_result
352+
353+
def test_generate_table_empty_jobs(self):
354+
list_training_jobs = ListTrainingJobs()
355+
output_jobs = {"jobs": []}
356+
priority_header_required = False
357+
358+
result = list_training_jobs._generate_table(output_jobs, priority_header_required)
359+
360+
expected_headers = ["Name", "Namespace", "CreationTime", "State"]
361+
expected_result = tabulate([], headers=expected_headers, tablefmt="presto")
362+
363+
assert result == expected_result

0 commit comments

Comments
 (0)