Skip to content

Commit 65d6aaf

Browse files
authored
Merge branch 'aws:main' into fix-nvidia-device-plugin
2 parents 832bc4c + 61807d3 commit 65d6aaf

File tree

13 files changed

+182
-17
lines changed

13 files changed

+182
-17
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[submodule "src/hyperpod_cli/sagemaker_hyperpod_recipes"]
22
path = src/hyperpod_cli/sagemaker_hyperpod_recipes
33
url = https://github.com/aws/sagemaker-hyperpod-recipes.git
4-
branch = release-1.3.2
4+
branch = release-1.3.3

helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,25 @@ spec:
9191
- ml.inf2.48xlarge
9292
- ml.trn1.32xlarge
9393
- ml.trn1n.32xlarge
94+
- ml.g6.xlarge
95+
- ml.g6.2xlarge
96+
- ml.g6.4xlarge
97+
- ml.g6.8xlarge
98+
- ml.g6.16xlarge
99+
- ml.g6.12xlarge
100+
- ml.g6.24xlarge
101+
- ml.g6.48xlarge
102+
- ml.gr6.4xlarge
103+
- ml.gr6.8xlarge
104+
- ml.g6e.xlarge
105+
- ml.g6e.2xlarge
106+
- ml.g6e.4xlarge
107+
- ml.g6e.8xlarge
108+
- ml.g6e.16xlarge
109+
- ml.g6e.12xlarge
110+
- ml.g6e.24xlarge
111+
- ml.g6e.48xlarge
112+
- ml.trn2.48xlarge
94113
containers:
95114
- name: health-monitoring-agent
96115
args:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
namespace: "aws-hyperpod"
2-
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0"
2+
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0"

helm_chart/HyperPodHelmChart/values.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,11 +181,36 @@ aws-efa-k8s-device-plugin:
181181
- ml.g6e.24xlarge
182182
- ml.g6e.48xlarge
183183
- ml.gr6.8xlarge
184+
- ml.i3en.large
185+
- ml.i3en.xlarge
186+
- ml.i3en.2xlarge
187+
- ml.i3en.3xlarge
188+
- ml.i3en.6xlarge
189+
- ml.i3en.12xlarge
190+
- ml.i3en.24xlarge
191+
- ml.m7i.large
192+
- ml.m7i.xlarge
193+
- ml.m7i.2xlarge
194+
- ml.m7i.4xlarge
195+
- ml.m7i.8xlarge
196+
- ml.m7i.12xlarge
197+
- ml.m7i.16xlarge
198+
- ml.m7i.24xlarge
199+
- ml.m7i.48xlarge
184200
- ml.p4d.24xlarge
185201
- ml.p4de.24xlarge
186202
- ml.p5.48xlarge
187203
- ml.p5e.48xlarge
188204
- ml.p5en.48xlarge
205+
- ml.r7i.large
206+
- ml.r7i.xlarge
207+
- ml.r7i.2xlarge
208+
- ml.r7i.4xlarge
209+
- ml.r7i.8xlarge
210+
- ml.r7i.12xlarge
211+
- ml.r7i.16xlarge
212+
- ml.r7i.24xlarge
213+
- ml.r7i.48xlarge
189214
- ml.trn1.32xlarge
190215
- ml.trn1n.32xlarge
191216
- ml.trn2.48xlarge

helm_chart/readme.md

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,19 +108,19 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
108108
- Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases
109109
- If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI.
110110
```
111-
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
112-
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
113-
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
114-
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
115-
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
116-
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
117-
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
118-
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
119-
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
120-
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
121-
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
122-
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
123-
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.408.0_1.0.105.0
111+
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
112+
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
113+
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
114+
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
115+
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
116+
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
117+
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
118+
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
119+
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
120+
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
121+
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
122+
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
123+
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
124124
```
125125
126126
## 7. Troubleshooting

src/hyperpod_cli/clients/kubernetes_client.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,17 @@ def list_training_jobs(
302302
plural=PYTORCH_CUSTOM_OBJECT_PLURAL,
303303
label_selector=label_selector,
304304
)
305+
306+
def check_if_namespace_exists(self, namespace: str) -> bool:
    """Check whether a Kubernetes namespace exists.

    Reads the namespace through ``client.CoreV1Api().read_namespace`` and
    interprets the outcome of that call.

    Args:
        namespace: Name of the namespace to look up.

    Returns:
        True if the read succeeds, False if the API responds with status 404.

    Raises:
        client.rest.ApiException: For any API error whose status is not 404.
            The error is printed before being re-raised.
    """
    try:
        client.CoreV1Api().read_namespace(name=namespace)
    except client.rest.ApiException as e:
        # A 404 is the expected "no such namespace" answer, not a failure.
        if e.status == 404:
            return False
        print(f"Exception when calling read_namespace: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise
    return True
305316

306317
def exec_command_on_pod(
307318
self,

src/hyperpod_cli/commands/job.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1142,4 +1142,11 @@ def get_user_name():
11421142
user_name = 'Unknown'
11431143

11441144
# label value does not allow slash
1145-
return user_name.replace('/', '-')
1145+
user_name = user_name.replace('/', '-')
1146+
# 63 characters is the maximum length of a Kubernetes label value
1147+
if len(user_name) > 63:
1148+
# Append a '-trimmed' suffix to indicate the username was truncated
1149+
trimmed_user_name = user_name[:55] + '-trimmed'
1150+
logger.warning(f"The username is longer than the maximum length (63) of Kubernetes label, trimming to {trimmed_user_name}")
1151+
return trimmed_user_name
1152+
return user_name

src/hyperpod_cli/constants/hyperpod_instance_types.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ class HyperpodInstanceType(Enum):
4040
ML_C5N_4XLARGE = "ml.c5n.4xlarge"
4141
ML_C5N_9XLARGE = "ml.c5n.9xlarge"
4242
ML_C5N_18XLARGE = "ml.c5n.18xlarge"
43+
ML_I3EN_LARGE = "ml.i3en.large"
44+
ML_I3EN_XLARGE = "ml.i3en.xlarge"
45+
ML_I3EN_2XLARGE = "ml.i3en.2xlarge"
46+
ML_I3EN_3XLARGE = "ml.i3en.3xlarge"
47+
ML_I3EN_6XLARGE = "ml.i3en.6xlarge"
48+
ML_I3EN_12XLARGE = "ml.i3en.12xlarge"
49+
ML_I3EN_24XLARGE = "ml.i3en.24xlarge"
4350
ML_M5_LARGE = "ml.m5.large"
4451
ML_M5_XLARGE = "ml.m5.xlarge"
4552
ML_M5_2XLARGE = "ml.m5.2xlarge"
@@ -48,6 +55,15 @@ class HyperpodInstanceType(Enum):
4855
ML_M5_12XLARGE = "ml.m5.12xlarge"
4956
ML_M5_16XLARGE = "ml.m5.16xlarge"
5057
ML_M5_24XLARGE = "ml.m5.24xlarge"
58+
ML_M7I_LARGE = "ml.m7i.large"
59+
ML_M7I_XLARGE = "ml.m7i.xlarge"
60+
ML_M7I_2XLARGE = "ml.m7i.2xlarge"
61+
ML_M7I_4XLARGE = "ml.m7i.4xlarge"
62+
ML_M7I_8XLARGE = "ml.m7i.8xlarge"
63+
ML_M7I_12XLARGE = "ml.m7i.12xlarge"
64+
ML_M7I_16XLARGE = "ml.m7i.16xlarge"
65+
ML_M7I_24XLARGE = "ml.m7i.24xlarge"
66+
ML_M7I_48XLARGE = "ml.m7i.48xlarge"
5167
ML_T3_MEDIUM = "ml.t3.medium"
5268
ML_T3_LARGE = "ml.t3.large"
5369
ML_T3_XLARGE = "ml.t3.xlarge"
@@ -72,4 +88,13 @@ class HyperpodInstanceType(Enum):
7288
ML_G6E_48XLARGE = "ml.g6e.48xlarge"
7389
ML_P5E_48XLARGE = "ml.p5e.48xlarge"
7490
ML_P5EN_48XLARGE = "ml.p5en.48xlarge"
91+
ML_R7I_LARGE = "ml.r7i.large"
92+
ML_R7I_XLARGE = "ml.r7i.xlarge"
93+
ML_R7I_2XLARGE = "ml.r7i.2xlarge"
94+
ML_R7I_4XLARGE = "ml.r7i.4xlarge"
95+
ML_R7I_8XLARGE = "ml.r7i.8xlarge"
96+
ML_R7I_12XLARGE = "ml.r7i.12xlarge"
97+
ML_R7I_16XLARGE = "ml.r7i.16xlarge"
98+
ML_R7I_24XLARGE = "ml.r7i.24xlarge"
99+
ML_R7I_48XLARGE = "ml.r7i.48xlarge"
75100
ML_TRN2_48XLARGE = "ml.trn2.48xlarge"

src/hyperpod_cli/service/list_training_jobs.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from hyperpod_cli.clients.kubernetes_client import (
1919
KubernetesClient,
2020
)
21+
from hyperpod_cli.utils import setup_logger
2122
from kubernetes.client.rest import ApiException
2223
from kubernetes.client import (
2324
V1ResourceAttributes
@@ -49,6 +50,8 @@ def list_training_jobs(
4950
k8s_client = KubernetesClient()
5051

5152
jobs: List = []
53+
logger = setup_logger(__name__)
54+
logger.debug(namespace)
5255
try:
5356
if all_namespaces:
5457
namespaces: List[str] = k8s_client.list_namespaces()
@@ -70,6 +73,9 @@ def list_training_jobs(
7073
namespace = DiscoverNamespaces().discover_accessible_namespace(
7174
resource_attributes_template
7275
)
76+
else:
77+
if not k8s_client.check_if_namespace_exists(namespace):
78+
raise ValueError(f"Namespace {namespace} does not exist!")
7379

7480
namespace_jobs = k8s_client.list_training_jobs(
7581
namespace=namespace,

0 commit comments

Comments
 (0)