Skip to content

Commit 87c7098

Browse files
committed
feat: Add region detection to install Health Monitoring Agent and use regionalized HMA URI
1 parent 631ddf9 commit 87c7098

File tree

7 files changed

+218
-19
lines changed

7 files changed

+218
-19
lines changed
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
apiVersion: v2
22
name: health-monitoring-agent
33
version: 0.1.0
4-
appVersion: 1.0
4+
appVersion: "1.0"
55
description: A Helm chart for setting up Hyperpod health-monitoring-agent related permissions
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
{{/*
2+
Expand the name of the chart.
3+
*/}}
4+
{{- define "health-monitoring-agent.name" -}}
{{- /* Base name: .Values.nameOverride when set, otherwise the chart name; capped at 63 characters with a trailing "-" removed. */ -}}
{{- $base := .Values.nameOverride | default .Chart.Name -}}
{{- $base | trunc 63 | trimSuffix "-" -}}
{{- end }}
7+
8+
{{/*
9+
Create a default fully qualified app name.
10+
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11+
If release name contains chart name it will be used as a full name.
12+
*/}}
13+
{{- define "health-monitoring-agent.fullname" -}}
{{- /*
Pick the untruncated name first, then apply the 63-character limit once:
  - .Values.fullnameOverride when set;
  - the release name alone when it already contains the chart (or override) name;
  - "<release>-<name>" otherwise.
*/ -}}
{{- $candidate := .Values.fullnameOverride -}}
{{- if not $candidate -}}
  {{- $chartName := .Values.nameOverride | default .Chart.Name -}}
  {{- if contains $chartName .Release.Name -}}
    {{- $candidate = .Release.Name -}}
  {{- else -}}
    {{- $candidate = printf "%s-%s" .Release.Name $chartName -}}
  {{- end -}}
{{- end -}}
{{- $candidate | trunc 63 | trimSuffix "-" -}}
{{- end }}
25+
26+
{{/*
27+
Create chart name and version as used by the chart label.
28+
*/}}
29+
{{- define "health-monitoring-agent.chart" -}}
{{- /* "<chart name>-<chart version>" with "+" replaced by "_", capped at 63 characters with a trailing "-" removed. */ -}}
{{- $chartLabel := printf "%s-%s" .Chart.Name .Chart.Version -}}
{{- $chartLabel | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end }}
32+
33+
{{/*
34+
Common labels
35+
*/}}
36+
{{- define "health-monitoring-agent.labels" -}}
{{- /* Emits the chart label, the selector labels, the app version (only when the chart declares one) and the managing service. */ -}}
helm.sh/chart: {{ include "health-monitoring-agent.chart" . }}
{{ include "health-monitoring-agent.selectorLabels" . }}
{{- with .Chart.AppVersion }}
app.kubernetes.io/version: {{ quote . }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
44+
45+
{{/*
46+
Selector labels
47+
*/}}
48+
{{- define "health-monitoring-agent.selectorLabels" -}}
{{- /* Name and instance labels: the chart's base name plus the Helm release name. */ -}}
{{- $appName := include "health-monitoring-agent.name" . -}}
app.kubernetes.io/name: {{ $appName }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
52+
53+
{{/*
54+
Generate the health monitoring agent image URI based on AWS region
55+
*/}}
56+
{{- define "health-monitoring-agent.imageUri" -}}
{{- /*
Resolve the container image URI for the health monitoring agent.

An explicit .Values.hmaimage always wins and is returned verbatim. Otherwise the
URI is assembled from the region-specific ECR account ID, the region and a
pinned image tag. The region is resolved in this order:
  1. .Values.region
  2. .Values.global.region
  3. Live cluster lookups:
       a. the API server hostname found in the kube-system/cluster-info ConfigMap
          (https://XXXXX.gr7.REGION.eks.amazonaws.com)
       b. the node label topology.kubernetes.io/region
       c. the legacy node label failure-domain.beta.kubernetes.io/region
  4. us-west-2

A region that has no entry in the account map is replaced by us-west-2, so the
account ID and the registry hostname always refer to the same region.
*/ -}}
{{- if .Values.hmaimage -}}
  {{- /* Full override: return it before doing any region detection or cluster lookups. */ -}}
  {{- .Values.hmaimage -}}
{{- else -}}
  {{- $defaultRegion := "us-west-2" -}}
  {{- $imageTag := "1.0.674.0_1.0.199.0" -}}
  {{- $region := "" -}}

  {{- if .Values.region -}}
    {{- /* 1. Explicit region setting (highest priority) */ -}}
    {{- $region = toString .Values.region -}}
  {{- else if and .Values.global .Values.global.region -}}
    {{- /* 2. Global region setting */ -}}
    {{- $region = toString .Values.global.region -}}
  {{- else -}}
    {{- /* 3. Try to detect the region from the live cluster */ -}}
    {{- $detectedRegion := "" -}}

    {{- /* 3a. API server URL recorded in the cluster-info ConfigMap */ -}}
    {{- $clusterInfo := lookup "v1" "ConfigMap" "kube-system" "cluster-info" -}}
    {{- if and $clusterInfo $clusterInfo.data -}}
      {{- /* A missing "kubeconfig" key becomes "", which fromYaml turns into an empty map. */ -}}
      {{- $kubeconfig := index $clusterInfo.data "kubeconfig" | default "" | fromYaml -}}
      {{- if $kubeconfig -}}
        {{- range $kubeconfig.clusters -}}
          {{- if .cluster -}}
            {{- $serverUrl := .cluster.server | default "" -}}
            {{- if contains ".eks.amazonaws.com" $serverUrl -}}
              {{- /* splitList returns a real list; Sprig's split returns a dict keyed _0, _1, ... that cannot be indexed by integer. */ -}}
              {{- $urlParts := splitList "." $serverUrl -}}
              {{- if gt (len $urlParts) 3 -}}
                {{- $detectedRegion = index $urlParts 2 -}}
              {{- end -}}
            {{- end -}}
          {{- end -}}
        {{- end -}}
      {{- end -}}
    {{- end -}}

    {{- /* 3b/3c. Region labels on the first node that carries one */ -}}
    {{- if not $detectedRegion -}}
      {{- $nodes := lookup "v1" "Node" "" "" -}}
      {{- if $nodes -}}
        {{- range $nodes.items -}}
          {{- if .metadata.labels -}}
            {{- if index .metadata.labels "topology.kubernetes.io/region" -}}
              {{- $detectedRegion = index .metadata.labels "topology.kubernetes.io/region" -}}
              {{- break -}}
            {{- end -}}
            {{- if index .metadata.labels "failure-domain.beta.kubernetes.io/region" -}}
              {{- $detectedRegion = index .metadata.labels "failure-domain.beta.kubernetes.io/region" -}}
              {{- break -}}
            {{- end -}}
          {{- end -}}
        {{- end -}}
      {{- end -}}
    {{- end -}}

    {{- /* 4. Use the detected region, or fall back to the default */ -}}
    {{- $region = $detectedRegion | default $defaultRegion -}}
  {{- end -}}

  {{- /* Region to ECR account ID mapping */ -}}
  {{- $regionAccountMap := dict
    "us-east-1" "767398015722"
    "us-west-2" "905418368575"
    "us-east-2" "851725546812"
    "us-west-1" "011528288828"
    "eu-central-1" "211125453373"
    "eu-north-1" "654654141839"
    "eu-west-1" "533267293120"
    "eu-west-2" "011528288831"
    "ap-northeast-1" "533267052152"
    "ap-south-1" "011528288864"
    "ap-southeast-1" "905418428165"
    "ap-southeast-2" "851725636348"
    "sa-east-1" "025066253954"
  -}}

  {{- /* Unsupported region: switch BOTH the account and the registry region to the default so the URI stays resolvable. */ -}}
  {{- if not (hasKey $regionAccountMap $region) -}}
    {{- $region = $defaultRegion -}}
  {{- end -}}
  {{- $accountId := index $regionAccountMap $region -}}

  {{- printf "%s.dkr.ecr.%s.amazonaws.com/hyperpod-health-monitoring-agent:%s" $accountId $region $imageTag -}}
{{- end -}}
{{- end }}

helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ spec:
116116
args:
117117
- --enable-k8s-exporter=false
118118
- --config.system-log-monitor=/config/system-message-monitor.json
119-
image: {{ .Values.hmaimage }}
119+
image: {{ include "health-monitoring-agent.imageUri" . }}
120120
resources:
121121
limits:
122122
cpu: 500m
Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,25 @@
11
namespace: "aws-hyperpod"
2-
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0"
2+
3+
# AWS region for the health monitoring agent ECR image
4+
# The chart automatically detects the region from Kubernetes cluster context.
5+
# Only specify this if you want to override the automatic detection.
6+
#
7+
# Automatic detection priority:
8+
# 1. This explicit region setting (highest priority)
9+
# 2. Global region setting (global.region)
10+
# 3. Kubernetes cluster context detection:
11+
# - EKS API server URL patterns
12+
# - Node topology labels (topology.kubernetes.io/region)
13+
# - AWS provider IDs in node specifications
14+
# - Legacy region labels (failure-domain.beta.kubernetes.io/region)
15+
# 4. Default fallback: us-west-2
16+
#
17+
# Supported regions: us-east-1, us-west-2, us-east-2, us-west-1, eu-central-1,
18+
# eu-north-1, eu-west-1, eu-west-2, ap-northeast-1, ap-south-1, ap-southeast-1,
19+
# ap-southeast-2, sa-east-1
20+
region: ""
21+
22+
# Override the health monitoring agent image URI
23+
# If specified, this will override the automatic region-based URI selection
24+
# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0"
25+
hmaimage: ""

helm_chart/HyperPodHelmChart/values.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,9 @@ aws-efa-k8s-device-plugin:
258258
mpi-operator:
259259
enabled: true
260260
health-monitoring-agent:
261-
enabled: true
261+
enabled: true
262+
# AWS region will be automatically detected or can be specified
263+
# region: "us-west-2"
262264
deep-health-check:
263265
enabled: true
264266
job-auto-restart:

helm_chart/readme.md

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,48 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
169169
170170
## 6. Notes
171171
- Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases
172-
- If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI.
173-
```
174-
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
175-
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
176-
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
177-
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
178-
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
179-
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
180-
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
181-
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
182-
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
183-
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
184-
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
185-
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
186-
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
172+
- The Health Monitoring Agent now automatically selects the correct container image URI based on your AWS region. The Helm chart intelligently detects the region from your Kubernetes cluster context.
173+
174+
- **Intelligent Region Detection**: The chart automatically detects your AWS region using multiple methods:
175+
1. **Explicit region setting** (highest priority): `--set health-monitoring-agent.region=us-east-1`
176+
2. **Global region setting**: `--set global.region=us-east-1`
177+
3. **Kubernetes cluster context detection**: Automatically extracts region from:
178+
- EKS API server URL patterns
179+
- Node topology labels (`topology.kubernetes.io/region`)
180+
- AWS provider IDs in node specifications (listed here, but not currently read by the chart's image URI helper)
181+
- Legacy region labels (`failure-domain.beta.kubernetes.io/region`)
182+
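4. **Default fallback**: us-west-2. This is also what you get when rendering with `helm template` or a client-side `--dry-run`, because Helm's `lookup` function returns no cluster data in those modes; set the region explicitly in that case.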
183+
184+
- **Zero Configuration Required**: For most EKS deployments, no manual region specification is needed:
185+
```bash
186+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system
187+
```
188+
189+
- **Manual Region Override**: If needed, you can still specify a region manually:
190+
```bash
191+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.region=us-east-1
192+
```
193+
194+
- **Supported Regions and their ECR URIs**:
195+
```
196+
us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
197+
us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
198+
us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
199+
us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
200+
eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
201+
eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
202+
eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
203+
eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
204+
ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
205+
ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
206+
ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
207+
ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
208+
sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
209+
```
210+
211+
- **Custom Image Override**: For advanced use cases, you can still override the image URI completely:
212+
```bash
213+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.hmaimage="<account-id>.dkr.ecr.<region>.amazonaws.com/hyperpod-health-monitoring-agent:<tag>"
187214
```
188215

189216
## 7. Troubleshooting

0 commit comments

Comments
 (0)