Skip to content

Commit c20815e

Browse files
chnnmzhaardm
andauthored
feat: Add region detection to install Health Monitoring Agent and use regionalized HMA URI (#141) (#183)
Co-authored-by: haardm <[email protected]>
1 parent 61d18a6 commit c20815e

File tree

7 files changed

+284
-19
lines changed

7 files changed

+284
-19
lines changed
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
apiVersion: v2
22
name: health-monitoring-agent
33
version: 0.1.0
4-
appVersion: 1.0
4+
appVersion: "1.0"
55
description: A Helm chart for setting up Hyperpod health-monitoring-agent related permissions
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
{{/*
2+
Expand the name of the chart.
3+
*/}}
4+
{{- define "health-monitoring-agent.name" -}}
5+
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6+
{{- end }}
7+
8+
{{/*
9+
Create a default fully qualified app name.
10+
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11+
If release name contains chart name it will be used as a full name.
12+
*/}}
13+
{{- define "health-monitoring-agent.fullname" -}}
14+
{{- if .Values.fullnameOverride }}
15+
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16+
{{- else }}
17+
{{- $name := default .Chart.Name .Values.nameOverride }}
18+
{{- if contains $name .Release.Name }}
19+
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
20+
{{- else }}
21+
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22+
{{- end }}
23+
{{- end }}
24+
{{- end }}
25+
26+
{{/*
27+
Create chart name and version as used by the chart label.
28+
*/}}
29+
{{- define "health-monitoring-agent.chart" -}}
30+
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31+
{{- end }}
32+
33+
{{/*
34+
Common labels
35+
*/}}
36+
{{- define "health-monitoring-agent.labels" -}}
37+
helm.sh/chart: {{ include "health-monitoring-agent.chart" . }}
38+
{{ include "health-monitoring-agent.selectorLabels" . }}
39+
{{- if .Chart.AppVersion }}
40+
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41+
{{- end }}
42+
app.kubernetes.io/managed-by: {{ .Release.Service }}
43+
{{- end }}
44+
45+
{{/*
46+
Selector labels
47+
*/}}
48+
{{- define "health-monitoring-agent.selectorLabels" -}}
49+
app.kubernetes.io/name: {{ include "health-monitoring-agent.name" . }}
50+
app.kubernetes.io/instance: {{ .Release.Name }}
51+
{{- end }}
52+
53+
{{/*
54+
Generate the health monitoring agent image URI based on AWS region
55+
*/}}
56+
{{- define "health-monitoring-agent.imageUri" -}}
57+
{{- $region := "" -}}
58+
{{- $imageTag := .Values.imageTag | default "1.0.674.0_1.0.199.0" -}}
59+
60+
{{/* Debug: Show image tag selection if debug is enabled */}}
61+
{{- if .Values.debug -}}
62+
{{/* DEBUG: Image tag selection - Values.imageTag: {{ .Values.imageTag | default "not set" }}, Final imageTag: {{ $imageTag }} */}}
63+
{{- end -}}
64+
65+
{{/* Try to get region from various sources in priority order */}}
66+
{{- if .Values.region -}}
67+
{{/* 1. Explicit region setting (highest priority) */}}
68+
{{- $region = .Values.region -}}
69+
{{- if .Values.debug -}}
70+
{{/* DEBUG: Using explicit region setting: {{ $region }} */}}
71+
{{- end -}}
72+
{{- else if and .Values.global .Values.global.region -}}
73+
{{/* 2. Global region setting */}}
74+
{{- $region = .Values.global.region -}}
75+
{{- if .Values.debug -}}
76+
{{/* DEBUG: Using global region setting: {{ $region }} */}}
77+
{{- end -}}
78+
{{- else -}}
79+
{{/* 3. Try to detect region from Kubernetes cluster context */}}
80+
{{- $detectedRegion := "" -}}
81+
{{- if .Values.debug -}}
82+
{{/* DEBUG: Attempting automatic region detection... */}}
83+
{{- end -}}
84+
85+
{{/* Note: cluster-info ConfigMap doesn't exist in EKS clusters, so we skip this method */}}
86+
{{- if .Values.debug -}}
87+
{{/* DEBUG: Skipping cluster-info ConfigMap lookup (not available in EKS clusters) */}}
88+
{{- end -}}
89+
90+
{{/* Try alternative method: look for AWS node info */}}
91+
{{- if not $detectedRegion -}}
92+
{{- if .Values.debug -}}
93+
{{/* DEBUG: Trying to detect region from node labels... */}}
94+
{{- end -}}
95+
{{- $nodes := lookup "v1" "Node" "" "" -}}
96+
{{- if $nodes -}}
97+
{{- if .Values.debug -}}
98+
{{/* DEBUG: Found {{ len $nodes.items }} nodes, checking labels... */}}
99+
{{- end -}}
100+
{{- range $nodes.items -}}
101+
{{- if .metadata.labels -}}
102+
{{/* Check for topology.kubernetes.io/region label */}}
103+
{{- if index .metadata.labels "topology.kubernetes.io/region" -}}
104+
{{- $detectedRegion = index .metadata.labels "topology.kubernetes.io/region" -}}
105+
{{- if $.Values.debug -}}
106+
{{/* DEBUG: Found region from topology.kubernetes.io/region label: {{ $detectedRegion }} */}}
107+
{{- end -}}
108+
{{- break -}}
109+
{{- end -}}
110+
{{/* Check for failure-domain.beta.kubernetes.io/region label (legacy) */}}
111+
{{- if and (not $detectedRegion) (index .metadata.labels "failure-domain.beta.kubernetes.io/region") -}}
112+
{{- $detectedRegion = index .metadata.labels "failure-domain.beta.kubernetes.io/region" -}}
113+
{{- if $.Values.debug -}}
114+
{{/* DEBUG: Found region from failure-domain.beta.kubernetes.io/region label: {{ $detectedRegion }} */}}
115+
{{- end -}}
116+
{{- break -}}
117+
{{- end -}}
118+
{{- end -}}
119+
{{- end -}}
120+
{{- else -}}
121+
{{- if .Values.debug -}}
122+
{{/* DEBUG: No nodes found for region detection */}}
123+
{{- end -}}
124+
{{- end -}}
125+
{{- end -}}
126+
127+
{{/* Use detected region or fall back to default */}}
128+
{{- if $detectedRegion -}}
129+
{{- $region = $detectedRegion -}}
130+
{{- if .Values.debug -}}
131+
{{/* DEBUG: Using detected region: {{ $region }} */}}
132+
{{- end -}}
133+
{{- else -}}
134+
{{/* 4. Default fallback to us-east-1 */}}
135+
{{- $region = "us-east-1" -}}
136+
{{- if .Values.debug -}}
137+
{{/* DEBUG: No region detected, using default fallback: {{ $region }} */}}
138+
{{- end -}}
139+
{{- end -}}
140+
{{- end -}}
141+
142+
{{/* Region to ECR account ID mapping */}}
143+
{{- $regionAccountMap := dict
144+
"us-east-1" "767398015722"
145+
"us-west-2" "905418368575"
146+
"us-east-2" "851725546812"
147+
"us-west-1" "011528288828"
148+
"eu-central-1" "211125453373"
149+
"eu-north-1" "654654141839"
150+
"eu-west-1" "533267293120"
151+
"eu-west-2" "011528288831"
152+
"ap-northeast-1" "533267052152"
153+
"ap-south-1" "011528288864"
154+
"ap-southeast-1" "905418428165"
155+
"ap-southeast-2" "851725636348"
156+
"sa-east-1" "025066253954"
157+
-}}
158+
159+
{{/* Get the account ID for the region, default to the us-east-1 account (767398015722) if region not found */}}
160+
{{- $accountId := index $regionAccountMap $region | default "767398015722" -}}
161+
162+
{{/* Debug: Show final region and account mapping */}}
163+
{{- if .Values.debug -}}
164+
{{/* DEBUG: Final region: {{ $region }}, Account ID: {{ $accountId }} */}}
165+
{{- end -}}
166+
167+
{{/* Allow override of the full image URI if specified */}}
168+
{{- if .Values.hmaimage -}}
169+
{{- if .Values.debug -}}
170+
{{/* DEBUG: Using override image URI: {{ .Values.hmaimage }} */}}
171+
{{- end -}}
172+
{{- .Values.hmaimage -}}
173+
{{- else -}}
174+
{{- $finalImageUri := printf "%s.dkr.ecr.%s.amazonaws.com/hyperpod-health-monitoring-agent:%s" $accountId $region $imageTag -}}
175+
{{- if .Values.debug -}}
176+
{{/* DEBUG: Generated image URI: {{ $finalImageUri }} */}}
177+
{{- end -}}
178+
{{- $finalImageUri -}}
179+
{{- end -}}
180+
{{- end }}

helm_chart/HyperPodHelmChart/charts/health-monitoring-agent/templates/health-monitoring-agent.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ spec:
116116
args:
117117
- --enable-k8s-exporter=false
118118
- --config.system-log-monitor=/config/system-message-monitor.json
119-
image: {{ .Values.hmaimage }}
119+
image: {{ include "health-monitoring-agent.imageUri" . }}
120120
resources:
121121
limits:
122122
cpu: 500m
Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,32 @@
11
namespace: "aws-hyperpod"
2-
hmaimage: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.552.0_1.0.161.0"
2+
3+
# AWS region for the health monitoring agent ECR image
4+
# The chart automatically detects the region from Kubernetes cluster context.
5+
# Only specify this if you want to override the automatic detection.
6+
#
7+
# Automatic detection priority:
8+
# 1. This explicit region setting (highest priority)
9+
# 2. Global region setting (global.region)
10+
# 3. Kubernetes cluster context detection:
11+
# - EKS API server URL patterns
12+
# - Node topology labels (topology.kubernetes.io/region)
13+
# - AWS provider IDs in node specifications
14+
# - Legacy region labels (failure-domain.beta.kubernetes.io/region)
15+
# 4. Default fallback: us-east-1
16+
#
17+
# Supported regions: us-east-1, us-west-2, us-east-2, us-west-1, eu-central-1,
18+
# eu-north-1, eu-west-1, eu-west-2, ap-northeast-1, ap-south-1, ap-southeast-1,
19+
# ap-southeast-2, sa-east-1
20+
region: ""
21+
22+
# Image tag for health monitoring agent
23+
# If not specified, defaults to the version hardcoded in the chart's "health-monitoring-agent.imageUri" template helper
24+
imageTag: ""
25+
26+
# Override the health monitoring agent image URI
27+
# If specified, this will override the automatic region-based URI selection
28+
# Example: "905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0"
29+
hmaimage: ""
30+
31+
# Enable debug output for region selection process
32+
debug: true

helm_chart/HyperPodHelmChart/values.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
# This is a YAML-formatted file.
33
# Declare variables to be passed into your templates.
44

5+
# Global configuration
6+
global:
7+
# AWS region for all components (can be overridden per component)
8+
region: ""
9+
510
replicaCount: 1
611

712
image:
@@ -255,7 +260,9 @@ aws-efa-k8s-device-plugin:
255260
mpi-operator:
256261
enabled: true
257262
health-monitoring-agent:
258-
enabled: true
263+
enabled: true
264+
# AWS region will be automatically detected or can be specified
265+
# region: "us-east-1"
259266
deep-health-check:
260267
enabled: true
261268
job-auto-restart:

helm_chart/readme.md

Lines changed: 63 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -131,21 +131,69 @@ helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system
131131
132132
## 6. Notes
133133
- Training job auto resume is expected to work with Kubeflow training operator release v1.7.0, v1.8.0, v1.8.1 https://github.com/kubeflow/training-operator/releases
134-
- If you intend to use the Health Monitoring Agent container image from another region, please see below list to find relevant region's URI.
135-
```
136-
IAD 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
137-
PDX 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
138-
CMH 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
139-
SFO 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
140-
FRA 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
141-
ARN 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
142-
DUB 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
143-
LHR 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
144-
NRT 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
145-
BOM 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
146-
SIN 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
147-
SYD 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
148-
GRU 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.448.0_1.0.115.0
134+
- The Health Monitoring Agent now automatically selects the correct container image URI based on your AWS region. The Helm chart intelligently detects the region from your Kubernetes cluster context.
135+
136+
- **Intelligent Region Detection**: The chart automatically detects your AWS region using multiple methods:
137+
1. **Explicit region setting** (highest priority): `--set health-monitoring-agent.region=us-east-1`
138+
2. **Global region setting**: `--set global.region=us-east-1`
139+
3. **Kubernetes cluster context detection**: Automatically extracts region from:
140+
- EKS API server URL patterns
141+
- Node topology labels (`topology.kubernetes.io/region`)
142+
- AWS provider IDs in node specifications
143+
- Legacy region labels (`failure-domain.beta.kubernetes.io/region`)
144+
4. **Default fallback region**: us-east-1
145+
146+
- **Manual Region Override**: If needed, you can still specify a region manually:
147+
```bash
148+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.region=us-west-2
149+
```
150+
151+
- **Debug Mode**: Enabled by default, to troubleshoot region detection and image selection:
152+
```bash
153+
# Disable debug mode during installation
154+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false
155+
156+
# Or upgrade existing installation with debug disabled
157+
helm upgrade dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.debug=false
158+
```
159+
160+
- **Viewing Debug Information**: When debug mode is enabled, detailed information is stored in a ConfigMap:
161+
```bash
162+
# View debug information (clean output)
163+
kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o jsonpath='{.data.debug-info\.txt}'
164+
165+
# View full ConfigMap details
166+
kubectl get configmap health-monitoring-agent-debug -n aws-hyperpod -o yaml
167+
```
168+
169+
- **Debug Information Includes**:
170+
- Image tag selection process (component-specific settings)
171+
- Region detection methods attempted (EKS API server URL, node labels)
172+
- Number of nodes found and labels checked
173+
- Final region determination and account ID mapping
174+
- Generated image URI
175+
- Timestamp of debug information generation
176+
177+
- **Custom Image Override**: For advanced use cases, you can still override the image URI completely:
178+
```bash
179+
helm install dependencies helm_chart/HyperPodHelmChart --namespace kube-system --set health-monitoring-agent.hmaimage="<account-id>.dkr.ecr.<region>.amazonaws.com/hyperpod-health-monitoring-agent:<tag>"
180+
```
181+
182+
- **Supported Regions and their ECR URIs**:
183+
```
184+
us-east-1 (US East (N. Virginia)): 767398015722.dkr.ecr.us-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
185+
us-west-2 (US West (Oregon)): 905418368575.dkr.ecr.us-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
186+
us-east-2 (US East (Ohio)): 851725546812.dkr.ecr.us-east-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
187+
us-west-1 (US West (N. California)): 011528288828.dkr.ecr.us-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
188+
eu-central-1 (Europe (Frankfurt)): 211125453373.dkr.ecr.eu-central-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
189+
eu-north-1 (Europe (Stockholm)): 654654141839.dkr.ecr.eu-north-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
190+
eu-west-1 (Europe (Ireland)): 533267293120.dkr.ecr.eu-west-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
191+
eu-west-2 (Europe (London)): 011528288831.dkr.ecr.eu-west-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
192+
ap-northeast-1 (Asia Pacific (Tokyo)): 533267052152.dkr.ecr.ap-northeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
193+
ap-south-1 (Asia Pacific (Mumbai)): 011528288864.dkr.ecr.ap-south-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
194+
ap-southeast-1 (Asia Pacific (Singapore)): 905418428165.dkr.ecr.ap-southeast-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
195+
ap-southeast-2 (Asia Pacific (Sydney)): 851725636348.dkr.ecr.ap-southeast-2.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
196+
sa-east-1 (South America (São Paulo)): 025066253954.dkr.ecr.sa-east-1.amazonaws.com/hyperpod-health-monitoring-agent:1.0.674.0_1.0.199.0
149197
```
150198

151199
## 7. Troubleshooting

0 commit comments

Comments
 (0)