Skip to content

Commit fc3adb6

Browse files
committed
🔖 Auto-update Helm chart to v0.1.12 (#62)
1 parent 5fcbc97 commit fc3adb6

File tree

7 files changed

+105
-12
lines changed

7 files changed

+105
-12
lines changed

grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2567,7 +2567,7 @@
25672567
"uid": "{{DATASOURCE_PROMETHEUS}}"
25682568
},
25692569
"editorMode": "code",
2570-
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
2570+
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
25712571
"hide": false,
25722572
"instant": false,
25732573
"interval": "",
@@ -2642,7 +2642,7 @@
26422642
"uid": "{{DATASOURCE_PROMETHEUS}}"
26432643
},
26442644
"editorMode": "code",
2645-
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
2645+
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
26462646
"hide": false,
26472647
"instant": false,
26482648
"legendFormat": "__auto",
@@ -2717,7 +2717,7 @@
27172717
"uid": "{{DATASOURCE_PROMETHEUS}}"
27182718
},
27192719
"editorMode": "code",
2720-
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
2720+
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
27212721
"interval": "",
27222722
"legendFormat": "Avg GPU Temperature",
27232723
"range": true,
@@ -2824,7 +2824,7 @@
28242824
"uid": "{{DATASOURCE_PROMETHEUS}}"
28252825
},
28262826
"editorMode": "code",
2827-
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (amd_gpu_gfx_activity * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
2827+
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_gfx_activity, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
28282828
"interval": "",
28292829
"legendFormat": "{{Hostname}}{{hostname}}",
28302830
"range": true,
@@ -2929,7 +2929,7 @@
29292929
"uid": "{{DATASOURCE_PROMETHEUS}}"
29302930
},
29312931
"editorMode": "code",
2932-
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
2932+
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
29332933
"interval": "",
29342934
"legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
29352935
"range": true,

grafana_dashboards/resourcepools-to-gpuhealth-mapping-dashboard.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2552,7 +2552,7 @@
25522552
"uid": "{{DATASOURCE_PROMETHEUS}}"
25532553
},
25542554
"editorMode": "code",
2555-
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}",
2555+
"expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
25562556
"hide": false,
25572557
"instant": false,
25582558
"interval": "",
@@ -2627,7 +2627,7 @@
26272627
"uid": "{{DATASOURCE_PROMETHEUS}}"
26282628
},
26292629
"editorMode": "code",
2630-
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"})",
2630+
"expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
26312631
"hide": false,
26322632
"instant": false,
26332633
"legendFormat": "__auto",
@@ -2702,7 +2702,7 @@
27022702
"uid": "{{DATASOURCE_PROMETHEUS}}"
27032703
},
27042704
"editorMode": "code",
2705-
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}) ",
2705+
"expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
27062706
"interval": "",
27072707
"legendFormat": "Avg GPU Temperature",
27082708
"range": true,
@@ -2809,7 +2809,7 @@
28092809
"uid": "{{DATASOURCE_PROMETHEUS}}"
28102810
},
28112811
"editorMode": "code",
2812-
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR gpu_gfx_activity{hostname=~\"$hostname\"})",
2812+
"expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_gfx_activity, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
28132813
"interval": "",
28142814
"legendFormat": "{{Hostname}}{{hostname}}",
28152815
"range": true,
@@ -2914,7 +2914,7 @@
29142914
"uid": "{{DATASOURCE_PROMETHEUS}}"
29152915
},
29162916
"editorMode": "code",
2917-
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"}",
2917+
"expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
29182918
"interval": "",
29192919
"legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
29202920
"range": true,

oci-gpu-scanner-deploy.zip

0 Bytes
Binary file not shown.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
{{- if .Values.amdGpuExporter.enabled }}
2+
apiVersion: batch/v1
3+
kind: Job
4+
metadata:
5+
name: patch-amd-exporter-{{ .Release.Revision }}
6+
namespace: {{ .Values.namespace.name }}
7+
annotations:
8+
"helm.sh/hook": post-install,post-upgrade
9+
"helm.sh/hook-weight": "5"
10+
"helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
11+
labels:
12+
app: amd-exporter-patch
13+
component: monitoring
14+
spec:
15+
ttlSecondsAfterFinished: 60
16+
template:
17+
metadata:
18+
labels:
19+
app: amd-exporter-patch
20+
spec:
21+
serviceAccountName: metrics-push-sa
22+
restartPolicy: Never
23+
containers:
24+
- name: patch
25+
image: docker.io/alpine/k8s:1.31.10
26+
command:
27+
- /bin/bash
28+
- -c
29+
- |
30+
set -e
31+
echo "Waiting for AMD GPU exporter DaemonSet to be created..."
32+
33+
# Wait for DaemonSet to exist (up to 60 seconds)
34+
for i in {1..12}; do
35+
if kubectl get daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter -n {{ .Values.namespace.name }} &>/dev/null; then
36+
echo "DaemonSet found, applying hostNetwork patch..."
37+
break
38+
fi
39+
echo "Waiting for DaemonSet... (attempt $i/12)"
40+
sleep 5
41+
done
42+
43+
# Apply the patch
44+
kubectl patch daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter \
45+
-n {{ .Values.namespace.name }} \
46+
-p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' || {
47+
echo "Failed to patch DaemonSet. It may not exist yet."
48+
exit 1
49+
}
50+
51+
echo "Successfully patched AMD GPU exporter DaemonSet with hostNetwork=true"
52+
53+
# Verify the patch
54+
if kubectl get daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter -n {{ .Values.namespace.name }} -o jsonpath='{.spec.template.spec.hostNetwork}' | grep -q "true"; then
55+
echo "✓ Verification successful: hostNetwork is enabled"
56+
else
57+
echo "⚠ Warning: hostNetwork may not be set correctly"
58+
exit 1
59+
fi
60+
{{- end }}

oci-scanner-plugin-helm/templates/rbac.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ rules:
6767
verbs: ["get", "list", "watch"]
6868
- apiGroups: ["apps"]
6969
resources: ["daemonsets"]
70-
verbs: ["get", "list", "watch"]
70+
verbs: ["get", "list", "watch", "patch"]
7171
{{- if .Values.metricsPushJob.enableServiceDiscovery }}
7272
- apiGroups: [""]
7373
resources: ["services", "endpoints"]

oci_lens_terraform/modules/app/main.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ resource "oci_identity_policy" "workload_identity_policy" {
2626
resource "helm_release" "app" {
2727
name = "lens"
2828
namespace = kubernetes_namespace.ns.metadata[0].name
29-
chart = "https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.11-20251117-2233.tgz"
29+
chart = "https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.12-20251205-2232.tgz"
3030
wait = true
3131
timeout = 1800
3232
atomic = false

release/CHANGELOG_120525.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# v0.1.12
2+
## Release Date
3+
Dec 5, 2025
4+
5+
## Dependency Versions
6+
7+
| Module | Version | Resource |
8+
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
9+
| Control Plane | v0.0.1 | iad.ocir.io/iduyx1qnmway/corrino-lens-backend:v0.0.1 |
10+
| Portal | v0.0.2 | iad.ocir.io/iduyx1qnmway/corrino-lens-portal:v0.0.2 |
11+
| Plugin | N/A | iduyx1qnmway/lens-metric-collector/oci-dr-hpc-v2:cuda-latest<br>iduyx1qnmway/lens-metric-collector/oci-dr-hpc-v2:rocm-latest<br>iduyx1qnmway/lens-metric-collector/oci_lens_pod_node_info:latest<br>iduyx1qnmway/lens-metric-collector/oci_lens_metric_collector:latest |
12+
| Helm | v 0.1.12 | [lens-0.1.12-20251205-2232.tgz](https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.12-20251205-2232.tgz) |
13+
| Quickstart | v 0.1.12 | [v0.1.12](https://github.com/oracle-quickstart/oci-gpu-scanner/releases/download/v0.1.12/oci-gpu-scanner-deploy.zip) |
14+
15+
----
16+
## Changelog
17+
### Bugfix
18+
#### Quickstart
19+
- Fixed Prometheus and Grafana for AMD metrics ([oci-gpu-scanner #61](https://github.com/oracle-quickstart/oci-gpu-scanner/pull/61), [@gablyu-oci](https://github.com/gablyu-oci))
20+
21+
#### DRHPC
22+
- Resolved false negative on PCIE Width Missing Lanes Check ([@jolettacheungoracle](https://github.com/jolettacheungoracle))
23+
24+
#### Backend
25+
- Convert region shorthand to regional codes ([@jolettacheungoracle](https://github.com/jolettacheungoracle))
26+
27+
----
28+
### Others
29+
30+
#### Quickstart
31+
- Updated default values in `Values.yaml` for oci-gpu-scanner-plugin to enable nodeProblemDetector, healthCheck and nodeExporter by default ([corrino-lens-devops #16](https://github.com/oracle-quickstart/oci-gpu-scanner/pull/61), [@gablyu-oci](https://github.com/gablyu-oci))
32+
33+

0 commit comments

Comments
 (0)