🔖 Auto-update Helm chart to v0.1.12 (#62)

gablyu-oci · gablyu-oci · commit fc3adb6effdb · 2025-12-05T23:47:35.000Z
diff --git a/grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json b/grafana_dashboards/oke-workloads-to-gpuscanner-mapping-dashboard.json
@@ -2567,7 +2567,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
+              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -2642,7 +2642,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
+              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "hide": false,
               "instant": false,
               "legendFormat": "__auto",
@@ -2717,7 +2717,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (amd_gpu_junction_temperature * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
+              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
               "interval": "",
               "legendFormat": "Avg GPU Temperature",
               "range": true,
@@ -2824,7 +2824,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (amd_gpu_gfx_activity * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
+              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_gfx_activity, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "interval": "",
               "legendFormat": "{{Hostname}}{{hostname}}",
               "range": true,
@@ -2929,7 +2929,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (amd_gpu_power_usage * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
+              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "interval": "",
               "legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
               "range": true,
diff --git a/grafana_dashboards/resourcepools-to-gpuhealth-mapping-dashboard.json b/grafana_dashboards/resourcepools-to-gpuhealth-mapping-dashboard.json
@@ -2552,7 +2552,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}",
+              "expr": "DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "hide": false,
               "instant": false,
               "interval": "",
@@ -2627,7 +2627,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"})",
+              "expr": "avg(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "hide": false,
               "instant": false,
               "legendFormat": "__auto",
@@ -2702,7 +2702,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR gpu_junction_temperature{hostname=~\"$hostname\"}) ",
+              "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_junction_temperature, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))) ",
               "interval": "",
               "legendFormat": "Avg GPU Temperature",
               "range": true,
@@ -2809,7 +2809,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR gpu_gfx_activity{hostname=~\"$hostname\"})",
+              "expr": "avg(DCGM_FI_DEV_GPU_UTIL{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_gfx_activity, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"}))))",
               "interval": "",
               "legendFormat": "{{Hostname}}{{hostname}}",
               "range": true,
@@ -2914,7 +2914,7 @@
                 "uid": "{{DATASOURCE_PROMETHEUS}}"
               },
               "editorMode": "code",
-              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR gpu_power_usage{hostname=~\"$hostname\"}",
+              "expr": "DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$hostname\"} OR (label_replace(amd_gpu_power_usage, \"node\", \"$1\", \"instance\", \"([^:]+)\") * on(node) group_left(hostname) (max by(node, hostname) (oci_lens_pod_node_info{hostname=~\"$hostname\"})))",
               "interval": "",
               "legendFormat": "GPU {{gpu}}{{gpu_id}} - {{Hostname}}{{hostname}}",
               "range": true,
diff --git a/oci-gpu-scanner-deploy.zip b/oci-gpu-scanner-deploy.zip
diff --git a/oci-scanner-plugin-helm/templates/amd-exporter-patch-job.yaml b/oci-scanner-plugin-helm/templates/amd-exporter-patch-job.yaml
@@ -0,0 +1,60 @@
+{{- if .Values.amdGpuExporter.enabled }}
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: patch-amd-exporter-{{ .Release.Revision }}  # Job name carries the Helm release revision
+  namespace: {{ .Values.namespace.name }}
+  annotations:
+    "helm.sh/hook": post-install,post-upgrade  # Helm hook: runs after each install and each upgrade
+    "helm.sh/hook-weight": "5"
+    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation  # delete on success, and before the next hook run recreates it
+  labels:
+    app: amd-exporter-patch
+    component: monitoring
+spec:
+  ttlSecondsAfterFinished: 60  # finished Job becomes eligible for cleanup after 60 seconds
+  template:
+    metadata:
+      labels:
+        app: amd-exporter-patch
+    spec:
+      serviceAccountName: metrics-push-sa  # NOTE(review): script needs get + patch on apps/daemonsets; confirm this account is bound to such a role
+      restartPolicy: Never  # do not restart the container in place on failure
+      containers:
+      - name: patch
+        image: docker.io/alpine/k8s:1.31.10  # must provide bash and kubectl, both used by the script below
+        command:
+        - /bin/bash
+        - -c
+        - |
+          set -e
+          echo "Waiting for AMD GPU exporter DaemonSet to be created..."
+          
+          # Wait for DaemonSet to exist (up to 60 seconds)
+          for i in {1..12}; do
+            if kubectl get daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter -n {{ .Values.namespace.name }} &>/dev/null; then
+              echo "DaemonSet found, applying hostNetwork patch..."
+              break
+            fi
+            echo "Waiting for DaemonSet... (attempt $i/12)"
+            sleep 5
+          done
+          
+          # Apply the patch
+          kubectl patch daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter \
+            -n {{ .Values.namespace.name }} \
+            -p '{"spec":{"template":{"spec":{"hostNetwork":true}}}}' || {
+              echo "Failed to patch DaemonSet. It may not exist yet."
+              exit 1
+            }
+          
+          echo "Successfully patched AMD GPU exporter DaemonSet with hostNetwork=true"
+          
+          # Verify the patch
+          if kubectl get daemonset oci-gpu-scanner-plugin-amdgpu-metrics-exporter -n {{ .Values.namespace.name }} -o jsonpath='{.spec.template.spec.hostNetwork}' | grep -q "true"; then
+            echo "✓ Verification successful: hostNetwork is enabled"
+          else
+            echo "⚠ Warning: hostNetwork may not be set correctly"
+            exit 1
+          fi
+{{- end }}
diff --git a/oci-scanner-plugin-helm/templates/rbac.yaml b/oci-scanner-plugin-helm/templates/rbac.yaml
@@ -67,7 +67,7 @@ rules:
   verbs: ["get", "list", "watch"]
 - apiGroups: ["apps"]
   resources: ["daemonsets"]
-  verbs: ["get", "list", "watch"]
+  verbs: ["get", "list", "watch", "patch"]
 {{- if .Values.metricsPushJob.enableServiceDiscovery }}
 - apiGroups: [""]
   resources: ["services", "endpoints"]
diff --git a/oci_lens_terraform/modules/app/main.tf b/oci_lens_terraform/modules/app/main.tf
@@ -26,7 +26,7 @@ resource "oci_identity_policy" "workload_identity_policy" {
 resource "helm_release" "app" {
   name      = "lens"
   namespace = kubernetes_namespace.ns.metadata[0].name
-  chart = "https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.11-20251117-2233.tgz"
+  chart = "https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.12-20251205-2232.tgz"
   wait            = true
   timeout         = 1800
   atomic          = false
diff --git a/release/CHANGELOG_120525.md b/release/CHANGELOG_120525.md
@@ -0,0 +1,33 @@
+# v0.1.12
+## Release Date
+Dec 5, 2025
+
+## Dependency Versions
+
+| Module       | Version  | Resource                                                                                                                                                                                                                                                              |
+| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Control Plane | v0.0.1      | iad.ocir.io/iduyx1qnmway/corrino-lens-backend:v0.0.1                                                                                                                                                             |
+| Portal        | v0.0.2      | iad.ocir.io/iduyx1qnmway/corrino-lens-portal:v0.0.2                                                                                                                                                             |
+| Plugin        | N/A      | iduyx1qnmway/lens-metric-collector/oci-dr-hpc-v2:cuda-latest<br>iduyx1qnmway/lens-metric-collector/oci-dr-hpc-v2:rocm-latest<br>iduyx1qnmway/lens-metric-collector/oci_lens_pod_node_info:latest<br>iduyx1qnmway/lens-metric-collector/oci_lens_metric_collector:latest |
+| Helm          | v0.1.12  | [lens-0.1.12-20251205-2232.tgz](https://oci-ai-incubations.github.io/corrino-lens-devops/lens-0.1.12-20251205-2232.tgz)                                                                                                 |
+| Quickstart    | v0.1.12  | [v0.1.12](https://github.com/oracle-quickstart/oci-gpu-scanner/releases/download/v0.1.12/oci-gpu-scanner-deploy.zip)                                                                                                                                                    |
+
+----
+## Changelog
+### Bugfix
+#### Quickstart
+- Fixed Prometheus and Grafana for AMD metrics ([corrino-lens-devops #16](https://github.com/oracle-quickstart/oci-gpu-scanner/pull/61), [@gablyu-oci](https://github.com/gablyu-oci))
+
+#### DRHPC
+- Resolved false negative on PCIE Width Missing Lanes Check ([@jolettacheungoracle](https://github.com/jolettacheungoracle))
+
+#### Backend
+- Convert region shorthand to regional codes ([@jolettacheungoracle](https://github.com/jolettacheungoracle))
+
+----
+### Others
+
+#### Quickstart
+- Updated default values in `Values.yaml` for oci-gpu-scanner-plugin to enable nodeProblemDetector, healthCheck and nodeExporter by default ([corrino-lens-devops #16](https://github.com/oracle-quickstart/oci-gpu-scanner/pull/61), [@gablyu-oci](https://github.com/gablyu-oci))
+
+