Skip to content

Commit 37f403c

Browse files
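Various updates: add a Let's Encrypt staging ClusterIssuer, bump the node-problem-detector and DCGM exporter images, add hostPID and pod-label allowlist options to the DCGM exporter chart, and replace hard-coded sslip.io handling with a configurable wildcard DNS domain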
1 parent eabf460 commit 37f403c

File tree

16 files changed

+130
-686
lines changed

16 files changed

+130
-686
lines changed

manifests/node-problem-detector/values.yaml

Lines changed: 0 additions & 654 deletions
This file was deleted.

terraform/files/cert-manager/cluster-issuer.yaml renamed to terraform/files/cert-manager/cluster-issuer-prod.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ spec:
1717
solvers:
1818
- http01:
1919
ingress:
20-
ingressClassName: nginx
20+
ingressClassName: nginx
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
---
2+
apiVersion: cert-manager.io/v1
3+
kind: ClusterIssuer
4+
metadata:
5+
name: le-clusterissuer
6+
spec:
7+
acme:
8+
# You must replace this email address with your own.
9+
# Let's Encrypt will use this to contact you about expiring
10+
# certificates, and issues related to your account.
11+
12+
server: https://acme-staging-v02.api.letsencrypt.org/directory
13+
privateKeySecretRef:
14+
# Secret resource that will be used to store the account's private key.
15+
name: le-clusterissuer-secret
16+
# Add a single challenge solver, HTTP01 using nginx
17+
solvers:
18+
- http01:
19+
ingress:
20+
ingressClassName: nginx

terraform/files/node-problem-detector/values.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,8 +520,8 @@ logDir:
520520
pod: ""
521521

522522
image:
523-
repository: iad.ocir.io/hpc_limited_availability/oke-npd
524-
tag: v0.8.21-1
523+
repository: iad.ocir.io/idxzjcdglx2s/oke-npd
524+
tag: v1.34.0-1
525525
# image.digest -- the image digest. If given it takes precedence over a given tag.
526526
digest: ""
527527
pullPolicy: IfNotPresent
@@ -591,6 +591,8 @@ affinity:
591591
- VM.GPU.A100.80G.1
592592
- BM.GPU.H200.8
593593
- BM.GPU.B200.8
594+
- BM.GPU.GB200.4
595+
- BM.GPU.GB200-v2.4
594596

595597
nodeSelector: {}
596598

terraform/files/nvidia-dcgm-exporter/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
apiVersion: v2
22
name: dcgm-exporter
33
description: A Helm chart for DCGM exporter
4-
version: "4.5.2"
4+
version: "4.6.0"
55
kubeVersion: ">= 1.19.0-0"
6-
appVersion: "4.5.2"
6+
appVersion: "4.6.0"
77
sources:
88
- https://github.com/nvidia/dcgm-exporter
99
home: https://github.com/nvidia/dcgm-exporter/

terraform/files/nvidia-dcgm-exporter/templates/daemonset.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ spec:
4646
runtimeClassName: {{ .Values.runtimeClassName }}
4747
{{- end }}
4848
priorityClassName: {{ .Values.priorityClassName | default "system-node-critical" }}
49+
hostPID: {{ .Values.hostPID | default false }}
4950
{{- if .Values.hostNetwork }}
5051
hostNetwork: {{ .Values.hostNetwork }}
5152
dnsPolicy: ClusterFirstWithHostNet
@@ -138,6 +139,10 @@ spec:
138139
- name: "DCGM_EXPORTER_KUBERNETES_ENABLE_POD_UID"
139140
value: "true"
140141
{{- end }}
142+
{{- if .Values.kubernetes.podLabelAllowlistRegex }}
143+
- name: "DCGM_EXPORTER_KUBERNETES_POD_LABEL_ALLOWLIST_REGEX"
144+
value: {{ .Values.kubernetes.podLabelAllowlistRegex | join "," | quote }}
145+
{{- end }}
141146
- name: "DCGM_EXPORTER_LISTEN"
142147
value: "{{ .Values.service.address }}"
143148
- name: NODE_NAME

terraform/files/nvidia-dcgm-exporter/values.yaml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ image:
1717
pullPolicy: IfNotPresent
1818
# Image tag defaults to AppVersion, but you can use the tag key
1919
# for the image tag, e.g:
20-
tag: 4.4.1-4.5.2-ubuntu22.04
20+
tag: 4.4.1-4.6.0-ubuntu22.04
2121

2222
# Change the following reference to "/etc/dcgm-exporter/default-counters.csv"
2323
# to stop profiling metrics from DCGM
@@ -49,6 +49,10 @@ fullnameOverride: ""
4949
# Overrides the deployment namespace
5050
namespaceOverride: ""
5151

52+
# hostPID allows the DCGM-Exporter container to see processes on the host node
53+
# Default: false
54+
hostPID: false
55+
5256
# Defines the runtime class that will be used by the pod
5357
runtimeClassName: ""
5458
# Defines serviceAccount names for components.
@@ -211,6 +215,16 @@ kubernetes:
211215
# This requires cluster-level read permissions to pods
212216
enablePodUID: false
213217

218+
# Pod label filtering configuration
219+
# Filter which pod labels are included in metrics using regex patterns
220+
# Empty list means all labels are included (default behavior)
221+
# Examples:
222+
# podLabelAllowlistRegex:
223+
# - "^app$" # Exact match for "app" label
224+
# - "^app\\.kubernetes\\.io/.*" # All labels starting with app.kubernetes.io/
225+
# - "^(tier|environment|version)$" # Match tier, environment, or version labels
226+
podLabelAllowlistRegex: []
227+
214228
# RBAC settings for Kubernetes integration
215229
rbac:
216230
# Automatically creates ClusterRole and ClusterRoleBinding for pod access when enablePodLabels or enablePodUID is true
@@ -321,4 +335,4 @@ livenessProbe:
321335
periodSeconds: 5
322336

323337
readinessProbe:
324-
initialDelaySeconds: 45
338+
initialDelaySeconds: 45

terraform/files/oke-ons-webhook/templates/configmap.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,13 @@ data:
124124
if response.status_code == 200:
125125
short_url = response.json().get("url")
126126
parsed_short_url = urlparse(short_url)
127-
if "sslip.io" in short_url:
127+
netloc = parsed_url.netloc.split(":")[0]
128+
ip_only = False
129+
match = re.match(r"^(\d{1,3}\.?){4}$", netloc)
130+
if match:
131+
ip_only = True
132+
if not ip_only:
128133
scheme = "https"
129-
netloc = parsed_url.netloc.split(":")[0]
130134
return parsed_short_url.geturl().replace(f'{parsed_short_url.scheme}://{parsed_url.netloc}/', f"{scheme}://{netloc}/")
131135
return short_url
132136
else:

terraform/oke-cluster.tf

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@ locals {
152152
module "oke" {
153153
source = "oracle-terraform-modules/oke/oci"
154154
version = "5.3.3"
155-
# source = "github.com/oracle-terraform-modules/terraform-oci-oke.git?ref=ca3e66f"
156155

157156
providers = { oci.home = oci.home }
158157

terraform/output.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ output "grafana_fetch_endpoint_command" {
6464
output "grafana_url" {
6565
value = var.install_node_problem_detector_kube_prometheus_stack ? (
6666
var.preferred_kubernetes_services == "public" ?
67-
format("https://grafana.%s.sslip.io", try(data.kubernetes_service.nginx_lb[0].status[0].load_balancer[0].ingress[0].ip, try(data.oci_load_balancer_load_balancers.lbs[0].load_balancers[0].ip_addresses[0], "N/A"))) :
67+
format("https://grafana.%s.%s", try(data.kubernetes_service.nginx_lb[0].status[0].load_balancer[0].ingress[0].ip, try(data.oci_load_balancer_load_balancers.lbs[0].load_balancers[0].ip_addresses[0], "N/A")), var.wildcard_dns_domain):
6868
format("http://%s", try(data.kubernetes_service.grafana_internal_ip[0].status[0].load_balancer[0].ingress[0].ip, try(data.oci_load_balancer_load_balancers.lbs[0].load_balancers[0].ip_addresses[0], try(data.oci_load_balancer_load_balancers.internal_lbs[0].load_balancers[0].ip_addresses[0], "N/A"))))
6969
) : "N/A"
7070
}

0 commit comments

Comments
 (0)