Skip to content

Commit bcf6908

Browse files
committed
Readme and Prometheus Stack Fix
Signed-off-by: Anurag Guda <[email protected]>
1 parent fc1100f commit bcf6908

File tree

3 files changed

+45
-26
lines changed

3 files changed

+45
-26
lines changed

playbooks/cns_values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ lws_version: "0.4.0"
3535
# GPU Operator Values
3636
enable_gpu_operator: yes
3737
confidential_computing: no
38-
gpu_driver_version: "550.127.05"
38+
gpu_driver_version: "570.86.15"
3939
use_open_kernel_module: no
4040
enable_mig: no
4141
mig_profile: all-disabled

playbooks/files/kube-prometheus-stack.values

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,19 @@ prometheus:
44
service:
55
type: NodePort
66
prometheusSpec:
7-
serviceMonitorSelectorNilUsesHelmValues: false
7+
serviceMonitorSelectorNilUsesHelmValues: false
8+
additionalScrapeConfigs:
9+
- job_name: gpu-metrics
10+
scrape_interval: 1s
11+
metrics_path: /metrics
12+
scheme: http
13+
kubernetes_sd_configs:
14+
- role: endpoints
15+
namespaces:
16+
names:
17+
- nvidia-gpu-operator
18+
- default
19+
relabel_configs:
20+
- source_labels: [__meta_kubernetes_pod_node_name]
21+
action: replace
22+
target_label: kubernetes_node

playbooks/readme.md

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -82,11 +82,11 @@ Available versions are:
8282

8383
- 14.0
8484
- 13.2
85-
- 13.2
86-
- 13.2
85+
- 13.1
86+
- 13.0
8787
- 12.3
8888
- 12.2
89-
- 13.2
89+
- 12.1
9090
- 12.0
9191

9292
Edit the `cns_version.yaml` and update the version you want to install
@@ -104,39 +104,40 @@ cns_version: 13.2
104104
105105
## MicroK8s cluster
106106
microk8s: no
107-
## Kubernetes Install with Kubeadm
108-
install_k8s: yes
107+
## Kubernetes Install with Kubeadm
108+
install_k8s: yes
109109
110110
## Components Versions
111111
# Container Runtime options are containerd, cri-o, cri-dockerd
112112
container_runtime: "containerd"
113-
containerd_version: "1.7.20"
114-
runc_version: "1.1.13"
113+
containerd_version: "1.7.23"
114+
runc_version: "1.1.14"
115115
cni_plugins_version: "1.5.1"
116116
containerd_max_concurrent_downloads: "5"
117-
nvidia_container_toolkit_version: "1.16.1"
118-
crio_version: "1.30.2"
117+
nvidia_container_toolkit_version: "1.17.4"
118+
crio_version: "1.30.6"
119119
cri_dockerd_version: "0.3.15"
120-
k8s_version: "1.30.2"
121-
calico_version: "3.27.4"
122-
flannel_version: "0.25.5"
123-
helm_version: "3.15.3"
124-
gpu_operator_version: "24.6.1"
125-
network_operator_version: "24.4.1"
120+
k8s_version: "1.30.6"
121+
calico_version: "3.28.2"
122+
flannel_version: "0.25.6"
123+
helm_version: "3.16.2"
124+
gpu_operator_version: "24.9.2"
125+
network_operator_version: "24.10.1"
126126
nim_operator_version: "1.0.0"
127-
local_path_provisioner: "0.0.26"
127+
local_path_provisioner: "0.0.30"
128128
nfs_provisioner: "4.0.18"
129-
metallb_version: "0.14.5"
130-
kserve_version: "0.13"
131-
prometheus_stack: "25.27.0"
129+
metallb_version: "0.14.8"
130+
kserve_version: "0.14"
131+
prometheus_stack: "67.5.0"
132132
prometheus_adapter: "4.11.0"
133-
elastic_stack: "8.14.1"
133+
grafana_operator: "v5.15.1"
134+
elastic_stack: "8.15.3"
134135
lws_version: "0.4.0"
135136
136137
# GPU Operator Values
137138
enable_gpu_operator: yes
138139
confidential_computing: no
139-
gpu_driver_version: "550.90.07"
140+
gpu_driver_version: "570.86.15"
140141
use_open_kernel_module: no
141142
enable_mig: no
142143
mig_profile: all-disabled
@@ -185,9 +186,12 @@ k8s_gpg_key: "https://pkgs.k8s.io/core:/stable:/v1.30/rpm/repodata/repomd.xml.ke
185186
k8s_apt_ring: "/etc/apt/keyrings/kubernetes-apt-keyring.gpg"
186187
k8s_registry: "registry.k8s.io"
187188
188-
# Install NVIDIA NIM Operator
189+
# Install NVIDIA NIM Operator
189190
enable_nim_operator: no
190191
192+
# LeaderWorkerSet https://github.com/kubernetes-sigs/lws/tree/main
193+
lws: no
194+
191195
# Local Path Provisioner and NFS Provisioner as Storage option
192196
storage: no
193197
@@ -205,7 +209,7 @@ loadbalancer_ip: ""
205209
## Cloud Native Stack Validation
206210
cns_validation: no
207211
208-
# BMC Details for Confidential Computing
212+
# BMC Details for Confidential Computing
209213
bmc_ip:
210214
bmc_username:
211215
bmc_password:
@@ -218,7 +222,7 @@ aws_gpu_instance_type: g4dn.2xlarge
218222
219223
## Google Cloud GKE Values
220224
#https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects
221-
gke_project_id:
225+
gke_project_id:
222226
#https://cloud.google.com/compute/docs/regions-zones#available
223227
gke_region: us-west1
224228
gke_node_zones: ["us-west1-b"]

0 commit comments

Comments
 (0)