Skip to content

Commit 6bac725

Browse files
Add 25.3.0 (#47)
1 parent d0725fe commit 6bac725

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

57 files changed

+4282
-6095
lines changed

README.md

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,21 +27,19 @@ You can use the instructions [here](https://docs.oracle.com/en-us/iaas/Content/C
2727

2828
**Images for NVIDIA shapes**
2929

30-
- [GPU driver 535.183.06 & CUDA 12.2](https://objectstorage.ca-toronto-1.oraclecloud.com/p/KOcEZeDpEAASLSKzumODnVr42mFwM_p9n1_Nra2FsV_F6BcpAkoH66HZxN4cCtIb/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.09.18-0)
30+
- [GPU driver 560 & CUDA 12.6](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-GPU-560-CUDA-12.6-2025-03-05.01)
3131

32-
- [GPU driver 550.90.12 & CUDA 12.4](https://objectstorage.ca-toronto-1.oraclecloud.com/p/EDngSWYfn3HjrN0xbfBSVCctRVKVvNf3NOW7DdInKMtgiZwiUqy7PsA_xifmI1oq/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.09.18-0)
33-
34-
- [GPU driver 560.35.03 & CUDA 12.6](https://objectstorage.ca-toronto-1.oraclecloud.com/p/a_KKMCajcBpt9EfqgmnZbtUInpc6gdC5s2g1wz7b0KUCLW28DSvTKwMeOSgW5O0R/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-560-CUDA-12.6-2024.09.18-0)
32+
- [GPU driver 570 & CUDA 12.8](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-GPU-570-CUDA-12.8-2025-03-06.01)
3533

3634
**Image for AMD shapes**
3735

38-
- [ROCm 6.2](https://objectstorage.us-ashburn-1.oraclecloud.com/p/tpswnRAUmrJ49uLAGk_ku6B13hyGzf_Gv1vrggtDWhOywSM5YGzoMPiO88gc3Cv-/n/imagegen/b/GPU-imaging/o/Ubuntu-22-OFED-5.9-0.5.6.0.127-ROCM-6.2-90-2024.08.12-0.oci)
36+
- [ROCm 6.3](https://objectstorage.ca-montreal-1.oraclecloud.com/p/ts6fjAuj7hY4io5x_jfX3fyC70HRCG8-9gOFqAjuF0KE0s-6tgDZkbRRZIbMZmoN/n/hpc_limited_availability/b/images/o/Canonical-Ubuntu-22.04-2024.10.04-0-OCA-OFED-24.10-1.1.4.0-ROCM-632-2025-03-05.01)
3937

4038

4139
### Deploy the cluster using the Oracle Cloud Resource Manager template
4240
You can easily deploy the cluster using the **Deploy to Oracle Cloud** button below.
4341

44-
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.2.0/oke-rdma-quickstart-v25.2.0.zip)
42+
[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v25.3.0/oke-rdma-quickstart-v25.3.0.zip)
4543

4644
For the image ID, use the ID of the image that you imported in the previous step.
4745

docs/running-gpu-rdma-healtchecks-with-node-problem-detector.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ Please note depending on the shape and its configuration, some health checks wil
2020
You can deploy the health checks using the Node Problem Detector Helm chart. The health check scripts are created as a `ConfigMap`, so make sure you use the `values.yaml` linked below.
2121

2222
```
23-
helm install gpu-rdma-node-problem-detector oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector --version 2.3.15 \
23+
helm install gpu-rdma-node-problem-detector oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector --version 2.3.18 \
2424
-f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/node-problem-detector/values.yaml
2525
```
2626

manifests/node-problem-detector/values.yaml

Lines changed: 884 additions & 372 deletions
Large diffs are not rendered by default.

terraform/full/dcgm-exporter.tf renamed to terraform/dcgm-exporter.tf

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,15 @@
11
# Copyright (c) 2024 Oracle Corporation and/or its affiliates.
22
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl
33

4-
locals {
5-
dcgm_exporter_version = one(helm_release.dcgm_exporter[*].version)
6-
}
7-
84
resource "helm_release" "dcgm_exporter" {
9-
count = var.install_dcgm_exporter && var.install_monitoring ? 1 : 0
5+
count = var.install_dcgm_exporter && var.install_node_problem_detector_kube_prometheus_stack ? 1 : 0
106
depends_on = [helm_release.prometheus]
117
namespace = var.monitoring_namespace
128
name = "dcgm-exporter"
139
chart = "dcgm-exporter"
1410
repository = "https://nvidia.github.io/dcgm-exporter/helm-charts"
1511
version = var.dcgm_exporter_chart_version
16-
values = ["${file("./files/kube-dcgm-exporter-values.yaml")}"]
12+
values = ["${file("./files/dcgm-exporter/values.yaml")}"]
1713
create_namespace = false
1814
recreate_pods = true
1915
force_update = true

terraform/files/.DS_Store

6 KB
Binary file not shown.
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
podAnnotations:
2+
prometheus.io/scrape: "true"
3+
prometheus.io/port: "9400"
4+
5+
serviceMonitor:
6+
apiVersion: "monitoring.coreos.com/v1"
7+
enabled: true
8+
interval: 15s
9+
honorLabels: false
10+
additionalLabels:
11+
release: kube-prometheus-stack
12+
relabelings:
13+
- sourceLabels: [__meta_kubernetes_pod_node_name]
14+
separator: ;
15+
regex: ^(.*)$
16+
targetLabel: nodename
17+
replacement: $1
18+
action: replace
19+
- sourceLabels: [__meta_kubernetes_node_provider_id]
20+
targetLabel: instance_id
21+
action: replace
22+
- sourceLabels: [__meta_kubernetes_node_label_oci_oraclecloud_com_host_serial_number]
23+
targetLabel: host_serial_number
24+
action: replace
25+
- sourceLabels: [__meta_kubernetes_node_label_node_kubernetes_io_instance_type]
26+
targetLabel: instance_shape
27+
action: replace
28+
29+
nodeSelector:
30+
nvidia.com/gpu: "true"
31+
32+
tolerations:
33+
- operator: Exists
34+
35+
customMetrics: |
36+
# Format
37+
# If line starts with a '#' it is considered a comment
38+
# DCGM FIELD, Prometheus metric type, help message
39+
40+
# Clocks
41+
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
42+
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
43+
# DCGM_EXP_CLOCK_EVENTS_COUNT, gauge, Count of clock events within the user-specified time window (see clock-events-count-window-size param).
44+
45+
# Temperature
46+
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
47+
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
48+
49+
# Power
50+
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
51+
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
52+
53+
# PCIE
54+
DCGM_FI_PROF_PCIE_TX_BYTES, counter, Total number of bytes transmitted through PCIe TX via NVML.
55+
DCGM_FI_PROF_PCIE_RX_BYTES, counter, Total number of bytes received through PCIe RX via NVML.
56+
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
57+
58+
# Utilization (the sample period varies depending on the product)
59+
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
60+
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
61+
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
62+
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
63+
64+
# Errors and violations
65+
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
66+
DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
67+
DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
68+
DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
69+
DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
70+
DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
71+
DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
72+
DCGM_EXP_XID_ERRORS_COUNT, gauge, Count of XID Errors within user-specified time window (see xid-count-window-size param).
73+
# Memory usage
74+
DCGM_FI_DEV_FB_FREE, gauge, Frame buffer memory free (in MB).
75+
DCGM_FI_DEV_FB_USED, gauge, Frame buffer memory used (in MB).
76+
77+
# ECC
78+
DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
79+
DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
80+
DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
81+
DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
82+
83+
# Retired pages
84+
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
85+
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
86+
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
87+
88+
# NVLink
89+
DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
90+
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
91+
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
92+
DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
93+
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes
94+
95+
# VGPU License status
96+
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
97+
98+
# Remapped rows
99+
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
100+
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
101+
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
102+
103+
# Static configuration information. These appear as labels on the other metrics
104+
DCGM_FI_DRIVER_VERSION, label, Driver Version
105+
DCGM_FI_NVML_VERSION, label, NVML Version
106+
DCGM_FI_DEV_BRAND, label, Device Brand
107+
DCGM_FI_DEV_SERIAL, label, Device Serial Number
108+
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
109+
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
110+
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
111+
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
112+
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
113+
DCGM_FI_DEV_ROW_REMAP_PENDING, gauge, Whether remapping of rows is pending
114+
DCGM_FI_DEV_COUNT, gauge, Number of Devices on the node
115+
DCGM_FI_DEV_MEM_MAX_OP_TEMP, gauge, Maximum operating temperature for the memory of this GPU
116+
DCGM_FI_DEV_GPU_MAX_OP_TEMP, gauge, Maximum operating temperature for this GPU
117+
DCGM_FI_DEV_SLOWDOWN_TEMP, gauge, Slowdown temperature for the device
118+
DCGM_FI_DEV_SHUTDOWN_TEMP, gauge, Shutdown temperature for the device
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
apiVersion: 1
2+
groups:
3+
- orgId: 1
4+
name: Node Problem Detector
5+
folder: OKE
6+
interval: 1m
7+
rules:
8+
- uid: oke_npd_cpu_profile
9+
title: CPU Profile
10+
condition: C
11+
data:
12+
- refId: A
13+
relativeTimeRange:
14+
from: 300
15+
to: 0
16+
datasourceUid: prometheus
17+
model:
18+
editorMode: code
19+
expr: problem_gauge{reason="CpuProfileHasIssues",type="CpuProfile"}
20+
instant: true
21+
intervalMs: 300000
22+
legendFormat: __auto
23+
maxDataPoints: 43200
24+
range: false
25+
refId: A
26+
- refId: C
27+
datasourceUid: __expr__
28+
model:
29+
conditions:
30+
- evaluator:
31+
params:
32+
- 0
33+
type: gt
34+
operator:
35+
type: and
36+
query:
37+
params:
38+
- C
39+
reducer:
40+
params: []
41+
type: last
42+
type: query
43+
datasource:
44+
type: __expr__
45+
uid: __expr__
46+
expression: A
47+
intervalMs: 1000
48+
maxDataPoints: 43200
49+
refId: C
50+
type: threshold
51+
noDataState: NoData
52+
execErrState: Error
53+
for: 5m
54+
annotations: {}
55+
labels: {}
56+
isPaused: false
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
apiVersion: 1
2+
groups:
3+
- orgId: 1
4+
name: Node Problem Detector
5+
folder: OKE
6+
interval: 1m
7+
rules:
8+
- uid: oke_npd_gpu_bad_pages
9+
title: GPU Bad Pages
10+
condition: C
11+
data:
12+
- refId: A
13+
relativeTimeRange:
14+
from: 300
15+
to: 0
16+
datasourceUid: prometheus
17+
model:
18+
editorMode: code
19+
expr: problem_gauge{reason="GpuBadPagesHasIssues",type="GpuBadPages"}
20+
instant: true
21+
intervalMs: 300000
22+
legendFormat: __auto
23+
maxDataPoints: 43200
24+
range: false
25+
refId: A
26+
- refId: C
27+
datasourceUid: __expr__
28+
model:
29+
conditions:
30+
- evaluator:
31+
params:
32+
- 0
33+
type: gt
34+
operator:
35+
type: and
36+
query:
37+
params:
38+
- C
39+
reducer:
40+
params: []
41+
type: last
42+
type: query
43+
datasource:
44+
type: __expr__
45+
uid: __expr__
46+
expression: A
47+
intervalMs: 1000
48+
maxDataPoints: 43200
49+
refId: C
50+
type: threshold
51+
noDataState: NoData
52+
execErrState: Error
53+
for: 5m
54+
annotations: {}
55+
labels: {}
56+
isPaused: false
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
apiVersion: 1
2+
groups:
3+
- orgId: 1
4+
name: Node Problem Detector
5+
folder: OKE
6+
interval: 1m
7+
rules:
8+
- uid: oke_npd_gpu_bus
9+
title: GPU Bus
10+
condition: C
11+
data:
12+
- refId: A
13+
relativeTimeRange:
14+
from: 300
15+
to: 0
16+
datasourceUid: prometheus
17+
model:
18+
editorMode: code
19+
expr: problem_gauge{reason="GpuBusHasIssues",type="GpuBus"}
20+
instant: true
21+
intervalMs: 300000
22+
legendFormat: __auto
23+
maxDataPoints: 43200
24+
range: false
25+
refId: A
26+
- refId: C
27+
datasourceUid: __expr__
28+
model:
29+
conditions:
30+
- evaluator:
31+
params:
32+
- 0
33+
type: gt
34+
operator:
35+
type: and
36+
query:
37+
params:
38+
- C
39+
reducer:
40+
params: []
41+
type: last
42+
type: query
43+
datasource:
44+
type: __expr__
45+
uid: __expr__
46+
expression: A
47+
intervalMs: 1000
48+
maxDataPoints: 43200
49+
refId: C
50+
type: threshold
51+
noDataState: NoData
52+
execErrState: Error
53+
for: 5m
54+
annotations: {}
55+
labels: {}
56+
isPaused: false

0 commit comments

Comments
 (0)