Skip to content

Commit e23fd1c

Browse files
committed
[GPUHEALTH-1522] feat: add DCGM ECC and NVLink RX metrics
Signed-off-by: Jingxiang Zhang <jingzhang@nvidia.com>
1 parent 47c3d48 commit e23fd1c

File tree

6 files changed (+111 additions, -0 deletions)

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/mem/component.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -418,6 +418,26 @@ func (c *component) Check() components.CheckResult {
418418
"uuid": deviceData.UUID,
419419
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
420420
}).Set(float64(fieldValue.Int64()))
421+
case dcgm.DCGM_FI_DEV_ECC_SBE_VOL_DEV:
422+
metricDCGMFIDevECCSBEVolDev.With(prometheus.Labels{
423+
"uuid": deviceData.UUID,
424+
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
425+
}).Set(float64(fieldValue.Int64()))
426+
case dcgm.DCGM_FI_DEV_ECC_DBE_VOL_DEV:
427+
metricDCGMFIDevECCDBEVolDev.With(prometheus.Labels{
428+
"uuid": deviceData.UUID,
429+
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
430+
}).Set(float64(fieldValue.Int64()))
431+
case dcgm.DCGM_FI_DEV_ECC_SBE_AGG_DEV:
432+
metricDCGMFIDevECCSBEAggDev.With(prometheus.Labels{
433+
"uuid": deviceData.UUID,
434+
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
435+
}).Set(float64(fieldValue.Int64()))
436+
case dcgm.DCGM_FI_DEV_ECC_DBE_AGG_DEV:
437+
metricDCGMFIDevECCDBEAggDev.With(prometheus.Labels{
438+
"uuid": deviceData.UUID,
439+
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
440+
}).Set(float64(fieldValue.Int64()))
421441
case dcgm.DCGM_FI_DEV_RETIRED_PENDING:
422442
metricDCGMFIDevRetiredPending.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
423443
case dcgm.DCGM_FI_DEV_RETIRED_DBE:

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/mem/metrics.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ var memFields = []dcgm.Short{
3737
dcgm.DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, // Total double bit volatile ECC errors
3838
dcgm.DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, // Total single bit aggregate (persistent) ECC errors
3939
dcgm.DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, // Total double bit aggregate (persistent) ECC errors
40+
dcgm.DCGM_FI_DEV_ECC_SBE_VOL_DEV, // Single bit volatile ECC errors detected in device memory
41+
dcgm.DCGM_FI_DEV_ECC_DBE_VOL_DEV, // Double bit volatile ECC errors detected in device memory
42+
dcgm.DCGM_FI_DEV_ECC_SBE_AGG_DEV, // Aggregate single bit ECC errors detected in device memory
43+
dcgm.DCGM_FI_DEV_ECC_DBE_AGG_DEV, // Aggregate double bit ECC errors detected in device memory
4044
dcgm.DCGM_FI_DEV_RETIRED_PENDING, // Whether pages are pending retirement
4145
dcgm.DCGM_FI_DEV_RETIRED_DBE, // Retired DBE pages
4246
dcgm.DCGM_FI_DEV_RETIRED_SBE, // Retired SBE pages
@@ -172,6 +176,38 @@ var (
172176
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
173177
).MustCurryWith(componentLabel)
174178

179+
// metricDCGMFIDevECCSBEVolDev is the gauge for the DCGM_FI_DEV_ECC_SBE_VOL_DEV
// field (single bit volatile ECC errors in device memory). Series are keyed by
// the "uuid" and "gpu" labels; the component label is pre-bound with
// MustCurryWith so callers only supply the per-GPU labels.
metricDCGMFIDevECCSBEVolDev = prometheus.NewGaugeVec(
180+
prometheus.GaugeOpts{
181+
Name: "dcgm_fi_dev_ecc_sbe_vol_dev",
182+
Help: "Device memory single bit volatile ECC errors.",
183+
},
184+
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
185+
).MustCurryWith(componentLabel)
186+
187+
// metricDCGMFIDevECCDBEVolDev is the gauge for the DCGM_FI_DEV_ECC_DBE_VOL_DEV
// field (double bit volatile ECC errors in device memory). Series are keyed by
// the "uuid" and "gpu" labels; the component label is pre-bound with
// MustCurryWith so callers only supply the per-GPU labels.
metricDCGMFIDevECCDBEVolDev = prometheus.NewGaugeVec(
188+
prometheus.GaugeOpts{
189+
Name: "dcgm_fi_dev_ecc_dbe_vol_dev",
190+
Help: "Device memory double bit volatile ECC errors.",
191+
},
192+
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
193+
).MustCurryWith(componentLabel)
194+
195+
// metricDCGMFIDevECCSBEAggDev is the gauge for the DCGM_FI_DEV_ECC_SBE_AGG_DEV
// field (aggregate single bit ECC errors in device memory). It is declared as
// a gauge even though its Help text describes the value as monotonically
// increasing. Series are keyed by the "uuid" and "gpu" labels; the component
// label is pre-bound with MustCurryWith.
metricDCGMFIDevECCSBEAggDev = prometheus.NewGaugeVec(
196+
prometheus.GaugeOpts{
197+
Name: "dcgm_fi_dev_ecc_sbe_agg_dev",
198+
Help: "Device memory single bit aggregate (persistent) ECC errors. Note: monotonically increasing.",
199+
},
200+
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
201+
).MustCurryWith(componentLabel)
202+
203+
// metricDCGMFIDevECCDBEAggDev is the gauge for the DCGM_FI_DEV_ECC_DBE_AGG_DEV
// field (aggregate double bit ECC errors in device memory). It is declared as
// a gauge even though its Help text describes the value as monotonically
// increasing. Series are keyed by the "uuid" and "gpu" labels; the component
// label is pre-bound with MustCurryWith.
metricDCGMFIDevECCDBEAggDev = prometheus.NewGaugeVec(
204+
prometheus.GaugeOpts{
205+
Name: "dcgm_fi_dev_ecc_dbe_agg_dev",
206+
Help: "Device memory double bit aggregate (persistent) ECC errors. Note: monotonically increasing.",
207+
},
208+
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
209+
).MustCurryWith(componentLabel)
210+
175211
metricDCGMFIDevRetiredPending = prometheus.NewGaugeVec(
176212
prometheus.GaugeOpts{Name: "dcgm_fi_dev_retired_pending", Help: "Number of pages pending retirement"},
177213
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
@@ -227,6 +263,10 @@ func init() {
227263
metricDCGMFIDevECCDBEVolTotal,
228264
metricDCGMFIDevECCSBEAggTotal,
229265
metricDCGMFIDevECCDBAggTotal,
266+
metricDCGMFIDevECCSBEVolDev,
267+
metricDCGMFIDevECCDBEVolDev,
268+
metricDCGMFIDevECCSBEAggDev,
269+
metricDCGMFIDevECCDBEAggDev,
230270
metricDCGMFIDevRetiredPending,
231271
metricDCGMFIDevRetiredDBE,
232272
metricDCGMFIDevRetiredSBE,
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
16+
package mem
17+
18+
import (
19+
"testing"
20+
21+
dcgm "github.com/NVIDIA/go-dcgm/pkg/dcgm"
22+
)
23+
24+
// TestMemFieldsIncludeExpectedECCDeviceCounters verifies that the package-level
// memFields list contains the four device-memory ECC counter field IDs
// (single/double bit, volatile/aggregate). It checks membership only; it does
// not exercise metric registration or the Check code path.
func TestMemFieldsIncludeExpectedECCDeviceCounters(t *testing.T) {
25+
// Index memFields as a set so each required field is a single map lookup.
fieldSet := make(map[dcgm.Short]struct{}, len(memFields))
26+
for _, field := range memFields {
27+
fieldSet[field] = struct{}{}
28+
}
29+
30+
// Field IDs that must be present in memFields.
required := []dcgm.Short{
31+
dcgm.DCGM_FI_DEV_ECC_DBE_AGG_DEV,
32+
dcgm.DCGM_FI_DEV_ECC_DBE_VOL_DEV,
33+
dcgm.DCGM_FI_DEV_ECC_SBE_AGG_DEV,
34+
dcgm.DCGM_FI_DEV_ECC_SBE_VOL_DEV,
35+
}
36+
37+
for _, field := range required {
38+
if _, ok := fieldSet[field]; !ok {
39+
// Errorf (not Fatalf) so every missing field is reported in one run.
t.Errorf("missing expected mem field: %d", field)
40+
}
41+
}
42+
}

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/nvlink/component.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,8 @@ func (c *component) Check() components.CheckResult {
375375
metricDCGMFIDevC2CLinkErrorReplay.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
376376
case dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS:
377377
metricDCGMFIDevNvlinkCountRxGeneralErrors.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
378+
case dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS:
379+
metricDCGMFIDevNvlinkCountRxErrors.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
378380
case dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS:
379381
metricDCGMFIDevNvlinkCountRxMalformedPacketErrors.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
380382
case dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS:

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/nvlink/metrics.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ var nvlinkFields = []dcgm.Short{
3434
dcgm.DCGM_FI_DEV_FABRIC_MANAGER_STATUS,
3535
dcgm.DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY,
3636
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS,
37+
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS,
3738
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS,
3839
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS,
3940
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS,
@@ -122,6 +123,10 @@ var (
122123
prometheus.GaugeOpts{Name: "dcgm_fi_dev_nvlink_count_rx_general_errors", Help: "Total number of packets Rx with header mismatch"},
123124
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
124125
).MustCurryWith(componentLabel)
126+
// metricDCGMFIDevNvlinkCountRxErrors is the gauge for the
// DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS field. Series are keyed by the "uuid" and
// "gpu" labels; the component label is pre-bound with MustCurryWith.
metricDCGMFIDevNvlinkCountRxErrors = prometheus.NewGaugeVec(
127+
prometheus.GaugeOpts{Name: "dcgm_fi_dev_nvlink_count_rx_errors", Help: "Total number of packets with errors Rx on a link"},
128+
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
129+
).MustCurryWith(componentLabel)
125130
metricDCGMFIDevNvlinkCountRxMalformedPacketErrors = prometheus.NewGaugeVec(
126131
prometheus.GaugeOpts{Name: "dcgm_fi_dev_nvlink_count_rx_malformed_packet_errors", Help: "Number of packets Rx on a link where packets are malformed"},
127132
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
@@ -171,6 +176,7 @@ func init() {
171176
metricDCGMFIDevFabricManagerStatus,
172177
metricDCGMFIDevC2CLinkErrorReplay,
173178
metricDCGMFIDevNvlinkCountRxGeneralErrors,
179+
metricDCGMFIDevNvlinkCountRxErrors,
174180
metricDCGMFIDevNvlinkCountRxMalformedPacketErrors,
175181
metricDCGMFIDevNvlinkCountRxRemoteErrors,
176182
metricDCGMFIDevNvlinkCountRxSymbolErrors,

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/nvlink/metrics_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ func TestNVLinkFieldsIncludeExpectedCounters(t *testing.T) {
3737
dcgm.DCGM_FI_DEV_FABRIC_MANAGER_STATUS,
3838
dcgm.DCGM_FI_DEV_C2C_LINK_ERROR_REPLAY,
3939
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_GENERAL_ERRORS,
40+
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_ERRORS,
4041
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_MALFORMED_PACKET_ERRORS,
4142
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_REMOTE_ERRORS,
4243
dcgm.DCGM_FI_DEV_NVLINK_COUNT_RX_SYMBOL_ERRORS,

0 commit comments

Comments
 (0)