Skip to content

Commit f2b4b8e

Browse files
committed
fix: remove retired page metrics
Signed-off-by: Jingxiang Zhang <jingzhang@nvidia.com>
(cherry picked from commit fbdb7fa)
1 parent 2767124 commit f2b4b8e

File tree

3 files changed

+0
-30
lines changed

3 files changed

+0
-30
lines changed

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/mem/component.go

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -438,12 +438,6 @@ func (c *component) Check() components.CheckResult {
438438
"uuid": deviceData.UUID,
439439
"gpu": fmt.Sprintf("%d", deviceData.DeviceID),
440440
}).Set(float64(fieldValue.Int64()))
441-
case dcgm.DCGM_FI_DEV_RETIRED_PENDING:
442-
metricDCGMFIDevRetiredPending.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
443-
case dcgm.DCGM_FI_DEV_RETIRED_DBE:
444-
metricDCGMFIDevRetiredDBE.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
445-
case dcgm.DCGM_FI_DEV_RETIRED_SBE:
446-
metricDCGMFIDevRetiredSBE.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
447441
case dcgm.DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH:
448442
metricDCGMFIDevBanksRemapRowsAvailHigh.With(prometheus.Labels{"uuid": deviceData.UUID, "gpu": fmt.Sprintf("%d", deviceData.DeviceID)}).Set(float64(fieldValue.Int64()))
449443
case dcgm.DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW:

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/mem/component_test.go

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -150,9 +150,6 @@ func TestCheck(t *testing.T) {
150150
"dcgm_fi_dev_uncorrectable_remapped_rows": 0,
151151
"dcgm_fi_dev_correctable_remapped_rows": 0,
152152
"dcgm_fi_dev_row_remap_failure": 0,
153-
"dcgm_fi_dev_retired_pending": 0,
154-
"dcgm_fi_dev_retired_dbe": 0,
155-
"dcgm_fi_dev_retired_sbe": 0,
156153
"dcgm_fi_dev_banks_remap_rows_avail_high": 0,
157154
"dcgm_fi_dev_banks_remap_rows_avail_low": 0,
158155
"dcgm_fi_dev_banks_remap_rows_avail_max": 0,

third_party/fleet-intelligence-sdk/components/accelerator/nvidia/dcgm/mem/metrics.go

Lines changed: 0 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -41,9 +41,6 @@ var memFields = []dcgm.Short{
4141
dcgm.DCGM_FI_DEV_ECC_DBE_VOL_DEV, // Double bit volatile ECC errors detected in device memory
4242
dcgm.DCGM_FI_DEV_ECC_SBE_AGG_DEV, // Aggregate single bit ECC errors detected in device memory
4343
dcgm.DCGM_FI_DEV_ECC_DBE_AGG_DEV, // Aggregate double bit ECC errors detected in device memory
44-
dcgm.DCGM_FI_DEV_RETIRED_PENDING, // Whether pages are pending retirement
45-
dcgm.DCGM_FI_DEV_RETIRED_DBE, // Retired DBE pages
46-
dcgm.DCGM_FI_DEV_RETIRED_SBE, // Retired SBE pages
4744
dcgm.DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH, // Banks with high remap row availability
4845
dcgm.DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW, // Banks with low remap row availability
4946
dcgm.DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX, // Banks with max remap row availability
@@ -208,21 +205,6 @@ var (
208205
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
209206
).MustCurryWith(componentLabel)
210207

211-
metricDCGMFIDevRetiredPending = prometheus.NewGaugeVec(
212-
prometheus.GaugeOpts{Name: "dcgm_fi_dev_retired_pending", Help: "Number of pages pending retirement"},
213-
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
214-
).MustCurryWith(componentLabel)
215-
216-
metricDCGMFIDevRetiredDBE = prometheus.NewGaugeVec(
217-
prometheus.GaugeOpts{Name: "dcgm_fi_dev_retired_dbe", Help: "Number of retired pages because of double bit errors. Note: monotonically increasing"},
218-
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
219-
).MustCurryWith(componentLabel)
220-
221-
metricDCGMFIDevRetiredSBE = prometheus.NewGaugeVec(
222-
prometheus.GaugeOpts{Name: "dcgm_fi_dev_retired_sbe", Help: "Number of retired pages because of single bit errors. Note: monotonically increasing"},
223-
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
224-
).MustCurryWith(componentLabel)
225-
226208
metricDCGMFIDevBanksRemapRowsAvailHigh = prometheus.NewGaugeVec(
227209
prometheus.GaugeOpts{Name: "dcgm_fi_dev_banks_remap_rows_avail_high", Help: "Historical high mark of available spare memory rows per memory bank"},
228210
[]string{pkgmetrics.MetricComponentLabelKey, "uuid", "gpu"},
@@ -267,9 +249,6 @@ func init() {
267249
metricDCGMFIDevECCDBEVolDev,
268250
metricDCGMFIDevECCSBEAggDev,
269251
metricDCGMFIDevECCDBEAggDev,
270-
metricDCGMFIDevRetiredPending,
271-
metricDCGMFIDevRetiredDBE,
272-
metricDCGMFIDevRetiredSBE,
273252
metricDCGMFIDevBanksRemapRowsAvailHigh,
274253
metricDCGMFIDevBanksRemapRowsAvailLow,
275254
metricDCGMFIDevBanksRemapRowsAvailMax,

0 commit comments

Comments
 (0)