Skip to content

Commit f9023ae

Browse files
committed
Skip metrics when standby JSON cached
When smartctl reports that a device is in standby (exit status bit 1), we cache the minimal JSON it returns so that power_mode can still be exported. That JSON omits the capacity, block size, device info, and NVMe health fields, so the collectors must skip those metrics when their fields are missing; otherwise they would emit zero values or series with empty labels.

Signed-off-by: Yonathan Randolph <yonathan@gmail.com>
1 parent 43fb9d6 commit f9023ae

File tree

4 files changed

+190
-20
lines changed

4 files changed

+190
-20
lines changed

main.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ type SMARTctlManagerCollector struct {
5757

5858
// Describe sends the super-set of all possible descriptors of metrics
5959
func (i *SMARTctlManagerCollector) Describe(ch chan<- *prometheus.Desc) {
60-
prometheus.DescribeByCollect(i, ch)
60+
for _, desc := range allMetricDescs {
61+
ch <- desc
62+
}
6163
}
6264

6365
// Collect is called by the Prometheus registry when collecting metrics.

metrics.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,4 +362,45 @@ var (
362362
},
363363
nil,
364364
)
365+
allMetricDescs = []*prometheus.Desc{
366+
metricSmartctlVersion,
367+
metricDeviceModel,
368+
metricDeviceCount,
369+
metricDeviceCapacityBlocks,
370+
metricDeviceCapacityBytes,
371+
metricDeviceTotalCapacityBytes,
372+
metricDeviceBlockSize,
373+
metricDeviceInterfaceSpeed,
374+
metricDeviceAttribute,
375+
metricDevicePowerOnSeconds,
376+
metricDeviceRotationRate,
377+
metricDeviceTemperature,
378+
metricDevicePowerCycleCount,
379+
metricDevicePercentageUsed,
380+
metricDeviceAvailableSpare,
381+
metricDeviceAvailableSpareThreshold,
382+
metricDeviceCriticalWarning,
383+
metricDeviceMediaErrors,
384+
metricDeviceNumErrLogEntries,
385+
metricDeviceBytesRead,
386+
metricDeviceBytesWritten,
387+
metricDeviceSmartStatus,
388+
metricDeviceExitStatus,
389+
metricDeviceState,
390+
metricDevicePowerMode,
391+
metricDeviceStatistics,
392+
metricDeviceErrorLogCount,
393+
metricDeviceSelfTestLogCount,
394+
metricDeviceSelfTestLogErrorCount,
395+
metricDeviceERCSeconds,
396+
metricSCSIGrownDefectList,
397+
metricReadErrorsCorrectedByRereadsRewrites,
398+
metricReadErrorsCorrectedByEccFast,
399+
metricReadErrorsCorrectedByEccDelayed,
400+
metricReadTotalUncorrectedErrors,
401+
metricWriteErrorsCorrectedByRereadsRewrites,
402+
metricWriteErrorsCorrectedByEccFast,
403+
metricWriteErrorsCorrectedByEccDelayed,
404+
metricWriteTotalUncorrectedErrors,
405+
}
365406
)

smartctl.go

Lines changed: 73 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,23 @@ func (smart *SMARTctl) mineExitStatus() {
132132
}
133133

134134
func (smart *SMARTctl) mineDevice() {
135+
hasInfo := false
136+
for _, key := range []string{
137+
"model_name",
138+
"scsi_vendor",
139+
"scsi_product",
140+
"serial_number",
141+
"firmware_version",
142+
"model_family",
143+
} {
144+
if smart.json.Get(key).Exists() {
145+
hasInfo = true
146+
break
147+
}
148+
}
149+
if !hasInfo {
150+
return
151+
}
135152
smart.ch <- prometheus.MustNewConstMetric(
136153
metricDeviceModel,
137154
prometheus.GaugeValue,
@@ -171,18 +188,27 @@ func (smart *SMARTctl) mineCapacity() {
171188
// The user_capacity exists only when NVMe have single namespace. Otherwise,
172189
// for NVMe devices with multiple namespaces, when device name used without
173190
// namespace number (exporter case) user_capacity will be absent
174-
smart.ch <- prometheus.MustNewConstMetric(
175-
metricDeviceCapacityBlocks,
176-
prometheus.GaugeValue,
177-
smart.json.Get("user_capacity.blocks").Float(),
178-
smart.device.device,
179-
)
180-
smart.ch <- prometheus.MustNewConstMetric(
181-
metricDeviceCapacityBytes,
182-
prometheus.GaugeValue,
183-
smart.json.Get("user_capacity.bytes").Float(),
184-
smart.device.device,
185-
)
191+
userCapacity := smart.json.Get("user_capacity")
192+
if userCapacity.Exists() {
193+
blocks := userCapacity.Get("blocks")
194+
if blocks.Exists() {
195+
smart.ch <- prometheus.MustNewConstMetric(
196+
metricDeviceCapacityBlocks,
197+
prometheus.GaugeValue,
198+
blocks.Float(),
199+
smart.device.device,
200+
)
201+
}
202+
bytes := userCapacity.Get("bytes")
203+
if bytes.Exists() {
204+
smart.ch <- prometheus.MustNewConstMetric(
205+
metricDeviceCapacityBytes,
206+
prometheus.GaugeValue,
207+
bytes.Float(),
208+
smart.device.device,
209+
)
210+
}
211+
}
186212
nvme_total_capacity := smart.json.Get("nvme_total_capacity")
187213
if nvme_total_capacity.Exists() {
188214
smart.ch <- prometheus.MustNewConstMetric(
@@ -196,10 +222,14 @@ func (smart *SMARTctl) mineCapacity() {
196222

197223
func (smart *SMARTctl) mineBlockSize() {
198224
for _, blockType := range []string{"logical", "physical"} {
225+
blockSize := smart.json.Get(fmt.Sprintf("%s_block_size", blockType))
226+
if !blockSize.Exists() {
227+
continue
228+
}
199229
smart.ch <- prometheus.MustNewConstMetric(
200230
metricDeviceBlockSize,
201231
prometheus.GaugeValue,
202-
smart.json.Get(fmt.Sprintf("%s_block_size", blockType)).Float(),
232+
blockSize.Float(),
203233
smart.device.device,
204234
blockType,
205235
)
@@ -342,55 +372,79 @@ func (smart *SMARTctl) mineDeviceSCTStatus() {
342372
}
343373

344374
func (smart *SMARTctl) mineNvmePercentageUsed() {
375+
percentageUsed := smart.json.Get("nvme_smart_health_information_log.percentage_used")
376+
if !percentageUsed.Exists() {
377+
return
378+
}
345379
smart.ch <- prometheus.MustNewConstMetric(
346380
metricDevicePercentageUsed,
347381
prometheus.CounterValue,
348-
smart.json.Get("nvme_smart_health_information_log.percentage_used").Float(),
382+
percentageUsed.Float(),
349383
smart.device.device,
350384
)
351385
}
352386

353387
func (smart *SMARTctl) mineNvmeAvailableSpare() {
388+
availableSpare := smart.json.Get("nvme_smart_health_information_log.available_spare")
389+
if !availableSpare.Exists() {
390+
return
391+
}
354392
smart.ch <- prometheus.MustNewConstMetric(
355393
metricDeviceAvailableSpare,
356394
prometheus.CounterValue,
357-
smart.json.Get("nvme_smart_health_information_log.available_spare").Float(),
395+
availableSpare.Float(),
358396
smart.device.device,
359397
)
360398
}
361399

362400
func (smart *SMARTctl) mineNvmeAvailableSpareThreshold() {
401+
availableSpareThreshold := smart.json.Get("nvme_smart_health_information_log.available_spare_threshold")
402+
if !availableSpareThreshold.Exists() {
403+
return
404+
}
363405
smart.ch <- prometheus.MustNewConstMetric(
364406
metricDeviceAvailableSpareThreshold,
365407
prometheus.CounterValue,
366-
smart.json.Get("nvme_smart_health_information_log.available_spare_threshold").Float(),
408+
availableSpareThreshold.Float(),
367409
smart.device.device,
368410
)
369411
}
370412

371413
func (smart *SMARTctl) mineNvmeCriticalWarning() {
414+
criticalWarning := smart.json.Get("nvme_smart_health_information_log.critical_warning")
415+
if !criticalWarning.Exists() {
416+
return
417+
}
372418
smart.ch <- prometheus.MustNewConstMetric(
373419
metricDeviceCriticalWarning,
374420
prometheus.CounterValue,
375-
smart.json.Get("nvme_smart_health_information_log.critical_warning").Float(),
421+
criticalWarning.Float(),
376422
smart.device.device,
377423
)
378424
}
379425

380426
func (smart *SMARTctl) mineNvmeMediaErrors() {
427+
mediaErrors := smart.json.Get("nvme_smart_health_information_log.media_errors")
428+
if !mediaErrors.Exists() {
429+
return
430+
}
381431
smart.ch <- prometheus.MustNewConstMetric(
382432
metricDeviceMediaErrors,
383433
prometheus.CounterValue,
384-
smart.json.Get("nvme_smart_health_information_log.media_errors").Float(),
434+
mediaErrors.Float(),
385435
smart.device.device,
386436
)
387437
}
388438

389439
func (smart *SMARTctl) mineNvmeNumErrLogEntries() {
440+
numErrLogEntries := smart.json.Get("nvme_smart_health_information_log.num_err_log_entries")
441+
if !numErrLogEntries.Exists() {
442+
return
443+
}
390444
smart.ch <- prometheus.MustNewConstMetric(
391445
metricDeviceNumErrLogEntries,
392446
prometheus.CounterValue,
393-
smart.json.Get("nvme_smart_health_information_log.num_err_log_entries").Float(),
447+
numErrLogEntries.Float(),
394448
smart.device.device,
395449
)
396450
}

smartctl_test.go

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,14 @@
1414
package main
1515

1616
import (
17+
"io"
18+
"log/slog"
19+
"os"
20+
"strings"
1721
"testing"
22+
23+
"github.com/prometheus/client_golang/prometheus"
24+
"github.com/tidwall/gjson"
1825
)
1926

2027
func TestBuildDeviceLabel(t *testing.T) {
@@ -42,3 +49,69 @@ func TestBuildDeviceLabel(t *testing.T) {
4249
}
4350
}
4451
}
52+
53+
func TestStandbyJSONSkipsMissingMetrics(t *testing.T) {
54+
names := collectMetricNames(t, "testdata/standby-sdc.json")
55+
expected := map[string]struct{}{
56+
"smartctl_device_power_mode": {},
57+
"smartctl_device_smartctl_exit_status": {},
58+
}
59+
60+
if len(names) != len(expected) {
61+
t.Fatalf("expected %d metrics, got %d: %v", len(expected), len(names), names)
62+
}
63+
for name := range expected {
64+
if _, ok := names[name]; !ok {
65+
t.Fatalf("missing metric %q", name)
66+
}
67+
}
68+
for name := range names {
69+
if _, ok := expected[name]; !ok {
70+
t.Fatalf("unexpected metric %q", name)
71+
}
72+
}
73+
}
74+
75+
func collectMetricNames(t *testing.T, jsonPath string) map[string]struct{} {
76+
t.Helper()
77+
78+
data, err := os.ReadFile(jsonPath)
79+
if err != nil {
80+
t.Fatalf("read json: %v", err)
81+
}
82+
json := gjson.ParseBytes(data)
83+
84+
ch := make(chan prometheus.Metric)
85+
logger := slog.New(slog.NewTextHandler(io.Discard, nil))
86+
smart := NewSMARTctl(logger, json, ch)
87+
88+
go func() {
89+
smart.Collect()
90+
close(ch)
91+
}()
92+
93+
names := make(map[string]struct{})
94+
for metric := range ch {
95+
name := metricName(metric)
96+
if name == "" {
97+
t.Fatalf("missing metric name for %v", metric)
98+
}
99+
names[name] = struct{}{}
100+
}
101+
return names
102+
}
103+
104+
func metricName(metric prometheus.Metric) string {
105+
desc := metric.Desc().String()
106+
const prefix = `fqName: "`
107+
start := strings.Index(desc, prefix)
108+
if start == -1 {
109+
return ""
110+
}
111+
start += len(prefix)
112+
end := strings.Index(desc[start:], `"`)
113+
if end == -1 {
114+
return ""
115+
}
116+
return desc[start : start+end]
117+
}

0 commit comments

Comments
 (0)