From 43fb9d6fc07a54566c4107c11cd2a994985c45d8 Mon Sep 17 00:00:00 2001 From: Yonathan Randolph Date: Sun, 26 Oct 2025 22:22:05 -0700 Subject: [PATCH 1/2] Add smartctl_device_power_mode metric Export power mode state from smartctl's power_mode JSON field. This allows monitoring which drives are spinning vs sleeping without waking them up during collection. Gauge: smartctl_device_power_mode{device}. The value is the ata_value: 0=standby, 255=active, etc. Cache JSON for standby drives (when smartctl --nocheck=standby exit code is 2) instead of returning stale data from when it was active. Signed-off-by: Yonathan Randolph --- metrics.go | 8 ++++ readjson.go | 13 +++++-- readjson_test.go | 66 ++++++++++++++++++++++++++++++++ smartctl.go | 13 +++++++ testdata/nonexistent-device.json | 39 +++++++++++++++++++ testdata/standby-sdc.json | 49 ++++++++++++++++++++++++ 6 files changed, 184 insertions(+), 4 deletions(-) create mode 100644 readjson_test.go create mode 100644 testdata/nonexistent-device.json create mode 100644 testdata/standby-sdc.json diff --git a/metrics.go b/metrics.go index 0ac083f..df6160b 100644 --- a/metrics.go +++ b/metrics.go @@ -234,6 +234,14 @@ var ( }, nil, ) + metricDevicePowerMode = prometheus.NewDesc( + "smartctl_device_power_mode", + "Device power mode from ATA CHECK POWER MODE command (ata_value: -1=sleep, 0x00=0=standby, 0x01=1=standby_y, 0x40=64=active_nv_down, 0x41=65=active_nv_up, 0x80=128=idle, 0x81=129=idle_a, 0x82=130=idle_b, 0x83=131=idle_c, 0xff=255=active_or_idle). Source: https://github.com/smartmontools/smartmontools/blob/RELEASE_7_5/smartmontools/ataprint.cpp#L3401-L3431", + []string{ + "device", + }, + nil, + ) metricDeviceStatistics = prometheus.NewDesc( "smartctl_device_statistics", "Device statistics", diff --git a/readjson.go b/readjson.go index a03b1d6..98f19c9 100644 --- a/readjson.go +++ b/readjson.go @@ -74,7 +74,8 @@ func readSMARTctl(logger *slog.Logger, device Device, wg *sync.WaitGroup) { // Accommodate a smartmontools pre-7.3 bug cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:") json := parseJSON(cleaned_out) - rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) + exitStatus := json.Get("smartctl.exit_status").Int() + rcOk := resultCodeIsOk(logger, device, exitStatus, json) jsonOk := jsonIsOk(logger, json) logger.Debug("Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start)) if rcOk && jsonOk { @@ -134,7 +135,7 @@ func readData(logger *slog.Logger, device Device) gjson.Result { } // Parse smartctl return code -func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bool { +func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64, json gjson.Result) bool { result := true if SMARTCtlResult > 0 { b := SMARTCtlResult @@ -143,8 +144,12 @@ func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bo result = false } if (b & (1 << 1)) != 0 { - logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) - result = false + if json.Get("power_mode").Exists() { + logger.Info("Device in low-power mode", "device", device) + } else { + logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) + result = false + } } if (b & (1 << 2)) != 0 { logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device) diff --git a/readjson_test.go b/readjson_test.go new file mode 100644 index 0000000..9d77f40 --- /dev/null +++ b/readjson_test.go @@ -0,0 +1,66 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "io" + "log/slog" + "os" + "testing" + + "github.com/tidwall/gjson" +) + +func TestResultCodeIsOkStandbyJSON(t *testing.T) { + // output from a standby hard drive: + // sudo hdparm -y /dev/sdc + // sudo smartctl --nocheck=standby /dev/sdc --json --info --health --attributes --tolerance=verypermissive --format=brief --log=error + json := readTestJSON(t, "testdata/standby-sdc.json") + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + device := Device{ + Name: "/dev/sdc", + Type: "sat", + Label: "sdc", + } + + if !resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int(), json) { + t.Fatalf("expected exit status to be ok for standby json") + } +} + +func TestResultCodeIsOkNonexistentDeviceJSON(t *testing.T) { + // output from a nonexistent disk: + // sudo smartctl --nocheck=standby /dev/nonexistent --json --info --health --attributes --tolerance=verypermissive --format=brief --log=error + json := readTestJSON(t, "testdata/nonexistent-device.json") + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + device := Device{ + Name: "/dev/nonexistent", + Type: "auto", + Label: "nonexistent", + } + + if resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int(), json) { + t.Fatalf("expected exit status to be not ok for nonexistent device json") + } +} + +func readTestJSON(t *testing.T, path string) gjson.Result { + t.Helper() + + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read json: %v", err) + } + return gjson.ParseBytes(data) +} diff --git a/smartctl.go b/smartctl.go index b136b8b..474a85f 100644 --- a/smartctl.go +++ b/smartctl.go @@ -87,6 +87,7 @@ func (smart *SMARTctl) Collect() { smart.logger.Debug("Collecting metrics from", "device", smart.device.device, "family", smart.device.family, "model", smart.device.model) smart.mineExitStatus() smart.mineDevice() + smart.minePowerMode() smart.mineCapacity() smart.mineBlockSize() smart.mineInterfaceSpeed() @@ -154,6 +155,18 @@ func (smart *SMARTctl) mineDevice() { ) } +func (smart *SMARTctl) minePowerMode() { + powerMode := smart.json.Get("power_mode") + if powerMode.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerMode, + prometheus.GaugeValue, + powerMode.Get("ata_value").Float(), + smart.device.device, + ) + } +} + func (smart *SMARTctl) mineCapacity() { // The user_capacity exists only when NVMe have single namespace. Otherwise, // for NVMe devices with multiple namespaces, when device name used without diff --git a/testdata/nonexistent-device.json b/testdata/nonexistent-device.json new file mode 100644 index 0000000..a764880 --- /dev/null +++ b/testdata/nonexistent-device.json @@ -0,0 +1,39 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 5 + ], + "pre_release": false, + "svn_revision": "5714", + "platform_info": "x86_64-linux-6.12.43", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--nocheck=standby", + "/dev/nonexistent", + "--json", + "--info", + "--health", + "--attributes", + "--tolerance=verypermissive", + "--format=brief", + "--log=error" + ], + "messages": [ + { + "string": "/dev/nonexistent: Unable to detect device type", + "severity": "error" + } + ], + "exit_status": 1 + }, + "local_time": { + "time_t": 1767059001, + "asctime": "Mon Dec 29 17:43:21 2025 PST" + } +} diff --git a/testdata/standby-sdc.json b/testdata/standby-sdc.json new file mode 100644 index 0000000..450b661 --- /dev/null +++ b/testdata/standby-sdc.json @@ -0,0 +1,49 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 5 + ], + "pre_release": false, + "svn_revision": "5714", + "platform_info": "x86_64-linux-6.12.43", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--nocheck=standby", + "/dev/sdc", + "--json", + "--info", + "--health", + "--attributes", + "--tolerance=verypermissive", + "--format=brief", + "--log=error" + ], + "messages": [ + { + "string": "Device is in STANDBY mode, exit(2)", + "severity": "information" + } + ], + "exit_status": 2 + }, + "local_time": { + "time_t": 1767052155, + "asctime": "Mon Dec 29 15:49:15 2025 PST" + }, + "device": { + "name": "/dev/sdc", + "info_name": "/dev/sdc [SAT]", + "type": "sat", + "protocol": "ATA" + }, + "power_mode": { + "ata_value": 0, + "name": "STANDBY" + } +} From f9023ae78042a8c0d10967030634b8d0370f6471 Mon Sep 17 00:00:00 2001 From: Yonathan Randolph Date: Mon, 29 Dec 2025 16:14:19 -0800 Subject: [PATCH 2/2] Skip metrics when standby JSON cached When smartctl returns standby (exit status bit 1), we cache the minimal JSON so power_mode can still be exported. That JSON omits capacity, block size, device info, and NVMe health fields, so collectors must skip those metrics when fields are missing to avoid emitting zeros or empty-label series. Signed-off-by: Yonathan Randolph --- main.go | 4 ++- metrics.go | 41 +++++++++++++++++++++ smartctl.go | 92 ++++++++++++++++++++++++++++++++++++++---------- smartctl_test.go | 73 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 190 insertions(+), 20 deletions(-) diff --git a/main.go b/main.go index c5b138a..4443d82 100644 --- a/main.go +++ b/main.go @@ -57,7 +57,9 @@ type SMARTctlManagerCollector struct { // Describe sends the super-set of all possible descriptors of metrics func (i *SMARTctlManagerCollector) Describe(ch chan<- *prometheus.Desc) { - prometheus.DescribeByCollect(i, ch) + for _, desc := range allMetricDescs { + ch <- desc + } } // Collect is called by the Prometheus registry when collecting metrics. diff --git a/metrics.go b/metrics.go index df6160b..ba40a36 100644 --- a/metrics.go +++ b/metrics.go @@ -362,4 +362,45 @@ var ( }, nil, ) + allMetricDescs = []*prometheus.Desc{ + metricSmartctlVersion, + metricDeviceModel, + metricDeviceCount, + metricDeviceCapacityBlocks, + metricDeviceCapacityBytes, + metricDeviceTotalCapacityBytes, + metricDeviceBlockSize, + metricDeviceInterfaceSpeed, + metricDeviceAttribute, + metricDevicePowerOnSeconds, + metricDeviceRotationRate, + metricDeviceTemperature, + metricDevicePowerCycleCount, + metricDevicePercentageUsed, + metricDeviceAvailableSpare, + metricDeviceAvailableSpareThreshold, + metricDeviceCriticalWarning, + metricDeviceMediaErrors, + metricDeviceNumErrLogEntries, + metricDeviceBytesRead, + metricDeviceBytesWritten, + metricDeviceSmartStatus, + metricDeviceExitStatus, + metricDeviceState, + metricDevicePowerMode, + metricDeviceStatistics, + metricDeviceErrorLogCount, + metricDeviceSelfTestLogCount, + metricDeviceSelfTestLogErrorCount, + metricDeviceERCSeconds, + metricSCSIGrownDefectList, + metricReadErrorsCorrectedByRereadsRewrites, + metricReadErrorsCorrectedByEccFast, + metricReadErrorsCorrectedByEccDelayed, + metricReadTotalUncorrectedErrors, + metricWriteErrorsCorrectedByRereadsRewrites, + metricWriteErrorsCorrectedByEccFast, + metricWriteErrorsCorrectedByEccDelayed, + metricWriteTotalUncorrectedErrors, + } ) diff --git a/smartctl.go b/smartctl.go index 474a85f..a814c55 100644 --- a/smartctl.go +++ b/smartctl.go @@ -132,6 +132,23 @@ func (smart *SMARTctl) mineExitStatus() { } func (smart *SMARTctl) mineDevice() { + hasInfo := false + for _, key := range []string{ + "model_name", + "scsi_vendor", + "scsi_product", + "serial_number", + "firmware_version", + "model_family", + } { + if smart.json.Get(key).Exists() { + hasInfo = true + break + } + } + if !hasInfo { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceModel, prometheus.GaugeValue, @@ -171,18 +188,27 @@ func (smart *SMARTctl) mineCapacity() { // The user_capacity exists only when NVMe have single namespace. Otherwise, // for NVMe devices with multiple namespaces, when device name used without // namespace number (exporter case) user_capacity will be absent - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceCapacityBlocks, - prometheus.GaugeValue, - smart.json.Get("user_capacity.blocks").Float(), - smart.device.device, - ) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceCapacityBytes, - prometheus.GaugeValue, - smart.json.Get("user_capacity.bytes").Float(), - smart.device.device, - ) + userCapacity := smart.json.Get("user_capacity") + if userCapacity.Exists() { + blocks := userCapacity.Get("blocks") + if blocks.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceCapacityBlocks, + prometheus.GaugeValue, + blocks.Float(), + smart.device.device, + ) + } + bytes := userCapacity.Get("bytes") + if bytes.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceCapacityBytes, + prometheus.GaugeValue, + bytes.Float(), + smart.device.device, + ) + } + } nvme_total_capacity := smart.json.Get("nvme_total_capacity") if nvme_total_capacity.Exists() { smart.ch <- prometheus.MustNewConstMetric( @@ -196,10 +222,14 @@ func (smart *SMARTctl) mineCapacity() { func (smart *SMARTctl) mineBlockSize() { for _, blockType := range []string{"logical", "physical"} { + blockSize := smart.json.Get(fmt.Sprintf("%s_block_size", blockType)) + if !blockSize.Exists() { + continue + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBlockSize, prometheus.GaugeValue, - smart.json.Get(fmt.Sprintf("%s_block_size", blockType)).Float(), + blockSize.Float(), smart.device.device, blockType, ) @@ -342,55 +372,79 @@ func (smart *SMARTctl) mineDeviceSCTStatus() { } func (smart *SMARTctl) mineNvmePercentageUsed() { + percentageUsed := smart.json.Get("nvme_smart_health_information_log.percentage_used") + if !percentageUsed.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDevicePercentageUsed, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.percentage_used").Float(), + percentageUsed.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeAvailableSpare() { + availableSpare := smart.json.Get("nvme_smart_health_information_log.available_spare") + if !availableSpare.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceAvailableSpare, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.available_spare").Float(), + availableSpare.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeAvailableSpareThreshold() { + availableSpareThreshold := smart.json.Get("nvme_smart_health_information_log.available_spare_threshold") + if !availableSpareThreshold.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceAvailableSpareThreshold, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.available_spare_threshold").Float(), + availableSpareThreshold.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeCriticalWarning() { + criticalWarning := smart.json.Get("nvme_smart_health_information_log.critical_warning") + if !criticalWarning.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceCriticalWarning, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.critical_warning").Float(), + criticalWarning.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeMediaErrors() { + mediaErrors := smart.json.Get("nvme_smart_health_information_log.media_errors") + if !mediaErrors.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceMediaErrors, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.media_errors").Float(), + mediaErrors.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeNumErrLogEntries() { + numErrLogEntries := smart.json.Get("nvme_smart_health_information_log.num_err_log_entries") + if !numErrLogEntries.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceNumErrLogEntries, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.num_err_log_entries").Float(), + numErrLogEntries.Float(), smart.device.device, ) } diff --git a/smartctl_test.go b/smartctl_test.go index 8c9836c..09e0a30 100644 --- a/smartctl_test.go +++ b/smartctl_test.go @@ -14,7 +14,14 @@ package main import ( + "io" + "log/slog" + "os" + "strings" "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/tidwall/gjson" ) func TestBuildDeviceLabel(t *testing.T) { @@ -42,3 +49,69 @@ func TestBuildDeviceLabel(t *testing.T) { } } } + +func TestStandbyJSONSkipsMissingMetrics(t *testing.T) { + names := collectMetricNames(t, "testdata/standby-sdc.json") + expected := map[string]struct{}{ + "smartctl_device_power_mode": {}, + "smartctl_device_smartctl_exit_status": {}, + } + + if len(names) != len(expected) { + t.Fatalf("expected %d metrics, got %d: %v", len(expected), len(names), names) + } + for name := range expected { + if _, ok := names[name]; !ok { + t.Fatalf("missing metric %q", name) + } + } + for name := range names { + if _, ok := expected[name]; !ok { + t.Fatalf("unexpected metric %q", name) + } + } +} + +func collectMetricNames(t *testing.T, jsonPath string) map[string]struct{} { + t.Helper() + + data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read json: %v", err) + } + json := gjson.ParseBytes(data) + + ch := make(chan prometheus.Metric) + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + smart := NewSMARTctl(logger, json, ch) + + go func() { + smart.Collect() + close(ch) + }() + + names := make(map[string]struct{}) + for metric := range ch { + name := metricName(metric) + if name == "" { + t.Fatalf("missing metric name for %v", metric) + } + names[name] = struct{}{} + } + return names +} + +func metricName(metric prometheus.Metric) string { + desc := metric.Desc().String() + const prefix = `fqName: "` + start := strings.Index(desc, prefix) + if start == -1 { + return "" + } + start += len(prefix) + end := strings.Index(desc[start:], `"`) + if end == -1 { + return "" + } + return desc[start : start+end] +}