diff --git a/main.go b/main.go index c5b138a..4443d82 100644 --- a/main.go +++ b/main.go @@ -57,7 +57,9 @@ type SMARTctlManagerCollector struct { // Describe sends the super-set of all possible descriptors of metrics func (i *SMARTctlManagerCollector) Describe(ch chan<- *prometheus.Desc) { - prometheus.DescribeByCollect(i, ch) + for _, desc := range allMetricDescs { + ch <- desc + } } // Collect is called by the Prometheus registry when collecting metrics. diff --git a/metrics.go b/metrics.go index 0ac083f..ba40a36 100644 --- a/metrics.go +++ b/metrics.go @@ -234,6 +234,14 @@ var ( }, nil, ) + metricDevicePowerMode = prometheus.NewDesc( + "smartctl_device_power_mode", + "Device power mode from ATA CHECK POWER MODE command (ata_value: -1=sleep, 0x00=0=standby, 0x01=1=standby_y, 0x40=64=active_nv_down, 0x41=65=active_nv_up, 0x80=128=idle, 0x81=129=idle_a, 0x82=130=idle_b, 0x83=131=idle_c, 0xff=255=active_or_idle). Source: https://github.com/smartmontools/smartmontools/blob/RELEASE_7_5/smartmontools/ataprint.cpp#L3401-L3431", + []string{ + "device", + }, + nil, + ) metricDeviceStatistics = prometheus.NewDesc( "smartctl_device_statistics", "Device statistics", @@ -354,4 +362,45 @@ var ( }, nil, ) + allMetricDescs = []*prometheus.Desc{ + metricSmartctlVersion, + metricDeviceModel, + metricDeviceCount, + metricDeviceCapacityBlocks, + metricDeviceCapacityBytes, + metricDeviceTotalCapacityBytes, + metricDeviceBlockSize, + metricDeviceInterfaceSpeed, + metricDeviceAttribute, + metricDevicePowerOnSeconds, + metricDeviceRotationRate, + metricDeviceTemperature, + metricDevicePowerCycleCount, + metricDevicePercentageUsed, + metricDeviceAvailableSpare, + metricDeviceAvailableSpareThreshold, + metricDeviceCriticalWarning, + metricDeviceMediaErrors, + metricDeviceNumErrLogEntries, + metricDeviceBytesRead, + metricDeviceBytesWritten, + metricDeviceSmartStatus, + metricDeviceExitStatus, + metricDeviceState, + metricDevicePowerMode, + metricDeviceStatistics, + metricDeviceErrorLogCount, + metricDeviceSelfTestLogCount, + metricDeviceSelfTestLogErrorCount, + metricDeviceERCSeconds, + metricSCSIGrownDefectList, + metricReadErrorsCorrectedByRereadsRewrites, + metricReadErrorsCorrectedByEccFast, + metricReadErrorsCorrectedByEccDelayed, + metricReadTotalUncorrectedErrors, + metricWriteErrorsCorrectedByRereadsRewrites, + metricWriteErrorsCorrectedByEccFast, + metricWriteErrorsCorrectedByEccDelayed, + metricWriteTotalUncorrectedErrors, + } ) diff --git a/readjson.go b/readjson.go index a03b1d6..98f19c9 100644 --- a/readjson.go +++ b/readjson.go @@ -74,7 +74,8 @@ func readSMARTctl(logger *slog.Logger, device Device, wg *sync.WaitGroup) { // Accommodate a smartmontools pre-7.3 bug cleaned_out := strings.TrimPrefix(string(out), " Pending defect count:") json := parseJSON(cleaned_out) - rcOk := resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int()) + exitStatus := json.Get("smartctl.exit_status").Int() + rcOk := resultCodeIsOk(logger, device, exitStatus, json) jsonOk := jsonIsOk(logger, json) logger.Debug("Collected S.M.A.R.T. json data", "device", device, "duration", time.Since(start)) if rcOk && jsonOk { @@ -134,7 +135,7 @@ func readData(logger *slog.Logger, device Device) gjson.Result { } // Parse smartctl return code -func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bool { +func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64, json gjson.Result) bool { result := true if SMARTCtlResult > 0 { b := SMARTCtlResult @@ -143,8 +144,12 @@ func resultCodeIsOk(logger *slog.Logger, device Device, SMARTCtlResult int64) bo result = false } if (b & (1 << 1)) != 0 { - logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) - result = false + if json.Get("power_mode").Exists() { + logger.Info("Device in low-power mode", "device", device) + } else { + logger.Error("Device open failed, device did not return an IDENTIFY DEVICE structure, or device is in a low-power mode", "device", device) + result = false + } } if (b & (1 << 2)) != 0 { logger.Warn("Some SMART or other ATA command to the disk failed, or there was a checksum error in a SMART data structure", "device", device) diff --git a/readjson_test.go b/readjson_test.go new file mode 100644 index 0000000..9d77f40 --- /dev/null +++ b/readjson_test.go @@ -0,0 +1,66 @@ +// Copyright 2024 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "io" + "log/slog" + "os" + "testing" + + "github.com/tidwall/gjson" +) + +func TestResultCodeIsOkStandbyJSON(t *testing.T) { + // output from a standby hard drive: + // sudo hdparm -y /dev/sdc + // sudo smartctl --nocheck=standby /dev/sdc --json --info --health --attributes --tolerance=verypermissive --format=brief --log=error + json := readTestJSON(t, "testdata/standby-sdc.json") + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + device := Device{ + Name: "/dev/sdc", + Type: "sat", + Label: "sdc", + } + + if !resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int(), json) { + t.Fatalf("expected exit status to be ok for standby json") + } +} + +func TestResultCodeIsOkNonexistentDeviceJSON(t *testing.T) { + // output from a nonexistent disk: + // sudo smartctl --nocheck=standby /dev/nonexistent --json --info --health --attributes --tolerance=verypermissive --format=brief --log=error + json := readTestJSON(t, "testdata/nonexistent-device.json") + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + device := Device{ + Name: "/dev/nonexistent", + Type: "auto", + Label: "nonexistent", + } + + if resultCodeIsOk(logger, device, json.Get("smartctl.exit_status").Int(), json) { + t.Fatalf("expected exit status to be not ok for nonexistent device json") + } +} + +func readTestJSON(t *testing.T, path string) gjson.Result { + t.Helper() + + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read json: %v", err) + } + return gjson.ParseBytes(data) +} diff --git a/smartctl.go b/smartctl.go index b136b8b..a814c55 100644 --- a/smartctl.go +++ b/smartctl.go @@ -87,6 +87,7 @@ func (smart *SMARTctl) Collect() { smart.logger.Debug("Collecting metrics from", "device", smart.device.device, "family", smart.device.family, "model", smart.device.model) smart.mineExitStatus() smart.mineDevice() + smart.minePowerMode() smart.mineCapacity() smart.mineBlockSize() smart.mineInterfaceSpeed() @@ -131,6 +132,23 @@ func (smart *SMARTctl) mineExitStatus() { } func (smart *SMARTctl) mineDevice() { + hasInfo := false + for _, key := range []string{ + "model_name", + "scsi_vendor", + "scsi_product", + "serial_number", + "firmware_version", + "model_family", + } { + if smart.json.Get(key).Exists() { + hasInfo = true + break + } + } + if !hasInfo { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceModel, prometheus.GaugeValue, @@ -154,22 +172,43 @@ func (smart *SMARTctl) mineDevice() { ) } +func (smart *SMARTctl) minePowerMode() { + powerMode := smart.json.Get("power_mode") + if powerMode.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDevicePowerMode, + prometheus.GaugeValue, + powerMode.Get("ata_value").Float(), + smart.device.device, + ) + } +} + func (smart *SMARTctl) mineCapacity() { // The user_capacity exists only when NVMe have single namespace. Otherwise, // for NVMe devices with multiple namespaces, when device name used without // namespace number (exporter case) user_capacity will be absent - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceCapacityBlocks, - prometheus.GaugeValue, - smart.json.Get("user_capacity.blocks").Float(), - smart.device.device, - ) - smart.ch <- prometheus.MustNewConstMetric( - metricDeviceCapacityBytes, - prometheus.GaugeValue, - smart.json.Get("user_capacity.bytes").Float(), - smart.device.device, - ) + userCapacity := smart.json.Get("user_capacity") + if userCapacity.Exists() { + blocks := userCapacity.Get("blocks") + if blocks.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceCapacityBlocks, + prometheus.GaugeValue, + blocks.Float(), + smart.device.device, + ) + } + bytes := userCapacity.Get("bytes") + if bytes.Exists() { + smart.ch <- prometheus.MustNewConstMetric( + metricDeviceCapacityBytes, + prometheus.GaugeValue, + bytes.Float(), + smart.device.device, + ) + } + } nvme_total_capacity := smart.json.Get("nvme_total_capacity") if nvme_total_capacity.Exists() { smart.ch <- prometheus.MustNewConstMetric( @@ -183,10 +222,14 @@ func (smart *SMARTctl) mineCapacity() { func (smart *SMARTctl) mineBlockSize() { for _, blockType := range []string{"logical", "physical"} { + blockSize := smart.json.Get(fmt.Sprintf("%s_block_size", blockType)) + if !blockSize.Exists() { + continue + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceBlockSize, prometheus.GaugeValue, - smart.json.Get(fmt.Sprintf("%s_block_size", blockType)).Float(), + blockSize.Float(), smart.device.device, blockType, ) @@ -329,55 +372,79 @@ func (smart *SMARTctl) mineDeviceSCTStatus() { } func (smart *SMARTctl) mineNvmePercentageUsed() { + percentageUsed := smart.json.Get("nvme_smart_health_information_log.percentage_used") + if !percentageUsed.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDevicePercentageUsed, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.percentage_used").Float(), + percentageUsed.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeAvailableSpare() { + availableSpare := smart.json.Get("nvme_smart_health_information_log.available_spare") + if !availableSpare.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceAvailableSpare, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.available_spare").Float(), + availableSpare.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeAvailableSpareThreshold() { + availableSpareThreshold := smart.json.Get("nvme_smart_health_information_log.available_spare_threshold") + if !availableSpareThreshold.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceAvailableSpareThreshold, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.available_spare_threshold").Float(), + availableSpareThreshold.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeCriticalWarning() { + criticalWarning := smart.json.Get("nvme_smart_health_information_log.critical_warning") + if !criticalWarning.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceCriticalWarning, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.critical_warning").Float(), + criticalWarning.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeMediaErrors() { + mediaErrors := smart.json.Get("nvme_smart_health_information_log.media_errors") + if !mediaErrors.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceMediaErrors, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.media_errors").Float(), + mediaErrors.Float(), smart.device.device, ) } func (smart *SMARTctl) mineNvmeNumErrLogEntries() { + numErrLogEntries := smart.json.Get("nvme_smart_health_information_log.num_err_log_entries") + if !numErrLogEntries.Exists() { + return + } smart.ch <- prometheus.MustNewConstMetric( metricDeviceNumErrLogEntries, prometheus.CounterValue, - smart.json.Get("nvme_smart_health_information_log.num_err_log_entries").Float(), + numErrLogEntries.Float(), smart.device.device, ) } diff --git a/smartctl_test.go b/smartctl_test.go index 8c9836c..09e0a30 100644 --- a/smartctl_test.go +++ b/smartctl_test.go @@ -14,7 +14,14 @@ package main import ( + "io" + "log/slog" + "os" + "strings" "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/tidwall/gjson" ) func TestBuildDeviceLabel(t *testing.T) { @@ -42,3 +49,69 @@ func TestBuildDeviceLabel(t *testing.T) { } } } + +func TestStandbyJSONSkipsMissingMetrics(t *testing.T) { + names := collectMetricNames(t, "testdata/standby-sdc.json") + expected := map[string]struct{}{ + "smartctl_device_power_mode": {}, + "smartctl_device_smartctl_exit_status": {}, + } + + if len(names) != len(expected) { + t.Fatalf("expected %d metrics, got %d: %v", len(expected), len(names), names) + } + for name := range expected { + if _, ok := names[name]; !ok { + t.Fatalf("missing metric %q", name) + } + } + for name := range names { + if _, ok := expected[name]; !ok { + t.Fatalf("unexpected metric %q", name) + } + } +} + +func collectMetricNames(t *testing.T, jsonPath string) map[string]struct{} { + t.Helper() + + data, err := os.ReadFile(jsonPath) + if err != nil { + t.Fatalf("read json: %v", err) + } + json := gjson.ParseBytes(data) + + ch := make(chan prometheus.Metric) + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + smart := NewSMARTctl(logger, json, ch) + + go func() { + smart.Collect() + close(ch) + }() + + names := make(map[string]struct{}) + for metric := range ch { + name := metricName(metric) + if name == "" { + t.Fatalf("missing metric name for %v", metric) + } + names[name] = struct{}{} + } + return names +} + +func metricName(metric prometheus.Metric) string { + desc := metric.Desc().String() + const prefix = `fqName: "` + start := strings.Index(desc, prefix) + if start == -1 { + return "" + } + start += len(prefix) + end := strings.Index(desc[start:], `"`) + if end == -1 { + return "" + } + return desc[start : start+end] +} diff --git a/testdata/nonexistent-device.json b/testdata/nonexistent-device.json new file mode 100644 index 0000000..a764880 --- /dev/null +++ b/testdata/nonexistent-device.json @@ -0,0 +1,39 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 5 + ], + "pre_release": false, + "svn_revision": "5714", + "platform_info": "x86_64-linux-6.12.43", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--nocheck=standby", + "/dev/nonexistent", + "--json", + "--info", + "--health", + "--attributes", + "--tolerance=verypermissive", + "--format=brief", + "--log=error" + ], + "messages": [ + { + "string": "/dev/nonexistent: Unable to detect device type", + "severity": "error" + } + ], + "exit_status": 1 + }, + "local_time": { + "time_t": 1767059001, + "asctime": "Mon Dec 29 17:43:21 2025 PST" + } +} diff --git a/testdata/standby-sdc.json b/testdata/standby-sdc.json new file mode 100644 index 0000000..450b661 --- /dev/null +++ b/testdata/standby-sdc.json @@ -0,0 +1,49 @@ +{ + "json_format_version": [ + 1, + 0 + ], + "smartctl": { + "version": [ + 7, + 5 + ], + "pre_release": false, + "svn_revision": "5714", + "platform_info": "x86_64-linux-6.12.43", + "build_info": "(local build)", + "argv": [ + "smartctl", + "--nocheck=standby", + "/dev/sdc", + "--json", + "--info", + "--health", + "--attributes", + "--tolerance=verypermissive", + "--format=brief", + "--log=error" + ], + "messages": [ + { + "string": "Device is in STANDBY mode, exit(2)", + "severity": "information" + } + ], + "exit_status": 2 + }, + "local_time": { + "time_t": 1767052155, + "asctime": "Mon Dec 29 15:49:15 2025 PST" + }, + "device": { + "name": "/dev/sdc", + "info_name": "/dev/sdc [SAT]", + "type": "sat", + "protocol": "ATA" + }, + "power_mode": { + "ata_value": 0, + "name": "STANDBY" + } +}