7474 "Device error log entry count" ,
7575 ["device" ], namespace = namespace , registry = registry ,
7676 ),
77+ # FIXME: The "nvmecli" metric ought to be an Info type, not a Gauge. However, making this change
78+ # will result in the metric having a "_info" suffix automatically appended, which is arguably
79+ # a breaking change.
7780 "nvmecli" : Gauge (
7881 "nvmecli" ,
7982 "nvme-cli tool information" ,
@@ -142,7 +145,11 @@ def exec_nvme_json(*args):
142145 """
143146 Execute nvme CLI tool with specified arguments and return parsed JSON output.
144147 """
145- output = exec_nvme (* args , "--output-format" , "json" )
148+ # Note: nvme-cli v2.11 effectively introduced a breaking change by forcing JSON output to always
149+ # be verbose. Older versions of nvme-cli optionally produced verbose output if the --verbose
150+ # flag was specified. In order to avoid having to handle two different JSON schemas, always
151+ # add the --verbose flag.
152+ output = exec_nvme (* args , "--output-format" , "json" , "--verbose" )
146153 return json .loads (output )
147154
148155
@@ -157,49 +164,70 @@ def main():
157164 device_list = exec_nvme_json ("list" )
158165
159166 for device in device_list ["Devices" ]:
160- device_path = device ["DevicePath" ]
161- device_name = os .path .basename (device_path )
162-
163- metrics ["device_info" ].labels (
164- device_name ,
165- device ["ModelNumber" ],
166- device ["Firmware" ],
167- device ["SerialNumber" ].strip (),
168- )
169-
170- metrics ["sector_size" ].labels (device_name ).set (device ["SectorSize" ])
171- metrics ["physical_size" ].labels (device_name ).set (device ["PhysicalSize" ])
172- metrics ["used_bytes" ].labels (device_name ).set (device ["UsedBytes" ])
173-
174- smart_log = exec_nvme_json ("smart-log" , device_path )
175-
176- # Various counters in the NVMe specification are 128-bit, which would have to discard
177- # resolution if converted to a JSON number (i.e., float64_t). Instead, nvme-cli marshals
178- # them as strings. As such, they need to be explicitly cast to int or float when using them
179- # in Counter metrics.
180- metrics ["data_units_read" ].labels (device_name ).inc (int (smart_log ["data_units_read" ]))
181- metrics ["data_units_written" ].labels (device_name ).inc (int (smart_log ["data_units_written" ]))
182- metrics ["host_read_commands" ].labels (device_name ).inc (int (smart_log ["host_read_commands" ]))
183- metrics ["host_write_commands" ].labels (device_name ).inc (
184- int (smart_log ["host_write_commands" ])
185- )
186- metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
187- metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
188- metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
189- metrics ["critical_warning" ].labels (device_name ).set (smart_log ["critical_warning" ])
190- metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
191- metrics ["num_err_log_entries" ].labels (device_name ).inc (
192- int (smart_log ["num_err_log_entries" ])
193- )
194- metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
195- metrics ["power_on_hours" ].labels (device_name ).inc (int (smart_log ["power_on_hours" ]))
196- metrics ["controller_busy_time" ].labels (device_name ).inc (
197- int (smart_log ["controller_busy_time" ])
198- )
199- metrics ["unsafe_shutdowns" ].labels (device_name ).inc (int (smart_log ["unsafe_shutdowns" ]))
200-
201- # NVMe reports temperature in kelvins; convert it to degrees Celsius.
202- metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
167+ for subsys in device ["Subsystems" ]:
168+ for ctrl in subsys ["Controllers" ]:
169+ for ns in ctrl ["Namespaces" ]:
170+ device_name = ns ["NameSpace" ]
171+
172+ # FIXME: This metric ought to be refactored into a "controller_info" metric,
173+ # since it contains information that is not unique to the namespace. However,
174+ # previous versions of this collector erroneously referred to namespaces, e.g.
175+ # "nvme0n1", as devices, so preserve the former behaviour for now.
176+ metrics ["device_info" ].labels (
177+ device_name ,
178+ ctrl ["ModelNumber" ],
179+ ctrl ["Firmware" ],
180+ ctrl ["SerialNumber" ].strip (),
181+ )
182+
183+ metrics ["sector_size" ].labels (device_name ).set (ns ["SectorSize" ])
184+ metrics ["physical_size" ].labels (device_name ).set (ns ["PhysicalSize" ])
185+ metrics ["used_bytes" ].labels (device_name ).set (ns ["UsedBytes" ])
186+
187+ # FIXME: The smart-log should only need to be fetched once per controller, not
188+ # per namespace. However, in order to preserve legacy metric labels, fetch it
189+ # per namespace anyway. Most consumer grade SSDs will only have one namespace.
190+ smart_log = exec_nvme_json ("smart-log" , os .path .join ("/dev" , device_name ))
191+
192+ # Various counters in the NVMe specification are 128-bit and would lose precision
193+ # if converted to a JSON number (i.e., an IEEE 754 double). Instead, nvme-cli
194+ # marshals them as strings. As such, they need to be explicitly cast to int or
195+ # float when using them in Counter metrics.
196+ metrics ["data_units_read" ].labels (device_name ).inc (
197+ int (smart_log ["data_units_read" ])
198+ )
199+ metrics ["data_units_written" ].labels (device_name ).inc (
200+ int (smart_log ["data_units_written" ])
201+ )
202+ metrics ["host_read_commands" ].labels (device_name ).inc (
203+ int (smart_log ["host_read_commands" ])
204+ )
205+ metrics ["host_write_commands" ].labels (device_name ).inc (
206+ int (smart_log ["host_write_commands" ])
207+ )
208+ metrics ["avail_spare" ].labels (device_name ).set (smart_log ["avail_spare" ] / 100 )
209+ metrics ["spare_thresh" ].labels (device_name ).set (smart_log ["spare_thresh" ] / 100 )
210+ metrics ["percent_used" ].labels (device_name ).set (smart_log ["percent_used" ] / 100 )
211+ metrics ["critical_warning" ].labels (device_name ).set (
212+ smart_log ["critical_warning" ]["value" ]
213+ )
214+ metrics ["media_errors" ].labels (device_name ).inc (int (smart_log ["media_errors" ]))
215+ metrics ["num_err_log_entries" ].labels (device_name ).inc (
216+ int (smart_log ["num_err_log_entries" ])
217+ )
218+ metrics ["power_cycles" ].labels (device_name ).inc (int (smart_log ["power_cycles" ]))
219+ metrics ["power_on_hours" ].labels (device_name ).inc (
220+ int (smart_log ["power_on_hours" ])
221+ )
222+ metrics ["controller_busy_time" ].labels (device_name ).inc (
223+ int (smart_log ["controller_busy_time" ])
224+ )
225+ metrics ["unsafe_shutdowns" ].labels (device_name ).inc (
226+ int (smart_log ["unsafe_shutdowns" ])
227+ )
228+
229+ # NVMe reports temperature in kelvins; convert it to degrees Celsius.
230+ metrics ["temperature" ].labels (device_name ).set (smart_log ["temperature" ] - 273 )
203231
204232
205233if __name__ == "__main__" :
0 commit comments