Skip to content

Commit d628abc

Browse files
committed
Add support for NVMe drives in smartmon.py
Adds support for collecting SMART metrics from NVMe drives using pySMART and smartctl's JSON output. Includes updates to the deployment playbooks, tests, and dashboards.
1 parent 0a7d5b2 commit d628abc

File tree

11 files changed

+933
-471
lines changed

11 files changed

+933
-471
lines changed

etc/kayobe/ansible/deployment/get-nvme-drives.yml

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,31 @@
33
hosts: overcloud
44
gather_facts: no
55
tasks:
6-
- name: Retrieve NVMe device information
7-
ansible.builtin.command: "nvme list -o json"
8-
register: nvme_list
6+
- name: Scan for NVMe devices with smartctl
7+
ansible.builtin.command: "smartctl --scan -j"
8+
register: smartctl_scan
99
changed_when: false
1010
become: true
1111

12+
- name: Extract NVMe device paths
13+
ansible.builtin.set_fact:
14+
nvme_devices: "{{ smartctl_scan.stdout | from_json | json_query('devices[?type==`nvme`].info_name') | default([]) }}"
15+
changed_when: false
16+
17+
- name: Retrieve NVMe device information via smartctl
18+
ansible.builtin.command: "smartctl -i -j {{ item }}"
19+
register: smartctl_info
20+
loop: "{{ nvme_devices }}"
21+
loop_control:
22+
label: "{{ item }}"
23+
changed_when: false
24+
become: true
25+
when: nvme_devices | length > 0
26+
1227
- name: Parse NVMe device model names
1328
ansible.builtin.set_fact:
14-
nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}"
15-
loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}"
29+
nvme_models: "{{ nvme_models | default([]) + [item.model_name] }}"
30+
loop: "{{ smartctl_info.results | default([]) | map(attribute='stdout') | map('from_json') | selectattr('model_name', 'defined') | list }}"
1631
changed_when: false
1732

1833
- name: Set unique NVMe models as host facts

etc/kayobe/ansible/deployment/smartmon-tools.yml

Lines changed: 8 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,10 @@
22
- name: Install and set up SMART monitoring tools
33
hosts: overcloud
44
tasks:
5-
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
5+
- name: Ensure smartmontools, jq, and cron/cronie are installed
66
ansible.builtin.package:
77
name:
88
- smartmontools
9-
- nvme-cli
109
- jq
1110
- "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'cronie' }}"
1211
state: present
@@ -54,7 +53,7 @@
5453
enabled: true
5554
become: true
5655

57-
- name: Copy smartmon.py and nvmemon.sh from scripts folder
56+
- name: Copy smartmon.py from scripts folder
5857
ansible.builtin.copy:
5958
src: "{{ lookup('env', 'KAYOBE_CONFIG_PATH') }}/ansible/scripts/{{ item }}"
6059
dest: /usr/local/bin/{{ item }}
@@ -63,7 +62,6 @@
6362
mode: "0700"
6463
loop:
6564
- smartmon.py
66-
- nvmemon.sh
6765
become: true
6866

6967
- name: Set PATH Variable for cron
@@ -84,17 +82,6 @@
8482
mv -f /var/lib/docker/volumes/textfile/_data/smartmon.prom.temp /var/lib/docker/volumes/textfile/_data/smartmon.prom
8583
become: true
8684

87-
- name: Schedule cronjob to run nvmemon.sh every 5 minutes and save output to file
88-
ansible.builtin.cron:
89-
name: SMART metrics for drive monitoring using nvmemon.sh
90-
user: root
91-
minute: "*/5"
92-
job: >-
93-
umask 0022 && /usr/local/bin/nvmemon.sh >
94-
/var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp &&
95-
mv -f /var/lib/docker/volumes/textfile/_data/nvmemon.prom.temp /var/lib/docker/volumes/textfile/_data/nvmemon.prom
96-
become: true
97-
9885
- name: Remove old cronjobs if present
9986
ansible.builtin.cron:
10087
name: SMART metrics for drive monitoring using {{ item }}
@@ -104,11 +91,15 @@
10491
loop:
10592
- smartmon
10693
- nvmemon
94+
- nvmemon.sh
10795

108-
- name: Remove old smartmon.sh if present
96+
- name: Remove old monitoring scripts if present
10997
ansible.builtin.file:
110-
path: /usr/local/bin/smartmon.sh
98+
path: /usr/local/bin/{{ item }}
11199
state: absent
100+
loop:
101+
- smartmon.sh
102+
- nvmemon.sh
112103
become: true
113104

114105
- name: Gather NVMe drives and generate dwpd ratings

etc/kayobe/ansible/scripts/generate_fixtures.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22
import json
33
import re
4+
import subprocess
45
from pySMART import DeviceList
56

67
SMARTMON_ATTRS = {
@@ -63,6 +64,8 @@
6364
"critical_comp_time",
6465
}
6566

67+
SMARTCTL_PATH = "/usr/sbin/smartctl"
68+
6669
DISK_INFO = {
6770
"name",
6871
"interface",
@@ -84,6 +87,17 @@ def camel_to_snake(name):
8487
"""
8588
return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
8689

90+
def canonical_device_path(name):
    """
    Ensure device name is returned as absolute /dev path for smartctl.

    pySMART sometimes reports bare device names (e.g. 'nvme0'); smartctl on the
    CLI expects the canonical /dev path, so normalise here to avoid surprises.

    Falsy input (``None`` or an empty string) is handed back unchanged so the
    caller can decide how to treat a missing device name.
    """
    if not name:
        return name
    # Already absolute: leave untouched so the prefix is never doubled.
    if name.startswith("/dev/"):
        return name
    return f"/dev/{name}"
100+
87101
def attrs_to_dict(obj, allowed_keys):
88102
"""
89103
Build {attr: value} for every public, non-callable attribute whose
@@ -105,14 +119,52 @@ def attrs_to_dict(obj, allowed_keys):
105119
attributes[name] = value
106120
return attributes
107121

122+
def smartctl_json(device_name, device_type, timeout=60):
    """
    Execute smartctl -x -j for the given device and return the parsed JSON payload.

    The goal is to mirror the exact data smartmon.py consumes at runtime so our
    fixtures stay faithful to real hardware output.

    Args:
        device_name: Device name as reported by pySMART; bare names such as
            'nvme0' are normalised to a /dev path before invoking smartctl.
        device_type: Interface string for the device. Any non-empty value
            other than 'nvme' (case-insensitive) is forwarded to smartctl
            via ``-d``.
        timeout: Seconds to wait for smartctl before giving up, so that an
            unresponsive drive cannot hang fixture generation indefinitely.

    Returns:
        The decoded JSON document, or an empty dict when the device name is
        missing, smartctl cannot be executed, it times out, it prints
        nothing, or its output is not valid JSON.
    """
    if not device_name:
        return {}

    cmd = [SMARTCTL_PATH, "-x", "-j"]
    # No -d is passed for NVMe (or when the type is unknown); smartctl then
    # picks the device type itself.
    if device_type and device_type.lower() != "nvme":
        cmd += ["-d", device_type]
    cmd.append(canonical_device_path(device_name))

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # smartctl's exit status is a bit mask and is frequently non-zero
            # even when a usable JSON document was written, so do not raise.
            check=False,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.TimeoutExpired):
        # Best effort: a missing binary or a hung drive yields an empty
        # payload rather than aborting the whole fixture run.
        return {}

    if not result.stdout:
        return {}

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        return {}
157+
108158
for disk in DeviceList().devices:
109159

110160
fixtures = {}
111161
disk_info = attrs_to_dict(disk, DISK_INFO)
112162
if_stats = attrs_to_dict(disk.if_attributes, SMARTMON_ATTRS)
163+
smartctl_payload = smartctl_json(disk.name, disk.interface)
113164

114165
fixtures["device_info"] = disk_info
115166
fixtures["if_attributes"] = if_stats
167+
fixtures["smartctl"] = smartctl_payload
116168

117169
print(f'Disk: {disk.name}: \n')
118170
print(json.dumps(fixtures, indent=2, default=str))

etc/kayobe/ansible/scripts/nvmemon.sh

Lines changed: 0 additions & 150 deletions
This file was deleted.

0 commit comments

Comments
 (0)