Skip to content

Commit dad7292

Browse files
committed
Update smart metrics to include rated DWPD
1 parent 6e71ff1 commit dad7292

File tree

4 files changed

+173
-20
lines changed

4 files changed

+173
-20
lines changed
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
---
# Play 1 of 2: record, per overcloud host, the NVMe model names reported by
# `nvme list -o json`. The second play reads `unique_nvme_models` from every
# host through hostvars, so the fact must be set on each host individually.
- name: Gather unique NVMe disk models and generate a prepopulated variable template
  hosts: overcloud
  gather_facts: false
  tasks:
    - name: Get NVMe device information
      command: "nvme list -o json"
      register: nvme_list
      changed_when: false
      become: true

    # Build the list in one expression instead of appending in a loop:
    # - a host without NVMe devices gets an empty list rather than an
    #   undefined variable,
    # - a previously set nvme_models fact is replaced, not appended to,
    # - no json_query (jmespath) dependency is needed.
    - name: Parse NVMe device model names
      set_fact:
        nvme_models: >-
          {{ (nvme_list.stdout | default('{}', true) | from_json).get('Devices', [])
             | selectattr('ModelNumber', 'defined')
             | map(attribute='ModelNumber')
             | list }}

    # Deliberately NOT run_once: with run_once the value computed on the first
    # host would be applied to every host, hiding models that only exist on
    # other hosts.
    - name: Set gathered NVMe models as host facts
      set_fact:
        unique_nvme_models: "{{ nvme_models | unique }}"

# Play 2 of 2: merge the per-host model lists gathered on the overcloud hosts
# and rewrite the section of stackhpc-monitoring.yml that sits between the
# "# BEGIN DWPD Ratings" and "# END DWPD Ratings" markers. Ratings already
# present in stackhpc_dwpd_ratings are kept; unknown models get a placeholder.
- name: Update stackhpc-monitoring.yml with DWPD ratings
  hosts: localhost
  gather_facts: false
  tasks:
    - name: Aggregate unique NVMe models from all hosts
      set_fact:
        all_nvme_models: "{{ all_nvme_models | default([]) | union(hostvars[item].unique_nvme_models | default([])) }}"
      loop: "{{ groups['overcloud'] }}"

    # default([]) covers an empty overcloud group, where the loop above never
    # runs and all_nvme_models would otherwise be undefined.
    - name: Ensure unique NVMe models
      set_fact:
        all_nvme_models: "{{ all_nvme_models | default([]) | unique }}"

    # Always define dwpd_lookup. On a first run stackhpc_dwpd_ratings is not
    # set yet (or is null/empty), and the template below must still render.
    - name: Create a dictionary for quick lookup of DWPD ratings
      set_fact:
        dwpd_lookup: "{{ stackhpc_dwpd_ratings | default([], true) | items2dict(key_name='model_name', value_name='rated_dwpd') }}"

    # With no models, emit an explicit empty list: a bare
    # "stackhpc_dwpd_ratings:" key would load as null.
    - name: Generate new DWPD ratings section
      set_fact:
        new_dwpd_section: |
          {% if all_nvme_models | length > 0 %}
          stackhpc_dwpd_ratings:
          {% for model in all_nvme_models %}
            - model_name: "{{ model }}"
              rated_dwpd: "{{ dwpd_lookup[model] if model in dwpd_lookup else '#FILL ME IN' }}"
          {% endfor %}
          {% else %}
          stackhpc_dwpd_ratings: []
          {% endif %}

    - name: Read the current stackhpc-monitoring.yml file
      slurp:
        src: "{{ playbook_dir }}/../stackhpc-monitoring.yml"
      register: monitoring_file_content

    - name: Ensure markers exist in the file
      set_fact:
        markers_exist: "{{ ('# BEGIN DWPD Ratings' in old_content) and ('# END DWPD Ratings' in old_content) }}"
      vars:
        old_content: "{{ monitoring_file_content.content | b64decode }}"

    - name: Fail if markers do not exist
      fail:
        msg: "The stackhpc-monitoring.yml file does not contain the required markers: # BEGIN DWPD Ratings and # END DWPD Ratings"
      when: not (markers_exist | bool)

    # Everything before the BEGIN marker and after the END marker is kept
    # verbatim; only the text between the markers is regenerated.
    - name: Update the content with new DWPD ratings section
      set_fact:
        updated_monitoring_content: |
          {% set old_content = monitoring_file_content.content | b64decode %}
          {% set before_section = old_content.split('# BEGIN DWPD Ratings')[0] %}
          {% set after_section = old_content.split('# END DWPD Ratings')[1] %}
          {{ before_section }}# BEGIN DWPD Ratings
          {{ new_dwpd_section }}
          # END DWPD Ratings{{ after_section }}
      when: markers_exist | bool

    - name: Write the updated content back to stackhpc-monitoring.yml
      copy:
        content: "{{ updated_monitoring_content }}"
        dest: "{{ playbook_dir }}/../stackhpc-monitoring.yml"
        backup: true
      when: markers_exist | bool

    - name: Print new DWPD ratings section
      debug:
        msg:
          - "{{ new_dwpd_section }}"
          - "PLEASE UPDATE stackhpc-monitoring.yml IF NEEDED"

etc/kayobe/ansible/scripts/nvmemon.sh

Lines changed: 55 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,31 @@ if ! command -v nvme >/dev/null 2>&1; then
2121
exit 1
2222
fi
2323

# Set path to the DWPD ratings file
dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.yml"

# Load rated DWPD values into the global associative array rated_dwpd
# (model name -> rated DWPD).
#
# The ratings file is parsed with jq, so it must contain a JSON array of
# objects with "model_name" and "rated_dwpd" fields. Trailing whitespace is
# stripped from both fields. Entries with an empty model name or a
# non-numeric rating (for example an unfilled placeholder) are skipped, so
# the caller's default applies to them.
#
# Diagnostics go to stderr: this function runs outside the format_output
# pipeline, so anything printed on stdout would end up in the metrics output.
load_dwpd_ratings() {
  declare -gA rated_dwpd
  local line key value

  if [[ ! -f "$dwpd_file" ]]; then
    echo "Warning: DWPD ratings file not found at $dwpd_file. Defaulting to 1 DWPD." >&2
    return 0
  fi

  while IFS= read -r line; do
    key="$(echo "$line" | jq -r '.model_name')"
    value="$(echo "$line" | jq -r '.rated_dwpd')"
    # Strip trailing spaces
    key="$(echo "$key" | sed 's/[[:space:]]*$//')"
    value="$(echo "$value" | sed 's/[[:space:]]*$//')"

    # An empty string is not a valid associative-array subscript.
    if [[ -z "$key" ]]; then
      continue
    fi

    # Only keep values that can be emitted as a metric sample.
    if [[ ! "$value" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
      echo "Warning: ignoring non-numeric DWPD rating '$value' for model '$key'." >&2
      continue
    fi

    rated_dwpd["$key"]="$value"
  done < <(jq -c '.[]' "$dwpd_file")
}

load_dwpd_ratings
45+
46+
# Debugging purpose: print the array
47+
# echo "${rated_dwpd[@]}"
48+
2449
output_format_awk="$(
2550
cat <<'OUTPUTAWK'
2651
BEGIN { v = "" }
@@ -45,57 +70,68 @@ nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
4570
echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output
4671

4772
# Get devices (DevicePath and PhysicalSize)
48-
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')"
73+
device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber}')"
74+
75+
# Convert device_info to an array
76+
device_info_array=()
77+
while IFS= read -r line; do
78+
device_info_array+=("$line")
79+
done <<< "$device_info"
4980

5081
# Loop through the NVMe devices
51-
echo "$device_info" | while read -r device_data; do
52-
device=$(echo "$device_data" | jq -r '.DevicePath')
82+
for device_data in "${device_info_array[@]}"; do
83+
device="$(echo "$device_data" | jq -r '.DevicePath')"
5384
json_check="$(nvme smart-log -o json "${device}")"
5485
disk="${device##*/}"
86+
model_name="$(echo "$device_data" | jq -r '.ModelNumber')"
5587

56-
physical_size=$(echo "$device_data" | jq -r '.PhysicalSize')
57-
echo "physical_size_bytes{device=\"${disk}\"} ${physical_size}"
88+
physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')"
89+
echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\"} ${physical_size}"
5890

5991
# The temperature value in JSON is in Kelvin, we want Celsius
6092
value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
61-
echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}"
93+
echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\"} ${value_temperature}"
94+
# Get the rated DWPD from the dictionary or default to 1 if not found.
# The keys of rated_dwpd have trailing whitespace stripped when they are
# loaded, so strip it from the lookup key as well. An empty key is skipped
# because it is not a valid associative-array subscript.
model_key="${model_name%"${model_name##*[![:space:]]}"}"
value_rated_dwpd=1
if [[ -n "$model_key" && -n "${rated_dwpd[$model_key]:-}" ]]; then
  value_rated_dwpd="${rated_dwpd[$model_key]}"
fi
echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\"} ${value_rated_dwpd}"
6298

6399
value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
64-
echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}"
100+
echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare}"
65101

66102
value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
67-
echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}"
103+
echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare_threshold}"
68104

69105
value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
70-
echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}"
106+
echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_percentage_used}"
71107

72108
value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
73-
echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}"
109+
echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\"} ${value_critical_warning}"
74110

75111
value_media_errors="$(echo "$json_check" | jq '.media_errors')"
76-
echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}"
112+
echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\"} ${value_media_errors}"
77113

78114
value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
79-
echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}"
115+
echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\"} ${value_num_err_log_entries}"
80116

81117
value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
82-
echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}"
118+
echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_cycles}"
83119

84120
value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
85-
echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}"
121+
echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_on_hours}"
86122

87123
value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
88-
echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}"
124+
echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\"} ${value_controller_busy_time}"
89125

90126
value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
91-
echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}"
127+
echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_written}"
92128

93129
value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
94-
echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}"
130+
echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_read}"
95131

96132
value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
97-
echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}"
133+
echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_read_commands}"
98134

99135
value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
100-
echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}"
136+
echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_write_commands}"
101137
done | format_output

etc/kayobe/ansible/smartmon-tools.yml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
---
22
- hosts: overcloud
3-
43
tasks:
54
- name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
65
package:
@@ -49,3 +48,21 @@
4948
- smartmon
5049
- nvmemon
5150
become: yes
51+
- name: Ensure the DWPD Ratings directory exists
  file:
    path: /opt/kayobe/etc/monitoring
    state: directory
    mode: "0755"
  when: stackhpc_dwpd_ratings is defined
  become: true

# nvmemon.sh reads this file with `jq -c '.[]'` and expects each element to
# carry "model_name" and "rated_dwpd", so the list is written as JSON (which
# is also valid YAML, hence the unchanged .yml path). A null value is written
# as an empty list.
- name: Create a DWPD ratings file
  copy:
    content: "{{ stackhpc_dwpd_ratings | default([], true) | to_json }}"
    dest: /opt/kayobe/etc/monitoring/dwpd_ratings.yml
    mode: "0644"
  when: stackhpc_dwpd_ratings is defined
  become: true

etc/kayobe/stackhpc-monitoring.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,6 @@ stackhpc_enable_os_capacity: true
2323
# Whether TLS certificate verification is enabled for the OpenStack Capacity
2424
# exporter during Keystone authentication.
2525
stackhpc_os_capacity_openstack_verify: true
26+
27+
# BEGIN DWPD Ratings
28+
# END DWPD Ratings

0 commit comments

Comments
 (0)