Skip to content

Commit 8c31d47

Browse files
Modified the health-check script to add warnings for Ceph health and drive health, and to collect snapshot info by dataset in addition to by pool.
1 parent 5fb8259 commit 8c31d47

File tree

1 file changed

+58
-5
lines changed

1 file changed

+58
-5
lines changed

health-check.sh

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,18 @@ else
2424
fi
2525

2626
start_time=$(date +"%Y-%m-%dT%H:%M:%S%:z")
27-
echo "Starting health check script at $start_time for platform: $platform" | tee -a "$logfile"
27+
echo "Starting health check script at $start_time for platform: $platform" | tee -a "$logfile"
2828
echo "The Health Check Report has been saved in tmp/ folder." | tee -a "$logfile"
2929

30-
# Extract valid remote hostnames from /etc/hosts
30+
# Extract valid remote hostnames from /etc/hosts
3131
remote_hosts=$(awk '$1 ~ /^[0-9]+(\.[0-9]+){3}$/ && $2 !~ /localhost/ {print $2}' /etc/hosts | sort -u)
3232

3333
collect_from_all_hosts() {
3434
local cmd="$1"
3535
local file_prefix="$2"
3636
local out_file="$out_dir/${file_prefix}.txt"
3737

38-
> "$out_file"
38+
> "$out_file"
3939

4040
for host in $remote_hosts; do
4141
if [ "$host" = "$(hostname)" ]; then
@@ -111,6 +111,15 @@ for i in $(ls /dev | grep -i "^sd[a-z]$"); do
111111
done
112112
' "drive_age"
113113

114+
# Snapshots (by dataset)
# For every ZFS filesystem/volume, print a "Dataset: <name>" header followed by
# the last 25 snapshot names (sorted by creation time) found under it.
# The listing is recursive (-r), so a parent dataset's section also includes
# snapshots that belong to its descendants.
# Dataset names are read line by line instead of being word-split from a
# variable, so names containing spaces or glob characters are handled intact.
collect_from_all_hosts '
zfs list -H -o name -t filesystem,volume 2>/dev/null |
while IFS= read -r ds; do
  echo "Dataset: $ds"
  zfs list -H -t snapshot -o name -s creation -r "$ds" 2>/dev/null | tail -n 25
done
' "zfs_dataset_snapshots"
122+
114123
# Snapshots
115124
collect_from_all_hosts '
116125
zpools=$(zpool list -H -o name 2>/dev/null)
@@ -211,6 +220,10 @@ ceph health > "$out_dir/ceph/health_summary" 2>/dev/null
211220
ceph health detail > "$out_dir/ceph/health_detail" 2>/dev/null
212221
ceph report > "$out_dir/ceph/health_report" 2>/dev/null
213222
ceph df > "$out_dir/ceph/health_df" 2>/dev/null
223+
#######################################
# Report Ceph health problems on the console and in the run log.
# Globals:
#   logfile (appended to when a problem is reported)
# Arguments:
#   $1 - path to the saved `ceph health detail` output
# Outputs:
#   A banner line followed by the file contents on stdout (and in $logfile)
#   when the file mentions HEALTH_WARN or HEALTH_ERR; nothing otherwise.
#######################################
report_ceph_health() {
  local detail_file="$1"

  # -s keeps grep quiet when the detail file is missing or unreadable, so an
  # absent capture does not leak a raw grep error into the report.
  if grep -sqE 'HEALTH_WARN|HEALTH_ERR' "$detail_file"; then
    {
      echo "Ceph cluster has warnings or errors:"
      cat "$detail_file"
    } | tee -a "$logfile"
  fi
}

report_ceph_health "$out_dir/ceph/health_detail"
214227
if command -v lsb_release &> /dev/null; then
215228
collect_from_all_hosts "lsb_release -a" "lsb_release"
216229
fi
@@ -260,7 +273,7 @@ for host in $remote_hosts; do
260273

261274
if [ $? -eq 0 ]; then
262275
echo "CTDB is running on $host. Gathering status..." | tee -a "$logfile"
263-
276+
264277
# Save CTDB output to file
265278
{
266279
echo "===== ctdb status ($host) ====="
@@ -279,4 +292,44 @@ done
279292
# Tarball folder
280293
if tar -czf "$out_dir.tar.gz" -C "$(dirname "$out_dir")" "$(basename "$out_dir")"; then
281294
rm -rf "$out_dir"
282-
fi
295+
fi
296+
297+
echo "=== SMART Health Warnings $(date) ==="
298+
299+
for dev in $(ls /dev | grep -E "^sd[a-z]$"); do
300+
device="/dev/$dev"
301+
302+
# Collect info
303+
serial=$(smartctl -i "$device" 2>/dev/null | grep -i "Serial Number" | awk -F: '{print $2}' | xargs)
304+
smartout=$(smartctl -a "$device" 2>/dev/null)
305+
306+
# Quick health assessment
307+
health=$(echo "$smartout" | grep -i "SMART overall-health" | awk -F: '{print $2}' | xargs)
308+
309+
# Extract key values
310+
realloc=$(echo "$smartout" | grep -i "Reallocated_Sector_Ct" | awk '{print $10}')
311+
pending=$(echo "$smartout" | grep -i "Current_Pending_Sector" | awk '{print $10}')
312+
offline_uncorr=$(echo "$smartout" | grep -i "Offline_Uncorrectable" | awk '{print $10}')
313+
crc=$(echo "$smartout" | grep -i "UDMA_CRC_Error_Count" | awk '{print $10}')
314+
reserved=$(echo "$smartout" | grep -i "Available_Reservd_Space" | awk '{print $10}')
315+
316+
# Print warnings only
317+
if [[ "$health" == "FAILED" ]]; then
318+
echo "[$device | $serial] SMART overall health test FAILED!"
319+
fi
320+
if [[ -n "$realloc" && "$realloc" -gt 0 ]]; then
321+
echo "[$device | $serial] Reallocated sectors detected: $realloc"
322+
fi
323+
if [[ -n "$pending" && "$pending" -gt 0 ]]; then
324+
echo "[$device | $serial] Pending sectors detected: $pending"
325+
fi
326+
if [[ -n "$offline_uncorr" && "$offline_uncorr" -gt 0 ]]; then
327+
echo "[$device | $serial] Offline uncorrectable sectors detected: $offline_uncorr"
328+
fi
329+
if [[ -n "$crc" && "$crc" -gt 0 ]]; then
330+
echo "[$device | $serial] CRC interface errors detected: $crc"
331+
fi
332+
if [[ -n "$reserved" && "$reserved" -lt 100 ]]; then
333+
echo "[$device | $serial] SSD spare blocks low: $reserved%"
334+
fi
335+
done

0 commit comments

Comments
 (0)