2424fi
2525
# Record when the run started (ISO-8601 with numeric UTC offset; %:z is a
# GNU date extension) and announce it on stdout and in the logfile.
start_time=$(date +"%Y-%m-%dT%H:%M:%S%:z")
echo "Starting health check script at $start_time for platform: $platform" | tee -a "$logfile"
echo "The Health Check Report has been saved in tmp/ folder." | tee -a "$logfile"

# Extract valid remote hostnames from /etc/hosts:
#   - first column must look like a dotted-quad IPv4 address,
#   - second column (the hostname) must not contain "localhost",
#   - duplicates are collapsed by sort -u.
remote_hosts=$(awk '$1 ~ /^[0-9]+(\.[0-9]+){3}$/ && $2 !~ /localhost/ {print $2}' /etc/hosts | sort -u)
3232
3333collect_from_all_hosts () {
3434 local cmd=" $1 "
3535 local file_prefix=" $2 "
3636 local out_file=" $out_dir /${file_prefix} .txt"
3737
38- > " $out_file "
38+ > " $out_file "
3939
4040 for host in $remote_hosts ; do
4141 if [ " $host " = " $( hostname) " ]; then
@@ -111,6 +111,15 @@ for i in $(ls /dev | grep -i "^sd[a-z]$"); do
111111done
112112' " drive_age"
113113
# Snapshots (by dataset)
# For every filesystem/volume, list its newest 25 snapshots (sorted by
# creation time, oldest first, so tail keeps the most recent ones).
#   - "-d 1" restricts the listing to the dataset's own snapshots; a recursive
#     listing would repeat every descendant's snapshots under each parent and
#     could push the parent's own snapshots out of the 25-line window.
#   - Dataset names are read line by line so names are never word-split.
collect_from_all_hosts '
zfs list -H -o name -t filesystem,volume 2>/dev/null | while IFS= read -r ds; do
  echo "Dataset: $ds"
  zfs list -H -t snapshot -o name -s creation -d 1 "$ds" 2>/dev/null | tail -n 25
done
' "zfs_dataset_snapshots"
122+
114123# Snapshots
115124collect_from_all_hosts '
116125zpools=$(zpool list -H -o name 2>/dev/null)
@@ -211,6 +220,10 @@ ceph health > "$out_dir/ceph/health_summary" 2>/dev/null
# Dump detailed Ceph state into the report directory; stderr is discarded so
# a missing or unreachable cluster does not clutter the console.
ceph health detail > "$out_dir/ceph/health_detail" 2>/dev/null
ceph report > "$out_dir/ceph/health_report" 2>/dev/null
ceph df > "$out_dir/ceph/health_df" 2>/dev/null

# Surface cluster problems immediately instead of only burying them in the
# report. grep's stderr is silenced so an absent health_detail file (e.g. the
# dump above failed) simply means "nothing to report".
if grep -qE 'HEALTH_WARN|HEALTH_ERR' "$out_dir/ceph/health_detail" 2>/dev/null; then
  echo "Ceph cluster has warnings or errors:" | tee -a "$logfile"
  tee -a "$logfile" < "$out_dir/ceph/health_detail"
fi
# Collect distribution details only where lsb_release is installed locally.
# Both stdout and stderr of the probe are discarded; only its exit status
# decides whether the collection runs.
if command -v lsb_release >/dev/null 2>&1; then
  collect_from_all_hosts "lsb_release -a" "lsb_release"
fi
@@ -260,7 +273,7 @@ for host in $remote_hosts; do
260273
261274 if [ $? -eq 0 ]; then
262275 echo " CTDB is running on $host . Gathering status..." | tee -a " $logfile "
263-
276+
264277 # Save CTDB output to file
265278 {
266279 echo " ===== ctdb status ($host ) ====="
@@ -279,4 +292,44 @@ done
# Tarball folder
# Archive the report directory next to itself (-C keeps only the directory's
# basename inside the archive) and delete the directory only when tar
# succeeded. ${out_dir:?} aborts rather than running "rm -rf" on an empty path.
if tar -czf "$out_dir.tar.gz" -C "$(dirname "$out_dir")" "$(basename "$out_dir")"; then
  rm -rf -- "${out_dir:?}"
fi
296+
# --- SMART health warnings ---------------------------------------------------
# Prints one warning line per problem found on each /dev/sd[a-z] disk; healthy
# disks produce no output.
# NOTE(review): this runs after the report directory has been archived and
# removed, so the warnings only reach stdout — confirm that is intended.

#######################################
# Print the trimmed text after the first ":" on the first line of smartctl
# output that contains the given label (case-insensitive).
# Arguments: $1 - smartctl output, $2 - label to look for
# Outputs:   the field value, or nothing when the label is absent
#######################################
smart_field() {
  local smart_output=$1 label=$2
  awk -F: -v label="$label" '
    index(tolower($0), tolower(label)) {
      value = $2
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }' <<<"$smart_output"
}

#######################################
# Print the RAW_VALUE (10th column of the attribute table) of one SMART
# attribute. Only the first matching row is used so the result is always a
# single token.
# Arguments: $1 - smartctl output, $2 - attribute name (2nd table column)
# Outputs:   the raw value, or nothing when the attribute is absent
#######################################
smart_attr_raw() {
  local smart_output=$1 attr_name=$2
  awk -v attr="$attr_name" \
    'tolower($2) == tolower(attr) { print $10; exit }' <<<"$smart_output"
}

#######################################
# Print warnings for one device based on its "smartctl -a" output.
# Arguments: $1 - device path (used in messages), $2 - smartctl output
# Outputs:   zero or more warning lines on stdout
#######################################
smart_device_warnings() {
  local device=$1 smart_output=$2
  local serial health realloc pending offline_uncorr crc reserved
  # Counters are compared numerically only when they are plain unsigned
  # integers; anything else (missing attribute, vendor-specific text) is
  # skipped instead of raising an arithmetic error.
  local uint_re='^[0-9]+$'

  serial=$(smart_field "$smart_output" "Serial Number")
  health=$(smart_field "$smart_output" "SMART overall-health")

  realloc=$(smart_attr_raw "$smart_output" "Reallocated_Sector_Ct")
  pending=$(smart_attr_raw "$smart_output" "Current_Pending_Sector")
  offline_uncorr=$(smart_attr_raw "$smart_output" "Offline_Uncorrectable")
  crc=$(smart_attr_raw "$smart_output" "UDMA_CRC_Error_Count")
  # NOTE(review): this reads the RAW_VALUE column like the other attributes;
  # on some SSDs the remaining-spare percentage is the normalized VALUE
  # column instead — confirm against the drives in use.
  reserved=$(smart_attr_raw "$smart_output" "Available_Reservd_Space")

  # smartctl prints the failing verdict as "FAILED!", so match on the prefix.
  if [[ "$health" == FAILED* ]]; then
    echo "[$device | $serial] SMART overall health test FAILED!"
  fi
  # 10# forces base 10 so values with leading zeros are not read as octal.
  if [[ "$realloc" =~ $uint_re ]] && (( 10#$realloc > 0 )); then
    echo "[$device | $serial] Reallocated sectors detected: $realloc"
  fi
  if [[ "$pending" =~ $uint_re ]] && (( 10#$pending > 0 )); then
    echo "[$device | $serial] Pending sectors detected: $pending"
  fi
  if [[ "$offline_uncorr" =~ $uint_re ]] && (( 10#$offline_uncorr > 0 )); then
    echo "[$device | $serial] Offline uncorrectable sectors detected: $offline_uncorr"
  fi
  if [[ "$crc" =~ $uint_re ]] && (( 10#$crc > 0 )); then
    echo "[$device | $serial] CRC interface errors detected: $crc"
  fi
  if [[ "$reserved" =~ $uint_re ]] && (( 10#$reserved < 100 )); then
    echo "[$device | $serial] SSD spare blocks low: $reserved%"
  fi
}

echo "=== SMART Health Warnings $(date) ==="

# Glob instead of parsing ls; when nothing matches the literal pattern is
# left in place, which the existence test below filters out.
for device in /dev/sd[a-z]; do
  [[ -e "$device" ]] || continue

  # "smartctl -a" already includes the identity section, so one call per
  # disk is enough for both the serial number and the health data.
  smartout=$(smartctl -a "$device" 2>/dev/null)
  smart_device_warnings "$device" "$smartout"
done
0 commit comments