From 3f55af548dbdffedcdfdb7d82f628faa78afcd52 Mon Sep 17 00:00:00 2001
From: Yevgenii Yevtushenko
Date: Sat, 7 May 2016 10:08:13 +0300
Subject: [PATCH] Add check_smartmon for checking S.M.A.R.T. status.

This Nagios check allows checking the S.M.A.R.T. status of disks attached
directly or attached through a RAID controller. The supported controllers
are cciss (hpsa) and megaraid-compatible controllers. To determine which
disks are attached to a controller, the RAID utility (megacli or
hpacucli/hpssacli) is required; otherwise no checks are added.

lib/facter/nagios_smartmon.rb adds the fact, which contains a hash with
information about the controller and disks.

Examples:

Two SATA disks without a RAID controller
{
  0=>{"dev"=>"sda", "controller"=>"ata"},
  1=>{"dev"=>"sdb", "controller"=>"ata"}
}

Single SAS disk attached to a MegaRAID controller (ServeRAID M5110e)
{
  0=>{"dev"=>"sda", "controller"=>"megaraid", "port"=>"8", "interface"=>"SAS"}
}

Six SATA disks (two arrays) attached to a MegaRAID controller (ServeRAID M5110e)
{
  5=>{"port"=>"5", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"},
  0=>{"port"=>"0", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"},
  1=>{"port"=>"1", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"},
  2=>{"port"=>"2", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"},
  3=>{"port"=>"3", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"},
  4=>{"port"=>"4", "interface"=>"SATA", "dev"=>"sda", "controller"=>"megaraid"}
}

Four SAS disks with a Smart Array P840
{
  0=>{"port"=>"0", "dev"=>"sda", "controller"=>"cciss", "interface"=>nil},
  1=>{"port"=>"1", "dev"=>"sda", "controller"=>"cciss", "interface"=>nil},
  2=>{"port"=>"2", "dev"=>"sda", "controller"=>"cciss", "interface"=>nil},
  3=>{"port"=>"3", "dev"=>"sda", "controller"=>"cciss", "interface"=>nil}
}

The information from the ::nagios_smartmon fact is used to create the Nagios
services and NRPE checks. For this to work, Puppet 3.x clients must have the
following configuration option in puppet.conf:

[agent]
stringify_facts = false

Set this option before using check_smartmon. Details:
https://docs.puppet.com/puppet/latest/reference/lang_facts_and_builtin_vars.html#handling-boolean-facts-in-older-puppet-versions
---
 lib/facter/nagios_smartmon.rb          | 191 +++++
 manifests/check/smartmon.pp            |  49 +++
 manifests/client.pp                    |   3 +
 manifests/server.pp                    |   3 +
 templates/plugins/check_smartmon       | 446 +++++++++++++++++++++++++
 templates/plugins/smartmon-sudoers.erb |   2 +
 6 files changed, 694 insertions(+)
 create mode 100644 lib/facter/nagios_smartmon.rb
 create mode 100644 manifests/check/smartmon.pp
 create mode 100644 templates/plugins/check_smartmon
 create mode 100644 templates/plugins/smartmon-sudoers.erb

diff --git a/lib/facter/nagios_smartmon.rb b/lib/facter/nagios_smartmon.rb
new file mode 100644
index 00000000..abf0dfd4
--- /dev/null
+++ b/lib/facter/nagios_smartmon.rb
@@ -0,0 +1,191 @@
+# This facter script adds the fact "nagios_smartmon": a hash that contains
+# information about connected disks, controllers, their ports, etc.
+# This information can be passed to the smartctl utility for checking the
+# SMART status.
+#
+# Example of hashes:
+#
+# SATA disks attached directly
+# {
+#   0=>{"dev"=>"sda", "controller"=>"ata"},
+#   1=>{"dev"=>"sdb", "controller"=>"ata"}
+# }
+#
+# SATA disks connected to the LSI MegaRAID controller
+# {
+#   5=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"1", "dev"=>"sda"},
+#   0=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"2", "dev"=>"sda"},
+#   1=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"0", "dev"=>"sda"},
+#   2=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"3", "dev"=>"sda"},
+#   3=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"4", "dev"=>"sda"},
+#   4=>{"interface"=>"SATA", "controller"=>"megaraid", "port"=>"5", "dev"=>"sda"}
+# }
+#
+# A SAS disk connected to the LSI MegaRAID controller
+# {0=>{"dev"=>"sda", "interface"=>"SAS", "port"=>"8", "controller"=>"megaraid"}}
+
+
+# Check if the RAID controller utility is present and get the full path to it.
+def getRaidUtil(utilNames)
+  raidUtil = nil
+
+  utilNames.each do |name|
+    if FileTest.exists?(name)
+      raidUtil = name
+    end
+  end
+
+  return raidUtil
+end
+
+# Controller megaraid
+def getPhysicalDisksPorts_megaraid
+  # List of possible names of the RAID utility
+  utilNames = [
+    '/usr/sbin/megacli',
+    '/usr/sbin/MegaCli',
+    '/usr/sbin/MegaCli64'
+  ]
+
+  # Get the full path to the RAID utility
+  raidUtil = getRaidUtil(utilNames)
+
+  # Check the connected ports only if the RAID utility is present.
+  if raidUtil
+
+    # Get the list of connected ports.
+    physicalDisksPorts = Facter::Core::Execution.exec("#{raidUtil} -PDList -Aall | awk '/Device\ Id/{print $3}'")
+    return physicalDisksPorts
+
+  # Else return nil
+  else
+    return nil
+  end
+end
+
+# This method checks the interface to which the disk is connected.
+# This is needed with the MegaRAID controllers in CentOS 6: smartctl 5.43
+# requires "sat+megaraid,N" in case of a SATA disk and just "megaraid,N" in
+# case of SAS.
+def checkDiskInterface(port)
+  # List of possible names of the RAID utility
+  utilNames = [
+    '/usr/sbin/megacli',
+    '/usr/sbin/MegaCli',
+    '/usr/sbin/MegaCli64'
+  ]
+
+  # Get the full path to the RAID utility
+  raidUtil = getRaidUtil(utilNames)
+
+  # Get the disk interface (SATA/SAS)
+  diskInterface = Facter::Core::Execution.exec("#{raidUtil} -PDList -aALL | grep -e '^Device Id: #{port}' -A 10 | awk '/PD Type:/{print $3}'")
+end
+
+
+# Controller cciss (hpsa)
+def getPhysicalDisksPorts_cciss
+  # List of possible names of the RAID utility
+  utilNames = [
+    '/usr/sbin/hpssacli',
+    '/usr/sbin/hpacucli'
+  ]
+
+  # Get the full path to the RAID utility
+  raidUtil = getRaidUtil(utilNames)
+
+  # Check the connected ports only if the RAID utility is present.
+  if raidUtil
+
+    # Get the slot of the SmartArray controller. This is required for checking the connected ports.
+    hpsaSlot = Facter::Core::Execution.exec("#{raidUtil} controller all show status | awk '/Slot/{print $6}'")
+
+    # Get the list of connected ports.
+    physicalDisksPorts = Facter::Core::Execution.exec("#{raidUtil} controller slot=#{hpsaSlot} physicaldrive all show status | awk '/bay/{ gsub(\",\",\"\"); print (\$6-1)}'")
+    return physicalDisksPorts
+
+  # Else return nil
+  else
+    return nil
+  end
+end
+
+# Get the list of connected disks and their attributes (name, port, interface).
+def getDisks(controller)
+
+  # Split the comma-separated list of block devices into an array.
+  blockdevices = Facter.value(:blockdevices).split(",")
+
+  # Delete the CD drive from the array of block devices.
+  # TODO: delete all CD drives (sr*) and virtual drives (vd*)
+  blockdevices.delete('sr0')
+
+  disks = {}
+  diskInterface = nil
+
+  # Controller "ata" in smartmontools terminology means that there is no
+  # hardware RAID controller and the disks are connected directly to the
+  # (S)ATA ports.
+  if controller == "ata"
+    i = 0
+
+    # Add all blockdevices to the "disks" hash.
+    blockdevices.each do |blockdevice|
+      disks[i] = {
+        "dev" => blockdevice,
+        "controller" => controller
+      }
+      i += 1
+    end
+  else
+
+    # Check the connected ports using the RAID controller utility (if present).
+    # to_s guards against a nil return when no RAID utility is installed.
+    ports = send("getPhysicalDisksPorts_#{controller}").to_s.split("\n")
+
+    # If the controller returned a non-empty list of ports, fill the "disks"
+    # hash. Otherwise add nothing: without the RAID controller utility there
+    # is no way to check which ports the disks are connected to.
+    unless ports.empty?
+      i = 0
+
+      # Add every disk connected to the RAID controller as a separate entry in
+      # the "disks" hash. smartctl needs a block device for checking the
+      # SMART status, so pass the first block device we have to the
+      # smartctl utility.
+      ports.each do |port|
+
+        # For the LSI MegaRAID controller we have to check the interface of the
+        # disk. It may be SAS or SATA.
+        if controller == "megaraid"
+          diskInterface = checkDiskInterface(port)
+        end
+
+        disks[i] = {
+          "dev" => blockdevices[0],
+          "controller" => controller,
+          "port" => port,
+          "interface" => diskInterface
+        }
+        i += 1
+      end
+    end
+  end
+
+  return disks
+end
+
+Facter.add(:nagios_smartmon) do
+  setcode do
+    # Check if there is an LSI MegaRAID controller
+    if Facter.value(:nagios_pci_megaraid_sas)
+      getDisks("megaraid")
+    # Check if there is an HP SmartArray controller
+    elsif Facter.value(:nagios_pci_hpsa)
+      getDisks("cciss")
+    # Else use the "ata" driver
+    else
+      getDisks("ata")
+    end
+  end
+end
diff --git a/manifests/check/smartmon.pp b/manifests/check/smartmon.pp
new file mode 100644
index 00000000..c64ed61a
--- /dev/null
+++ b/manifests/check/smartmon.pp
@@ -0,0 +1,49 @@
+class nagios::check::smartmon (
+  $package                  = 'smartmontools',
+  $ensure                   = undef,
+  $args                     = '',
+  $check_title              = $::nagios::client::host_name,
+  $servicegroups            = undef,
+  $check_period             = $::nagios::client::service_check_period,
+  $contact_groups           = $::nagios::client::service_contact_groups,
+  $first_notification_delay = $::nagios::client::first_notification_delay,
+  $max_check_attempts       = $::nagios::client::service_max_check_attempts,
+  $notification_period      = $::nagios::client::service_notification_period,
+  $use                      = $::nagios::client::service_use,
+) {
+
+  # Service specific script, taken from:
+  file { "${nagios::client::plugin_dir}/check_smartmon":
+    ensure  => $ensure,
+    owner   => 'root',
+    group   => 'root',
+    mode    => '0755',
+    content => template("${module_name}/plugins/check_smartmon"),
+  }
+
+  # The check is executed via sudo
+  file { '/etc/sudoers.d/nagios_check_smartmon':
+    ensure  => $ensure,
+    owner   => 'root',
+    group   => 'root',
+    mode    => '0440',
+    # We customize the user, the nagios plugin dir and a few other things
+    content => template("${module_name}/plugins/smartmon-sudoers.erb"),
+  }
+
+  ensure_packages($package)
+
+  nagios::client::nrpe_file { 'check_smartmon':
+    ensure => $ensure,
+    plugin => 'check_smartmon',
+    args   => '-d /dev/$ARG1$ -i $ARG2$',
+  }
+
+  $disks = $::nagios_smartmon
+  $defaults = {
+    ensure => $ensure,
+  }
+  # Generate resources for each physical disk
+  create_resources('nagios::check::smartmon::disk', $disks, $defaults)
+ +} diff --git a/manifests/client.pp b/manifests/client.pp index 53838e3c..b4d11348 100644 --- a/manifests/client.pp +++ b/manifests/client.pp @@ -172,6 +172,9 @@ } } if $::nagios_postgres { class { '::nagios::check::postgres': } } + if $::nagios_smartmon and ($::is_virtual != true) { + class { '::nagios::check::smartmon': } + } } # With selinux, some nrpe plugins require additional rules to work diff --git a/manifests/server.pp b/manifests/server.pp index 9ceb0b60..1729bd29 100644 --- a/manifests/server.pp +++ b/manifests/server.pp @@ -819,6 +819,9 @@ nagios_command {'check_nrpe_mountpoints': command_line => "${nrpe} -c check_mountpoints", } + nagios_command {'check_nrpe_smartmon': + command_line => "${nrpe} -c check_smartmon -a \$ARG1\$ \$ARG2\$", + } # Nagios contacts and contactgroups # Taken from contacts.cfg diff --git a/templates/plugins/check_smartmon b/templates/plugins/check_smartmon new file mode 100644 index 00000000..971a57a4 --- /dev/null +++ b/templates/plugins/check_smartmon @@ -0,0 +1,446 @@ +#!/usr/bin/perl -w +# Check SMART status of ATA/SCSI disks, returning any usable metrics as perfdata. +# For usage information, run ./check_smart -h +# +# This script was created under contract for the US Government and is therefore Public Domain +# +# Changes and Modifications +# ========================= +# Feb 3, 2009: Kurt Yoder - initial version of script (rev 1.0) +# Jul 8, 2013: Claudio Kuenzler - support hardware raids like megaraid (rev 2.0) +# Jul 9, 2013: Claudio Kuenzler - update help output (rev 2.1) +# Oct 11, 2013: Claudio Kuenzler - making the plugin work on FreeBSD (rev 3.0) +# Oct 11, 2013: Claudio Kuenzler - allowing -i sat (SATA on FreeBSD) (rev 3.1) +# Nov 4, 2013: Claudio Kuenzler - works now with CCISS on FreeBSD (rev 3.2) +# Nov 4, 2013: Claudio Kuenzler - elements in grown defect list causes warning (rev 3.3) +# Nov 6, 2013: Claudio Kuenzler - add threshold option "bad" (-b) (rev 4.0) +# Nov 7, 2013: Claudio Kuenzler - modified help (rev 4.0) +# Nov 7, 2013: Claudio Kuenzler - bugfix in threshold logic (rev 4.1) +# Mar 19, 2014: Claudio Kuenzler - bugfix in defect list perfdata (rev 4.2) +# Apr 22, 2014: Jerome Lauret - implemented -g to do a global lookup (rev 5.0) +# Apr 25, 2014: Claudio Kuenzler - cleanup, merge Jeromes code, perfdata output fix (rev 5.1) +# May 5, 2014: Caspar Smit - Fixed output bug in global check / issue #3 (rev 5.2) +# Feb 4, 2015: Caspar Smit and cguadall - Allow detection of more than 26 devices / issue #5 (rev 5.3) +# Feb 5, 2015: Bastian de Groot - Different ATA vs. 
SCSI lookup (rev 5.4) +# Feb 11, 2015: Josh Behrends - Allow script to run outside of nagios plugins dir / wiki url update (rev 5.5) +# Feb 11, 2015: Claudio Kuenzler - Allow script to run outside of nagios plugins dir for FreeBSD too (rev 5.5) +# Mar 12, 2015: Claudio Kuenzler - Change syntax of -g parameter (regex is now awaited from input) (rev 5.6) + +use strict; +use Getopt::Long; + +use File::Basename qw(basename); +my $basename = basename($0); + +my $revision = '$Revision: 5.6 $'; + +use FindBin; +use lib $FindBin::Bin; +BEGIN { + push @INC,'/usr/lib/nagios/plugins','/usr/lib64/nagios/plugins','/usr/local/libexec/nagios'; +} +use utils qw(%ERRORS &print_revision &support &usage); + +$ENV{'PATH'}='/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin:/usr/local/sbin'; +$ENV{'BASH_ENV'}=''; +$ENV{'ENV'}=''; + +use vars qw($opt_b $opt_d $opt_g $opt_debug $opt_h $opt_i $opt_v); +Getopt::Long::Configure('bundling'); +GetOptions( + "debug" => \$opt_debug, + "b=i" => \$opt_b, "bad=i" => \$opt_b, + "d=s" => \$opt_d, "device=s" => \$opt_d, + "g=s" => \$opt_g, "global=s" => \$opt_g, + "h" => \$opt_h, "help" => \$opt_h, + "i=s" => \$opt_i, "interface=s" => \$opt_i, + "v" => \$opt_v, "version" => \$opt_v, +); + +if ($opt_v) { + print_revision($basename,$revision); + exit $ERRORS{'OK'}; +} + +if ($opt_h) { + print_help(); + exit $ERRORS{'OK'}; +} + +my ($device, $interface) = qw//; +if ($opt_d || $opt_g ) { + unless($opt_i){ + print "must specify an interface for $opt_d using -i/--interface!\n\n"; + print_help(); + exit $ERRORS{'UNKNOWN'}; + } + + # list of devices for a loop + my(@dev); + + if ( $opt_d ){ + # normal mode - push opt_d on the list of devices + push(@dev,$opt_d); + } else { + # glob all devices - try '?' first + @dev =glob($opt_g); + } + + foreach my $opt_dl (@dev){ + warn "Found $opt_dl\n" if $opt_debug; + if (-b $opt_dl || -c $opt_dl){ + $device .= $opt_dl.":"; + + } else { + warn "$opt_dl is not a valid block/character special device!\n\n" if $opt_debug; + } + } + + if ( ! defined($device) ){ + print "Could not find any valid block/character special device for ". + ($opt_d?"device $opt_d ":"pattern $opt_g")." 
!\n\n"; + exit $ERRORS{'UNKNOWN'}; + } + + # Allow all device types currently supported by smartctl + # See http://www.smartmontools.org/wiki/Supported_RAID-Controllers + if ($opt_i =~ m/(ata|scsi|3ware|areca|hpt|cciss|megaraid|sat)/) { + $interface = $opt_i; + } else { + print "invalid interface $opt_i for $opt_d!\n\n"; + print_help(); + exit $ERRORS{'UNKNOWN'}; + } +} + + +if ($device eq "") { + print "must specify a device!\n\n"; + print_help(); + exit $ERRORS{'UNKNOWN'}; +} + + +my $smart_command = 'sudo smartctl'; +my $exit_status = 'OK'; +my $exit_status_local = 'OK'; +my $status_string = ''; +my $perf_string = ''; +my $Terminator = ' --- '; + + +foreach $device ( split(":",$device) ){ + my @error_messages = qw//; + my($status_string_local)=''; + my($tag,$label); + $exit_status_local = 'OK'; + + if ($opt_g){ + # we had a pattern based on $opt_g + $tag = $device; + $tag =~ s/$opt_g//; + $label = "[$device] - "; + } else { + # we had a device specified using $opt_d (traditional) + $label = ""; + $tag = $device; + } + + + warn "###########################################################\n" if $opt_debug; + warn "(debug) CHECK 1: getting overall SMART health status for $tag \n" if $opt_debug; + warn "###########################################################\n\n\n" if $opt_debug; + + my $full_command = "$smart_command -d $interface -H $device"; + warn "(debug) executing:\n$full_command\n\n" if $opt_debug; + + my @output = `$full_command`; + warn "(debug) output:\n@output\n\n" if $opt_debug; + + my $output_mode = ""; + # parse ata output, looking for "health status: passed" + my $found_status = 0; + my $line_str_ata = 'SMART overall-health self-assessment test result: '; # ATA SMART line + my $ok_str_ata = 'PASSED'; # ATA SMART OK string + + my $line_str_scsi = 'SMART Health Status: '; # SCSI and CCISS SMART line + my $ok_str_scsi = 'OK'; #SCSI and CCISS SMART OK string + + foreach my $line (@output){ + if($line =~ /$line_str_scsi(.+)/){ + $found_status = 1; + $output_mode = "scsi"; + warn "(debug) parsing line:\n$line\n\n" if $opt_debug; + if ($1 eq $ok_str_scsi) { + warn "(debug) found string '$ok_str_scsi'; status OK\n\n" if $opt_debug; + } + else { + warn "(debug) no '$ok_str_scsi' status; failing\n\n" if $opt_debug; + push(@error_messages, "Health status: $1"); + escalate_status('CRITICAL'); + } + } + if($line =~ /$line_str_ata(.+)/){ + $found_status = 1; + $output_mode = "ata"; + warn "(debug) parsing line:\n$line\n\n" if $opt_debug; + if ($1 eq $ok_str_ata) { + warn "(debug) found string '$ok_str_ata'; status OK\n\n" if $opt_debug; + } + else { + warn "(debug) no '$ok_str_ata' status; failing\n\n" if $opt_debug; + push(@error_messages, "Health status: $1"); + escalate_status('CRITICAL'); + } + } + } + + unless ($found_status) { + push(@error_messages, 'No health status line found'); + escalate_status('UNKNOWN'); + } + + + warn "###########################################################\n" if $opt_debug; + warn "(debug) CHECK 2: getting silent SMART health check\n" if $opt_debug; + warn "###########################################################\n\n\n" if $opt_debug; + + $full_command = "$smart_command -d $interface -q silent -A $device"; + warn "(debug) executing:\n$full_command\n\n" if $opt_debug; + + system($full_command); + my $return_code = $?; + warn "(debug) exit code:\n$return_code\n\n" if $opt_debug; + + if ($return_code & 0x01) { + push(@error_messages, 'Commandline parse failure'); + escalate_status('UNKNOWN'); + } + if ($return_code & 0x02) { + push(@error_messages, 
'Device could not be opened'); + escalate_status('UNKNOWN'); + } + if ($return_code & 0x04) { + push(@error_messages, 'Checksum failure'); + escalate_status('WARNING'); + } + if ($return_code & 0x08) { + push(@error_messages, 'Disk is failing'); + escalate_status('CRITICAL'); + } + if ($return_code & 0x10) { + push(@error_messages, 'Disk is in prefail'); + escalate_status('WARNING'); + } + if ($return_code & 0x20) { + push(@error_messages, 'Disk may be close to failure'); + escalate_status('WARNING'); + } + if ($return_code & 0x40) { + push(@error_messages, 'Error log contains errors'); + escalate_status('WARNING'); + } + if ($return_code & 0x80) { + push(@error_messages, 'Self-test log contains errors'); + escalate_status('WARNING'); + } + if ($return_code && !$exit_status_local) { + push(@error_messages, 'Unknown return code'); + escalate_status('CRITICAL'); + } + + if ($return_code) { + warn "(debug) non-zero exit code, generating error condition\n\n" if $opt_debug; + } else { + warn "(debug) zero exit code, status OK\n\n" if $opt_debug; + } + + + warn "###########################################################\n" if $opt_debug; + warn "(debug) CHECK 3: getting detailed statistics\n" if $opt_debug; + warn "(debug) information contains a few more potential trouble spots\n" if $opt_debug; + warn "(debug) plus, we can also use the information for perfdata/graphing\n" if $opt_debug; + warn "###########################################################\n\n\n" if $opt_debug; + + $full_command = "$smart_command -d $interface -A $device"; + warn "(debug) executing:\n$full_command\n\n" if $opt_debug; + @output = `$full_command`; + warn "(debug) output:\n@output\n\n" if $opt_debug; + my @perfdata = qw//; + + # separate metric-gathering and output analysis for ATA vs SCSI SMART output + # Yeah - but megaraid is the same output as ata + if ($output_mode =~ "ata") { + foreach my $line(@output){ + # get lines that look like this: + # 9 Power_On_Minutes 0x0032 241 241 000 Old_age Always - 113h+12m + next unless $line =~ /^\s*\d+\s(\S+)\s+(?:\S+\s+){6}(\S+)\s+(\d+)/; + my ($attribute_name, $when_failed, $raw_value) = ($1, $2, $3); + if ($when_failed ne '-'){ + push(@error_messages, "Attribute $attribute_name failed at $when_failed"); + escalate_status('WARNING'); + warn "(debug) parsed SMART attribute $attribute_name with error condition:\n$when_failed\n\n" if $opt_debug; + } + # some attributes produce questionable data; no need to graph them + if (grep {$_ eq $attribute_name} ('Unknown_Attribute', 'Power_On_Minutes') ){ + next; + } + push (@perfdata, "$attribute_name=$raw_value") if $opt_d; + + # do some manual checks + if ( ($attribute_name eq 'Current_Pending_Sector') && $raw_value ) { + if ($opt_b) { + if (($raw_value > 0) && ($raw_value >= $opt_b)) { + push(@error_messages, "$raw_value Sectors pending re-allocation"); + escalate_status('WARNING'); + warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug; + } + elsif (($raw_value > 0) && ($raw_value < $opt_b)) { + push(@error_messages, "$raw_value Sectors pending re-allocation (but less than threshold $opt_b)"); + warn "(debug) Current_Pending_Sector is non-zero ($raw_value) but less than $opt_b\n\n" if $opt_debug; + } + } else { + push(@error_messages, "Sectors pending re-allocation"); + escalate_status('WARNING'); + warn "(debug) Current_Pending_Sector is non-zero ($raw_value)\n\n" if $opt_debug; + } + } + } + + } else { + my ($current_temperature, $max_temperature, $current_start_stop, $max_start_stop) = qw//; + foreach 
my $line(@output){ + if ($line =~ /Current Drive Temperature:\s+(\d+)/){ + $current_temperature = $1; + } + elsif ($line =~ /Drive Trip Temperature:\s+(\d+)/){ + $max_temperature = $1; + } + elsif ($line =~ /Current start stop count:\s+(\d+)/){ + $current_start_stop = $1; + } + elsif ($line =~ /Recommended maximum start stop count:\s+(\d+)/){ + $max_start_stop = $1; + } + elsif ($line =~ /Elements in grown defect list:\s+(\d+)/){ + my $defectlist = $1; + # check for elements in grown defect list + if ($opt_b) { + push (@perfdata, "defect_list=$defectlist;;$opt_b") if $opt_d; + if (($defectlist > 0) && ($defectlist >= $opt_b)) { + push(@error_messages, "$defectlist Elements in grown defect list (threshold $opt_b)"); + escalate_status('WARNING'); + warn "(debug) Elements in grown defect list is non-zero ($defectlist)\n\n" if $opt_debug; + } + elsif (($defectlist > 0) && ($defectlist < $opt_b)) { + push(@error_messages, "Note: $defectlist Elements in grown defect list"); + warn "(debug) Elements in grown defect list is non-zero ($defectlist) but less than $opt_b\n\n" if $opt_debug; + } + } + else { + if ($defectlist > 0) { + push (@perfdata, "defect_list=$defectlist") if $opt_d; + push(@error_messages, "$defectlist Elements in grown defect list"); + escalate_status('WARNING'); + warn "(debug) Elements in grown defect list is non-zero ($defectlist)\n\n" if $opt_debug; + } + } + } + elsif ($line =~ /Blocks sent to initiator =\s+(\d+)/){ + push (@perfdata, "sent_blocks=$1") if $opt_d; + } + } + if($current_temperature){ + if($max_temperature){ + push (@perfdata, "temperature=$current_temperature;;$max_temperature") if $opt_d; + if($current_temperature > $max_temperature){ + warn "(debug) Disk temperature is greater than max ($current_temperature > $max_temperature)\n\n" if $opt_debug; + push(@error_messages, 'Disk temperature is higher than maximum'); + escalate_status('CRITICAL'); + } + } + else{ + push (@perfdata, "temperature=$current_temperature") if $opt_d; + } + } + if($current_start_stop){ + if($max_start_stop){ + push (@perfdata, "start_stop=$current_start_stop;$max_start_stop") if $opt_d; + if($current_start_stop > $max_start_stop){ + warn "(debug) Disk start_stop is greater than max ($current_start_stop > $max_start_stop)\n\n" if $opt_debug; + push(@error_messages, 'Disk start_stop is higher than maximum'); + escalate_status('WARNING'); + } + } + else{ + push (@perfdata, "start_stop=$current_start_stop") if $opt_d; + } + } + } + warn "(debug) gathered perfdata:\n@perfdata\n\n" if $opt_debug; + $perf_string = join(' ', @perfdata); + + warn "###########################################################\n" if $opt_debug; + warn "(debug) LOCAL STATUS: $exit_status_local, FINAL STATUS: $exit_status\n" if $opt_debug; + warn "###########################################################\n\n\n" if $opt_debug; + + if($exit_status_local ne 'OK'){ + if ($opt_g) { + $status_string_local = $label.join(', ', @error_messages); + $status_string .= $status_string_local.$Terminator; + } + else { + $status_string = join(', ', @error_messages); + } + } + else { + if ($opt_g) { + $status_string_local = $label."Device is clean"; + $status_string .= $status_string_local.$Terminator; + } + else { + $status_string = "no SMART errors detected. 
".join(', ', @error_messages); + } + } + +} + + warn "(debug) final status/output: $exit_status\n" if $opt_debug; + +$status_string =~ s/$Terminator$//; +print "$exit_status: $status_string|$perf_string\n"; +exit $ERRORS{$exit_status}; + + +sub print_help { + print_revision($basename,$revision); + print "\nUsage: $basename {-d=|-g=} -i=(ata|scsi|3ware,N|areca,N|hpt,L/M/N|cciss,N|megaraid,N) [-b N] [--debug]\n\n"; + print "At least one of the below. -d supersedes -g\n"; + print " -d/--device: a physical block device to be SMART monitored, eg /dev/sda\n"; + print " -g/--global: a regular expression name of physical devices to be SMART monitored\n"; + print " Example: '/dev/sd[a-z]' will search for all /dev/sda until /dev/sdz devices and report errors globally.\n"; + print "Note that -g only works with a fixed interface input (e.g. scsi, ata), not with special interface ids like cciss,1\n"; + print "\n"; + print "Other options\n"; + print " -i/--interface: device's interface type\n"; + print " (See http://www.smartmontools.org/wiki/Supported_RAID-Controllers for interface convention)\n"; + print " -b/--bad: Threshold value (integer) when to warn for N bad entries\n"; + print " -h/--help: this help\n"; + print " --debug: show debugging information\n"; + print " -v/--version: Version number\n"; +} + +# escalate an exit status IFF it's more severe than the previous exit status +sub escalate_status { + my $requested_status = shift; + # no test for 'CRITICAL'; automatically escalates upwards + if ($requested_status eq 'WARNING') { + return if ($exit_status|$exit_status_local) eq 'CRITICAL'; + } + if ($requested_status eq 'UNKNOWN') { + return if ($exit_status|$exit_status_local) eq 'WARNING'; + return if ($exit_status|$exit_status_local) eq 'CRITICAL'; + } + $exit_status = $requested_status; + $exit_status_local = $requested_status; +} diff --git a/templates/plugins/smartmon-sudoers.erb b/templates/plugins/smartmon-sudoers.erb new file mode 100644 index 00000000..9f3ccc6c --- /dev/null +++ b/templates/plugins/smartmon-sudoers.erb @@ -0,0 +1,2 @@ +Defaults !requiretty +<%= scope['nagios::client::nrpe_user'] %> ALL = (root) NOPASSWD: /usr/sbin/smartctl