#!/usr/local/bin/perl
#
# check an APC ups
#
# This version uses Net::SNMP.
#
# $Header: /opt/home/doke/work/nagios/RCS/check_apc_ups,v 1.13 2008/07/03 21:03:49 doke Exp $
#
# Nagios plugin.  Queries a set of APC UPS OIDs over SNMPv1 in a single
# GET, classifies each value as ok / warning / critical / unknown, prints
# a one-line summary and exits with the matching Nagios return code:
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.
#
# Usage: check_apc_ups [-v] -H <host> -C <community>

use vars qw( $warn_runtime $crit_runtime $warn_temphi $crit_temphi
    $warn_templo $crit_templo $warn_load $crit_load );

# Thresholds.  Runtimes are in minutes (they are multiplied by 6000 ticks
# below), temperatures are reported with a "C" suffix, load with "%".
$warn_runtime = 20;
$crit_runtime = 15;     # only applies if on battery
$warn_temphi  = 100;    # a lot of the temp sensors are bad
$crit_temphi  = 120;
$warn_templo  = 5;
$crit_templo  = 0;
$warn_load    = 70;
$crit_load    = 95;

###########################

use strict;
use warnings;
no warnings 'redefine';

use Getopt::Std;
use Net::SNMP;

use vars qw( %opts $host $community @crit_msgs @warn_msgs @unknown_msgs
    @ok_msgs $rc $verbose $mib2 $enterprises $apcmib );

$verbose     = 0;
$mib2        = '1.3.6.1.2.1';
$enterprises = '.1.3.6.1.4.1';
$apcmib      = "$enterprises.318";

#############################

# Print the usage message on STDERR and exit with the Nagios UNKNOWN
# code (3).  Never returns.
sub check_apc_ups_usage {
    print STDERR "Usage: $0 [-v] -H <host> -C <community>\n";
    exit 3;
}

getopts( 'vH:C:', \%opts );
check_apc_ups_usage() if ( ! $opts{ H } );
check_apc_ups_usage() if ( ! $opts{ C } );
$host      = $opts{ H };
$community = $opts{ C };
$verbose   = 1 if ( $opts{ v } );

# Just in case of problems, let's not hang Nagios
$SIG{'ALRM'} = sub {
    print( "ERROR: No response from snmp server (alarm)\n" );
    exit 3;    # nagios unknown exit code
};
alarm( 20 );

# The message lists are package globals; clear them explicitly so nothing
# carries over if this code is ever run more than once in one interpreter.
# NOTE(review): "no warnings 'redefine'" suggests embedded-Perl use - confirm.
@crit_msgs    = ();
@warn_msgs    = ();
@unknown_msgs = ();
@ok_msgs      = ();

check_ups( $host, $community );

# Build the single status line.  Severity order is critical, warning,
# unknown; the exit code is that of the first non-empty category.  The
# OK details are only printed when nothing else was reported.
$rc = 0;    # nagios ok exit code
if ( scalar( @crit_msgs ) ) {
    print "CRITICAL: " . join( ", ", @crit_msgs ) . " ";
    $rc = 2;
}
if ( scalar( @warn_msgs ) ) {
    print "; " if ( $rc != 0 );
    print "Warning: " . join( ", ", @warn_msgs ) . " ";
    $rc = 1 if ( $rc == 0 );
}
if ( scalar( @unknown_msgs ) ) {
    print "; " if ( $rc != 0 );
    print "Unknown: " . join( ", ", @unknown_msgs ) . " ";
    $rc = 3 if ( $rc == 0 );    # nagios unknown exit code
}
elsif ( $rc == 0 ) {
    print "OK " . join( ", ", @ok_msgs );
}
print "\n";
exit $rc;

##################################

# check_ups( $host, $community )
#
# Fetch all monitored APC OIDs from $host with one SNMPv1 GET and sort
# the findings into the global @ok_msgs, @warn_msgs, @crit_msgs and
# @unknown_msgs lists.  Returns nothing.  If the SNMP session cannot be
# created the script exits immediately with the UNKNOWN code; if the GET
# fails the SNMP error text is pushed onto @unknown_msgs.
sub check_ups {
    my( $host, $community ) = @_;
    my( $session, $error, %variables, @oids, $result, $var, $val, $msg,
        $status, $time_on_battery, $load );

    ( $session, $error ) = Net::SNMP->session(
        -version   => 'snmpv1',
        -hostname  => $host,
        -community => $community,
        -timeout   => 5,
        -translate => [ -timeticks => 0x0 ],   # Turn off so sysUpTime is numeric
        #-debug    => 0x02
    );
    if ( ! defined( $session ) ) {
        print "snmp error: $error\n";
        exit 3;    # nagios unknown exit code
    }

    %variables = (
        "upsBasicBatteryStatus"          => "$apcmib.1.1.1.2.1.1.0",
        "upsBasicBatteryTimeOnBattery"   => "$apcmib.1.1.1.2.1.2.0",
        "upsBasicBatteryLastReplaceDate" => "$apcmib.1.1.1.2.1.3.0",
        "upsAdvBatteryCapacity"          => "$apcmib.1.1.1.2.2.1.0",
        "upsAdvBatteryTemperature"       => "$apcmib.1.1.1.2.2.2.0",
        "upsAdvBatteryRunTimeRemaining"  => "$apcmib.1.1.1.2.2.3.0",
        "upsAdvBatteryReplaceIndicator"  => "$apcmib.1.1.1.2.2.4.0",
        "upsAdvInputLineFailCause"       => "$apcmib.1.1.1.3.2.5.0",
        "upsBasicOutputStatus"           => "$apcmib.1.1.1.4.1.1.0",
        #"upsAdvOutputVoltage"           => "$apcmib.1.1.1.4.2.1.0",
        "upsAdvOutputLoad"               => "$apcmib.1.1.1.4.2.3.0",
        #"upsAdvOutputCurrent"           => "$apcmib.1.1.1.4.2.4.0",   # broken, reports 0
        "upsAdvTestDiagnosticsResults"   => "$apcmib.1.1.1.7.2.3.0",
        "upsAdvTestCalibrationResults"   => "$apcmib.1.1.1.7.2.6.0",
    );

    @oids = sort values %variables;
    $result = $session->get_request( -varbindlist => \@oids );
    if ( ! defined( $result ) ) {
        push @unknown_msgs, $session->error();
        $session->close();
        return;
    }
    # Everything needed is now in %$result; release the session.
    $session->close();

    if ( $verbose ) {
        print "raw results:\n";
        foreach $var ( sort keys %variables ) {
            $val = $result->{ $variables{ $var } };
            print " $var $val\n";
        }
    }

    # --- output status ---
    $val = $result->{ $variables{ 'upsBasicOutputStatus' } };
    my %statuses = (
        1  => 'unknown',
        2  => 'onLine',
        3  => 'onBattery',
        4  => 'onSmartBoost',
        5  => 'timedSleeping',
        6  => 'softwareBypass',
        7  => 'off',
        8  => 'rebooting',
        9  => 'switchedBypass',
        10 => 'hardwareFailureBypass',
        11 => 'sleepingUntilPowerReturn',
        12 => 'onSmartTrim',
        13 => 'unknown'
    );
    # Codes outside the table would otherwise yield undef and an
    # "uninitialized value" warning in the status line.
    $status = defined( $statuses{ $val } )
        ? $statuses{ $val }
        : "unrecognized value $val";
    $verbose && print "upsBasicOutputStatus $val $status\n";
    if ( $val == 2 ) {
        push @ok_msgs, "output status $status";
    }
    elsif ( $val == 4 || $val == 12 ) {
        push @warn_msgs, "output status $status";
    }
    elsif ( $val == 1 || $val >= 13 ) {
        push @unknown_msgs, "output status $status";
    }
    else {
        push @crit_msgs, "output status $status";
    }

    # --- output load ---
    # Compare the load itself (not the output-status code left in $val)
    # against the critical and then the warning threshold.
    $load = $result->{ $variables{ 'upsAdvOutputLoad' } };
    $msg = "load $load%";
    $verbose && print $msg, "\n";
    if ( $load > $crit_load ) {
        push @crit_msgs, $msg;
    }
    elsif ( $load > $warn_load ) {
        push @warn_msgs, $msg;
    }
    else {
        push @ok_msgs, $msg;
    }

    # --- battery status: 2 is treated as normal, 3 as low ---
    $val = $result->{ $variables{ 'upsBasicBatteryStatus' } };
    $verbose && print "upsBasicBatteryStatus $val\n";
    if ( $val == 2 ) {
        #push @ok_msgs, "batteryNormal";
    }
    elsif ( $val == 3 ) {
        push @crit_msgs, "batteryLow";
    }
    else {
        push @crit_msgs, "upsBasicBatteryStatus $val";
    }

    # --- time on battery: any non-zero value is a warning ---
    $time_on_battery
        = $result->{ $variables{ 'upsBasicBatteryTimeOnBattery' } };
    $verbose && print "upsBasicBatteryTimeOnBattery $time_on_battery\n";
    if ( $time_on_battery != 0 ) {
        push @warn_msgs, "time on battery $time_on_battery";
    }

    # --- battery replacement date (informational only) ---
    $val = $result->{ $variables{ 'upsBasicBatteryLastReplaceDate' } };
    $verbose && print "upsBasicBatteryLastReplaceDate $val\n";
    # TODO: parse date string and complain if too old?
    push @ok_msgs, "battery date $val";

    # --- battery capacity ---
    $val = $result->{ $variables{ 'upsAdvBatteryCapacity' } };
    $verbose && print "upsAdvBatteryCapacity $val\n";
    if ( $val < 95 ) {
        push @warn_msgs, "battery capacity $val%";
    }
    else {
        push @ok_msgs, "battery capacity $val%";
    }

    # --- battery temperature: checked against both high and low limits ---
    $val = $result->{ $variables{ 'upsAdvBatteryTemperature' } };
    $verbose && print "upsAdvBatteryTemperature $val\n";
    if ( $val >= $crit_temphi || $val <= $crit_templo ) {
        push @crit_msgs, "battery temp ${val}C";
    }
    elsif ( $val >= $warn_temphi || $val <= $warn_templo ) {
        push @warn_msgs, "battery temp ${val}C";
    }
    else {
        push @ok_msgs, "battery temp ${val}C";
    }

    # --- remaining runtime (ticks; 6000 ticks per minute) ---
    # Critical only while actually running on battery.
    $val = $result->{ $variables{ 'upsAdvBatteryRunTimeRemaining' } };
    my $timestr = check_apc_ups_ticks_to_str( $val );
    $verbose && print "upsAdvBatteryRunTimeRemaining $val $timestr\n";
    if ( $val < ( $crit_runtime * 6000 ) && $time_on_battery > 0 ) {
        push @crit_msgs, "battery time remaining $timestr";
    }
    elsif ( $val < ( $warn_runtime * 6000 ) ) {
        push @warn_msgs, "battery time remaining $timestr";
    }
    else {
        push @ok_msgs, "battery time remaining $timestr";
    }

    # --- battery replace indicator ---
    $val = $result->{ $variables{ 'upsAdvBatteryReplaceIndicator' } };
    $verbose && print "upsAdvBatteryReplaceIndicator $val\n";
    if ( $val == 2 ) {
        push @crit_msgs, "battery needs replacing";
    }

    # --- cause of the last input line failure (informational) ---
    $val = $result->{ $variables{ 'upsAdvInputLineFailCause' } };
    my %causes = (
        1  => 'noTransfer',
        2  => 'highLineVoltage',
        3  => 'brownout',
        4  => 'blackout',
        5  => 'smallMomentarySag',
        6  => 'deepMomentarySag',
        7  => 'smallMomentarySpike',
        8  => 'largeMomentarySpike',
        9  => 'selfTest',
        10 => 'rateOfVoltageChange',
        11 => 'unknown',
    );
    my $cause = defined( $causes{ $val } )
        ? $causes{ $val }
        : "unrecognized value $val";
    $verbose && print "upsAdvInputLineFailCause $val $cause\n";
    if ( $val != 9 ) {    # a self test is not worth reporting
        push @ok_msgs, "last line failure $cause";
    }

    # --- last self-test result ---
    $val = $result->{ $variables{ 'upsAdvTestDiagnosticsResults' } };
    $verbose && print "upsAdvTestDiagnosticsResults $val\n";
    if ( $val == 1 ) {
        #push @ok_msgs, "last diag ok";
    }
    elsif ( $val == 2 ) {
        push @crit_msgs, "last diag failed";
    }
    elsif ( $val == 3 ) {
        push @crit_msgs, "last diag invalid";
    }
    elsif ( $val == 4 ) {
        push @warn_msgs, "diags in progress";
    }
    else {
        push @unknown_msgs, "last diag unknown";
    }

    # --- last runtime calibration result ---
    $val = $result->{ $variables{ 'upsAdvTestCalibrationResults' } };
    $verbose && print "upsAdvTestCalibrationResults $val\n";
    if ( $val == 1 ) {
        #push @ok_msgs, "last calibration ok";
    }
    elsif ( $val == 2 ) {
        push @crit_msgs, "last calibration invalid";
    }
    elsif ( $val == 3 ) {
        push @warn_msgs, "calibration in progress";
    }
    else {
        push @unknown_msgs, "last calibration unknown";
    }

    return;
}

# check_apc_ups_ticks_to_str( $ticks )
#
# Convert a tick count (6000 ticks per minute, per the table below) into
# a compact string such as "1h 30m".  Output starts at the largest
# non-zero unit and then shows every smaller unit down to minutes, so
# zero-valued units in the middle are kept.  Values below one minute
# yield "0m" rather than an empty string.
sub check_apc_ups_ticks_to_str {
    my( $ticks ) = @_;
    my( @intervals, @letters, $str, $started );

    @intervals = (
        60480000,    # week
        8640000,     # day
        360000,      # hour
        6000,        # minute
        #100,        # second
    );
    @letters = ( 'w', 'd', 'h', 'm', 's', );

    $str     = '';
    $started = 0;
    foreach my $i ( 0 .. $#intervals ) {
        my $interval = $intervals[ $i ];
        next if ( $ticks < $interval && ! $started );

        my $n = int( $ticks / $interval );
        $ticks -= $n * $interval;
        $str .= sprintf( "%u%s ", $n, $letters[ $i ] );
        $started = 1;    # show days in 3weeks 0days 3hours
    }
    $str =~ s/\s+$//;

    # Less than one minute left: never return an empty string, since the
    # caller embeds the result in "battery time remaining ...".
    $str = '0m' if ( $str eq '' );
    return $str;
}