#!/usr/local/bin/perl
################################################################################
# #
# check_perc_raid, v1.1 #
# #
# Fraser Gutteridge, blue@udel.edu 11/08/2014 #
# University of Delaware Network & Systems Services #
# #
# Checks on Dell PowerEdge Raid Controller (PERC) devices. Output can be used #
# with Nagios Remote Plugin Executor. #
# #
# Pre-reqs: LSI Corporation SNMP modules must be installed on queried host #
# available at www.lsi.com #
# #
# Tested on following PERC models: #
# PERC H700 (Dell PowerEdge R715) #
# PERC H710 Mini (Dell PowerEdge R720) #
# #
################################################################################
#
# changelog
#
# v1.0 -> v1.1 by blue, 2016-03-08
# - added "Optimal" battery backup state for PERC H700
#
use strict;
use warnings;
no warnings 'redefine';
use Getopt::Long;
use Net::SNMP;
use Socket;
use Data::Dumper;
# globals
#
# Package variables shared by the main program and every subroutine in this
# file.  They are declared with "our" (the "use vars" pragma is obsolete);
# the set of names is unchanged.
our (
    $host,     $community, %snmp_sessions, $rc,       $sep,
    @crits,    @warns,     @unknowns,      @oks,      %name2ip,
    $session,  $help,      $adapters,      $drives
);
############################
# config vars default vals #
############################
# base OIDs: the "enterprises" area and the LSI section underneath it
our $enterprise = "1.3.6.1.4.1";
our $lsi        = "$enterprise.3582";
# verbosity level; raised by each -v given on the command line
our $verbose = 0;
# true: open the session as SNMPv2c, false: SNMPv1
our $use_snmpv2c = 1;
# first message size tried when walking a table
our $prefered_maxmsgsize = 1024;
# RAID On Chip (ROC) temp thresholds in Centigrade
our $roc_warn_temp = 80;
our $roc_crit_temp = 90;
# battery backup unit capacity remaining thresholds (percent)
our $bbu_warn_cap = 20;
our $bbu_crit_cap = 10;
# battery backup unit temp thresholds in Centigrade
our $bbu_warn_temp = 60;
our $bbu_crit_temp = 70;
# Seagate ST3300657SS operating temp thresholds in Centigrade
# might need to come up with a hash for this based on drive
# model no. if we start getting a lot of PERC controllers, hmm...
our $drive_warn_temp = 45;
our $drive_crit_temp = 50;
###################
# end config vars #
###################
# get command line options
#   -H host, -C community, -a check adapters (and their BBUs),
#   -d check physical and virtual drives, -v verbose (repeatable), -h usage
Getopt::Long::Configure("bundling");
GetOptions(
    'H=s' => \$host,
    'C=s' => \$community,
    'a+'  => \$adapters,
    'd+'  => \$drives,
    'v+'  => \$verbose,
    'h'   => \$help
);
help() if $help;
if ( !$host ) {
    print "You need to enter a hostname!\n";
    exit 2;
}
if ( !$community ) {
    print "You need to enter a community string!\n";
    exit 2;
}
if ( !$adapters && !$drives ) {
    print "You need to pass either the -a or -d options, or both!\n";
    exit 2;
}

# start snmp session
$session = snmp_session( $host, $community );
if ( !$session ) {
    # snmp_session returns a false value when the lookup or the session setup
    # fails; every later check would otherwise call a method on that value
    print "Unknown could not open an SNMP session to $host\n";
    exit 3;
}
check_if_lsi();
if ($adapters) {
    check_adapters();
    check_bbus();
}
if ($drives) {
    check_phys_drives();
    check_virt_drives();
}

# Build the single status line and pick the exit code.
# Exit codes follow the Nagios plugin convention: 0 OK, 1 WARNING,
# 2 CRITICAL, 3 UNKNOWN.  Precedence is CRITICAL, then UNKNOWN, then WARNING.
$rc  = 0;
$sep = '';
if (@unknowns) {
    $rc = 3;
    print $sep, "Unknown ", join( ", ", @unknowns );
    $sep = '; ';
}
if (@crits) {
    $rc = 2;
    print $sep, "CRITICAL ", join( ", ", @crits );
    $sep = '; ';
}
if (@warns) {
    $rc = 1 if ( $rc == 0 );
    print $sep, "Warning ", join( ", ", @warns );
    $sep = '; ';
}
if ( $rc == 0 ) {
    print "OK ", join( ", ", @oks );
}
print "\n";
exit $rc;
###############
# subroutines #
###############
#
# Sanity check run before anything else: asks the agent for the LSI agent
# module name (below the LSI section of the enterprises area).
#   - no answer at all           -> prints a message and exits with status 2
#   - answer not containing "lsi" -> prints a message and exits with status 2
# Reads the globals $session, $lsi and $verbose; takes no arguments.
#
sub check_if_lsi {
    my $module_name =
      snmp_get_one( $session, 'agent module name', "$lsi.4.1.3.2.0" );

    unless ($module_name) {
        print
          "The LSI SNMP module does not appear to be installed on this host.\n";
        exit 2;
    }

    print "Module name: $module_name\n" if $verbose;

    # the match is case sensitive, exactly as the module name is reported
    return if $module_name =~ m/lsi/;

    print "This does not appear to be the expected LSI module.\n";
    exit 2;
}
#
# checks a few things on the MegaRAID SAS controller;
# the virtual device and disk counts won't give us much
# info on what's wrong, but they'll hopefully alert us
# that there is a problem
#
# Reads the globals $session, $lsi, $roc_warn_temp and $roc_crit_temp and
# appends its findings to @crits, @warns, @oks and @unknowns.  Takes no
# arguments and returns nothing useful.
#
sub check_adapters {

    # first check and make sure there are actually adapters present
    # if not we can stop here
    my $adpNumber_oid = "$lsi.4.1.4.1.1.0";
    my $result =
      snmp_get_one( $session, 'number of adapters', $adpNumber_oid );
    if ( !defined $result || $result !~ m/^\d+$/ ) {

        # a failed or non-numeric answer is not the same thing as "0 adapters"
        push @unknowns, "Unable to read the number of adapters";
        return;
    }
    if ( $result == 0 ) {

        # return (rather than exit) so the main program prints this message
        push @unknowns, "No adapters appear to be present";
        return;
    }

    my $adapterPropertiesTable_oid = "$lsi.4.1.4.1.2";
    my @data = walk_table( $session, 'adapter properties table',
        $adapterPropertiesTable_oid );

    # nothing to examine when the walk produced no ID column
    return if ref $data[1] ne 'ARRAY';

    foreach my $row ( 0 .. $#{ $data[1] } ) {
        my $adpID = $data[1][$row];

        # rows are stored by their SNMP index, so gaps show up as undef
        next if !defined $adpID;

        my $vdDegradedCount        = $data[19][$row];
        my $pdDiskPredFailureCount = $data[23][$row];
        my $pdDiskFailedCount      = $data[24][$row];
        my $temperatureROC         = $data[42][$row];

        # check virtual and physical failure counts
        if ( $vdDegradedCount > 0 ) {

            # plain "$vdDegradedCount": a doubled sigil inside the string
            # would be read as a dereference, not as the counter
            push @crits,
              "adp $adpID : $vdDegradedCount virtual devices critical";
        }
        if ( $pdDiskPredFailureCount > 0 ) {
            push @warns,
              "adp $adpID : $pdDiskPredFailureCount disks are failing";
        }
        if ( $pdDiskFailedCount > 0 ) {
            push @crits, "adp $adpID : $pdDiskFailedCount disks have failed";
        }

        # check ROC temp
        if ( $temperatureROC >= $roc_crit_temp ) {
            push @crits, "adp $adpID : ROC temp $temperatureROC degrees C";
        }
        elsif ( $temperatureROC >= $roc_warn_temp ) {
            push @warns, "adp $adpID : ROC temp $temperatureROC degrees C";
        }
        else {
            push @oks, "adp $adpID : ROC temp $temperatureROC degrees C";
        }
    }
    return;
}
#
# checks the battery backup units on the controllers, if there are any
#
# Reads the globals $session, $lsi and the $bbu_* thresholds and appends its
# findings to @crits, @warns, @oks and @unknowns.  Takes no arguments.
#
sub check_bbus {

    # get the number of bbus on this controller, if any
    my $bbuNumber_oid = "$lsi.4.1.4.1.6.1.0";
    my $bbuNumber = snmp_get_one( $session, 'bbu number', $bbuNumber_oid );

    # no BBU reported (or the count could not be read): nothing to check
    return
      unless defined $bbuNumber
      && $bbuNumber =~ m/^\d+$/
      && $bbuNumber > 0;

    my $bbuTable_oid = "$lsi.4.1.4.1.6.2";
    my @data = walk_table( $session, 'bbu table', $bbuTable_oid );

    # nothing to examine when the walk produced no ID column
    return if ref $data[1] ne 'ARRAY';

    foreach my $row ( 0 .. $#{ $data[1] } ) {
        my $bbuID = $data[1][$row];

        # rows are stored by their SNMP index, so gaps show up as undef
        next if !defined $bbuID;

        my $voltage          = $data[9][$row];
        my $relStateOfCharge = $data[11][$row];
        my $temp             = $data[16][$row];
        my $adpID            = $data[21][$row];
        my $bbuState         = $data[29][$row];
        my $label            = "adp $adpID : bbu $bbuID";

        # check battery voltage - most of the LSI MegaRAID SAS controllers
        # have lithium-polymer rechargeable batteries that should be around
        # 3.75V
        # units for reported voltage by LSI SNMP are usually mV so convert to
        # volts first
        $voltage = $voltage / 1000;
        if ( $voltage > 4.1 || $voltage < 3.5 ) {
            push @warns,
              "$label : voltage out of range, reporting $voltage V";
        }
        else {
            push @oks, "$label : battery voltage $voltage V";
        }

        # check remaining capacity of battery
        if ( $relStateOfCharge <= $bbu_crit_cap ) {
            push @crits, "$label : battery capacity $relStateOfCharge%";
        }
        elsif ( $relStateOfCharge <= $bbu_warn_cap ) {
            push @warns, "$label : battery capacity $relStateOfCharge%";
        }
        else {
            push @oks, "$label : battery capacity $relStateOfCharge%";
        }

        # LSI doesn't provide temperature thresholds for their batteries in
        # their MIB files, but most Li-polymer 3.75V battery SEI layers start
        # melting at around 80 degrees C, so let's use 70 degrees as our
        # critical temp
        # the temperature is reported in degrees centigrade but there's also
        # an operating status string attached to that, so strip it away.
        # Only the leading digits are captured: requiring trailing text would
        # make a bare "35" backtrack to "3", and a failed match must not fall
        # back to a stale capture from an earlier match.
        my ($degrees) = defined $temp ? $temp =~ m/^\s*(\d+)/ : ();
        if ( !defined $degrees ) {
            push @unknowns, "$label : battery temp not reported";
        }
        elsif ( $degrees >= $bbu_crit_temp ) {
            push @crits, "$label : battery temp $degrees C";
        }
        elsif ( $degrees >= $bbu_warn_temp ) {
            push @warns, "$label : battery temp $degrees C";
        }
        else {
            push @oks, "$label : battery temp $degrees C";
        }

        # lastly, check bbu state
        # Operational|Optimal - working fine
        # Non-Operational - problem
        if ( $bbuState ne "Operational" && $bbuState ne "Optimal" ) {
            push @crits, "$label : reporting $bbuState";
        }
    }
    return;
}
#
# checks on the physical drives assigned to each adapter
#
# Reads the globals $session, $lsi, $drive_warn_temp and $drive_crit_temp and
# appends its findings to @crits, @warns, @oks and @unknowns.  Takes no
# arguments.
#
sub check_phys_drives {

    # check if there are any physical drives detected
    # if not, just stop here
    my $pdNumber_oid = "$lsi.4.1.4.2.1.1.0";
    my $result =
      snmp_get_one( $session, 'num physical drives', $pdNumber_oid );
    if ( !defined $result || $result !~ m/^\d+$/ ) {

        # a failed or non-numeric answer is not the same thing as "0 drives"
        push @unknowns, "Unable to read the number of physical drives";
        return;
    }
    if ( $result == 0 ) {
        push @unknowns, "No physical drives detected";
        return;
    }

    # state code => name; built once, it does not change from row to row
    my %drive_states = (
        "0",   "unconfigured-good",
        "1",   "unconfigured-bad",
        "2",   "hot-spare",
        "16",  "offline",
        "17",  "failed",
        "20",  "rebuild",
        "24",  "online",
        "32",  "copyback",
        "64",  "system",
        "128", "unconfigured-shielded",
        "130", "hotspare-shielded",
        "144", "configured-shielded"
    );

    # state code => result list the message is filed under
    my %result_list_for = (
        ( map { $_ => \@crits } 17 ),
        ( map { $_ => \@warns } 1, 16, 20, 32 ),
        ( map { $_ => \@oks } 0, 2, 24, 64, 128, 130, 144 ),
    );

    # check all the physical drives
    my $physicalDriveTable_oid = "$lsi.4.1.4.2.1.2";
    my @data = walk_table( $session, 'phys drive table',
        $physicalDriveTable_oid );

    # nothing to examine when the walk produced no first column
    return if ref $data[1] ne 'ARRAY';

    foreach my $row ( 0 .. $#{ $data[1] } ) {

        # rows are stored by their SNMP index, so gaps show up as undef
        next if !defined $data[1][$row];

        my $mediaErrCount = $data[7][$row];
        my $otherErrCount = $data[8][$row];
        my $predFailCount = $data[9][$row];
        my $pdState       = $data[10][$row];
        my $slotNumber    = $data[20][$row];
        my $adpID         = $data[22][$row];
        my $temperature   = $data[36][$row];
        my $label         = "adp $adpID : slot $slotNumber drive";

        # first check the state of the device; a missing state must not be
        # mistaken for state 0 ("unconfigured-good")
        my $result_list =
          defined $pdState ? $result_list_for{$pdState} : undef;
        if ( defined $result_list ) {
            push @{$result_list},
              "$label : state is " . $drive_states{$pdState};
        }
        else {
            push @unknowns, "$label : state is unknown";
        }

        # warn about any error counts above 0 - this is probably excessive;
        # sometimes shouting into the front of a drive tray can cause disk
        # errors
        if ( $predFailCount > 0 ) {
            push @warns, "$label : reporting "
              . "$predFailCount predictive failure errors";
        }
        if ( $mediaErrCount > 0 ) {
            push @warns, "$label : reporting $mediaErrCount media errors";
        }
        if ( $otherErrCount > 0 ) {
            push @warns, "$label : reporting $otherErrCount other errors";
        }

        # check drive temps
        if ( $temperature >= $drive_crit_temp ) {
            push @crits, "$label : temp $temperature C";
        }
        elsif ( $temperature >= $drive_warn_temp ) {
            push @warns, "$label : temp $temperature C";
        }
        else {
            push @oks, "$label : temp $temperature C";
        }
    }
    return;
}
#
# checks on the virtual drives assigned to each adapter
#
# Reads the globals $session and $lsi and appends its findings to @crits,
# @warns, @oks and @unknowns.  Takes no arguments.
#
sub check_virt_drives {
    my $vdNumbers_oid = "$lsi.4.1.4.3.1.1.0";
    my $result = snmp_get_one( $session, 'num virt drives', $vdNumbers_oid );
    if ( !defined $result || $result !~ m/^\d+$/ ) {

        # a failed or non-numeric answer is not the same thing as "0 drives"
        push @unknowns, "Unable to read the number of virtual drives";
        return;
    }
    if ( $result == 0 ) {
        push @unknowns, "No virtual drives detected";
        return;
    }

    # state code => name; built once, it does not change from row to row
    my %vdrive_states = (
        "0", "offline",
        "1", "partially-degraded",
        "2", "degraded",
        "3", "optimal"
    );

    # check all the virtual drives
    my $virtualDriveTable_oid = "$lsi.4.1.4.3.1.2";
    my @data =
      walk_table( $session, 'virt drive table', $virtualDriveTable_oid );

    # nothing to examine when the walk produced no ID column
    return if ref $data[1] ne 'ARRAY';

    foreach my $row ( 0 .. $#{ $data[1] } ) {
        my $virtualDevID = $data[1][$row];

        # rows are stored by their SNMP index, so gaps show up as undef
        next if !defined $virtualDevID;

        my $state          = $data[5][$row];
        my $initState      = $data[17][$row];
        my $adpID          = $data[20][$row];
        my $badBlocksExist = $data[23][$row];
        my $label          = "adp $adpID : vd $virtualDevID";

        # check state first; a missing state must not be mistaken for
        # state 0 ("offline")
        # NOTE(review): "offline" is only a warning while "degraded" is
        # critical - severities kept as they were, confirm this is intended
        if ( !defined $state || !exists $vdrive_states{$state} ) {
            push @unknowns, "$label : state is unknown";
        }
        elsif ( $state == 2 ) {
            push @crits, "$label : state is " . $vdrive_states{$state};
        }
        elsif ( $state == 3 ) {
            push @oks, "$label : " . $vdrive_states{$state};
        }
        else {
            # remaining known states: 0 (offline), 1 (partially-degraded)
            push @warns, "$label : state is " . $vdrive_states{$state};
        }

        # if initialization state is anything but 0, warn
        if ( defined $initState && $initState != 0 ) {
            push @warns, "$label : initializing";
        }

        # if there are bad blocks detected, warn
        if ( defined $badBlocksExist && $badBlocksExist != 0 ) {
            push @warns, "$label : bad blocks detected";
        }
    }
    return;
}
# Prints the usage summary to STDOUT and terminates the program with exit
# status 0.  Takes no arguments and never returns.
sub help {
    print <<"END_USAGE";
Usage: $0 -H -C [-a] [-d] [-v]
-H host utilizing an LSI PERC RAID controller
-C SNMP community string for host
-a check controller adapters
-d check controller disk drives
-v verbose mode
-h print this message
END_USAGE
    exit 0;
}
#
# thanks to Doke Scott for his nifty SNMP walking and storage subroutines :)
#
# Opens a Net::SNMP session to a host.
#
#   $host      - host name (or address) to resolve and query
#   $community - SNMP community string
#
# The protocol version follows the global $use_snmpv2c.  Octet string
# translation is switched off on the new session.
#
# Returns the session object on success.  On failure a message is sent to
# STDERR and a false value is returned: 0 when the host cannot be resolved,
# undef when the session cannot be created.
sub snmp_session {
    my ( $host, $community ) = @_;

    my $packed_ip = scalar gethostbyname($host);
    if ( !$packed_ip ) {
        warn "can't lookup ip address for $host\n";
        return 0;
    }
    my $ipaddr = inet_ntoa($packed_ip);

    $verbose && print "scanning $host, $ipaddr, $community\n";

    my ( $session, $error ) = Net::SNMP->session(
        -version    => $use_snmpv2c ? 'snmpv2c' : 'snmpv1',
        -hostname   => $ipaddr,
        -community  => $community,
        -retries    => 3,
        -maxmsgsize => 2000,

        #-debug => 0x02
    );
    if ( !defined($session) ) {

        # nothing to clean up here: this sub never creates a file
        warn "snmp setup error: $error\n";
        return;
    }

    $session->translate( [ '-octetstring' => 0 ] );
    return $session;
}
# Fetches a single OID and returns its value.
#
#   $session - session object providing get_request, hostname and error
#   $name    - label used only in verbose and error output
#   $oid     - the OID to request
#
# Returns the value, or undef (explicitly, also in list context) when the
# request fails, when the OID is absent from the response, or when the value
# is one of the "noSuchObject" / "noSuchInstance" placeholders.  snmp_get
# strips "noSuchObject" the same way; callers here compare the value
# numerically, so the placeholder text must not reach them.
# Failures are reported on STDERR.
sub snmp_get_one {
    my ( $session, $name, $oid ) = @_;

    $verbose && print "snmp_get $name\n";

    my $result = $session->get_request( -varbindlist => [$oid] );
    if ( !defined($result) ) {
        warn sprintf "error getting %s from %s: %s\n",
          $name, $session->hostname, $session->error();
        return undef;
    }
    if ( !exists $result->{$oid} ) {
        warn "snmp_get error: requested $name oid $oid not in response\n";
        return undef;
    }

    my $value = $result->{$oid};
    if ( defined $value && $value =~ m/^noSuch(?:Object|Instance)$/ ) {
        $verbose && print "snmp_get $name: agent answered $value\n";
        return undef;
    }
    return $value;
}
# Fetches several OIDs in one request.
#
#   $session - session object providing get_request, hostname and error
#   $name    - label used only in verbose and error output
#   $oids    - array reference holding the OIDs to request
#
# Returns the response hash reference (OID => value) with every entry whose
# value is 'noSuchObject' removed, or undef after reporting the error on
# STDERR when the request fails.
sub snmp_get {
    my ( $session, $name, $oids ) = @_;

    print "snmp_get $name\n" if $verbose;

    my $reply = $session->get_request( -varbindlist => $oids );
    unless ( defined $reply ) {
        warn sprintf "error getting %s from %s: %s\n",
          $name, $session->hostname, $session->error();
        return undef;
    }

    # clean the noSuchObject errors out of the result
    my @missing = grep { $reply->{$_} eq 'noSuchObject' } keys %{$reply};
    delete @{$reply}{@missing};

    return $reply;
}
# Walks the table below $baseoid, retrying with growing message sizes.
#
#   $session - session object providing max_msg_size, get_table, hostname
#              and error
#   $name    - label used only in verbose and error output
#   $baseoid - base OID of the table
#
# The sizes tried are the global $prefered_maxmsgsize followed by a fixed
# ladder.  The session keeps whichever size was set last.
#
# Returns the hash reference produced by get_table, or undef (explicitly,
# also in list context) when the table is empty, when every size was
# rejected, or on any other error (reported on STDERR).
sub snmp_walk {
    my ( $session, $name, $baseoid ) = @_;

    $verbose && print "snmp_walk $name\n";

    foreach my $maxmsgsize ( $prefered_maxmsgsize, 1472, 1600, 1800, 2048,
        4096, 9000 )
    {
        $session->max_msg_size($maxmsgsize);
        $verbose > 1
          && print "snmp get_entries trying maxmsgsize $maxmsgsize\n";

        my $result = $session->get_table( -baseoid => $baseoid );
        return $result if defined($result);

        # The error text is matched loosely and without regard to case:
        # presumably its exact wording varies between Net::SNMP releases -
        # TODO confirm against the installed version.
        my $error = $session->error();
        $error = '' if !defined $error;

        if ( $error =~ m/requested table is empty/i ) {
            return undef;
        }
        elsif ( $error =~ m/message size.*exceed.*maxMsgSize/i ) {

            # retry with different size
            $verbose
              && print "snmp get_entries failed with maxmsgsize $maxmsgsize\n";
            next;
        }
        else {
            warn sprintf "error walking %s table on %s: %s\n",
              $name, $session->hostname, $error;
            return undef;
        }
    }

    # every message size was rejected
    return undef;
}
# for tables with simple 1 integer indices, ie most normal ones
# Returns an array of arrays: data[ column ][ row ]
#
#   $session - session object handed on to snmp_walk
#   $name    - label used in verbose output and in the messages below
#   $baseoid - base OID of the table
#
# Column and row are the last two numeric components of each OID, so rows
# are stored under their SNMP index and may contain undef gaps.
# When the walk fails or yields no usable entries a message is pushed onto
# @unknowns and an empty list is returned (undef in scalar context), so a
# table that could not be read is never mistaken for a clean result.
sub walk_table {
    my ( $session, $name, $baseoid ) = @_;

    $verbose && print "walk_table $name\n";

    my $result = snmp_walk( $session, $name, $baseoid );
    if ( !$result ) {
        push @unknowns, "unable to walk $name table";
        return;
    }

    my @data;
    foreach my $oid ( keys %{$result} ) {
        my $val = $result->{$oid};
        if ($verbose) { print "$oid = $val\n"; }
        next if ( $val eq 'endOfMibView' );
        if ( $oid =~ m/\.(\d+)\.(\d+)$/ ) {
            my ( $col, $row ) = ( $1, $2 );
            $data[$col][$row] = $val;
        }
    }

    if ( !@data ) {
        push @unknowns, "no rows in $name table";
        return;
    }
    return @data;
}
|