Home | Code Samples | Software | Networking | Curriculum Vitae (PDF) blue@udel.edu    Phone: (302) 831-7281    Office 212P, The Computing Center
192 S. Chapel St., Newark, DE 19716

We have several Dell rack servers that use hardware LSI PERC RAID controllers for their hard disks. LSI provided an SNMP module for these controllers a few years back, so I wrote a Nagios plugin to monitor the state of the controllers and the disks they control. The code is below. The SNMP modules for LSI Corporation PERC RAID controllers can be obtained at www.lsi.com.

Required Perl modules:
Getopt::Long
Net::SNMP
Data::Dumper

#!/usr/local/bin/perl

################################################################################
#                                                                              #
# check_perc_raid, v1.1                                                        #
#                                                                              #
# Fraser Gutteridge, blue@udel.edu   11/08/2014                                #
# University of Delaware Network & Systems Services                            #
#                                                                              #
# Checks on Dell PowerEdge Raid Controller (PERC) devices. Output can be used  #
# with Nagios Remote Plugin Executor.                                          #
#                                                                              #
# Pre-reqs: LSI Corporation SNMP modules must be installed on queried host     #
#           available at www.lsi.com                                           #
#                                                                              #
# Tested on following PERC models:                                             #
#    PERC H700 (Dell PowerEdge R715)                                           #
#    PERC H710 Mini (Dell PowerEdge R720)                                      #
#                                                                              #
################################################################################

#
# changelog
#
# v1.0 -> v1.1 by blue, 2016-03-08
# - added "Optimal" battery backup state for PERC H700
#

use strict;
use warnings;
no warnings 'redefine';
use Getopt::Long;
use Net::SNMP;
use Socket;
use Data::Dumper;

# package globals shared by the main program and every subroutine below
our ( $host, $community, $enterprise, $lsi, %snmp_sessions, $rc, $sep,
      @crits, @warns, @unknowns, @oks, $use_snmpv2c, %name2ip,
      $prefered_maxmsgsize, $roc_warn_temp, $roc_crit_temp, $bbu_warn_cap,
      $bbu_crit_cap, $session, $verbose, $help, $bbu_warn_temp,
      $bbu_crit_temp, $adapters, $drives, $drive_warn_temp,
      $drive_crit_temp );

############################
# config vars default vals #
############################

# OID prefixes: the enterprises subtree and the LSI branch underneath it
$enterprise = '1.3.6.1.4.1';
$lsi        = $enterprise . '.3582';

# verbosity level; each -v on the command line bumps it by one
$verbose = 0;

# true selects SNMP v2c for the session, false selects v1
$use_snmpv2c = 1;

# first message size tried when walking a table
$prefered_maxmsgsize = 1024;

# RAID On Chip (ROC) temp thresholds in Centigrade
( $roc_warn_temp, $roc_crit_temp ) = ( 80, 90 );

# battery backup unit capacity remaining thresholds
( $bbu_warn_cap, $bbu_crit_cap ) = ( 20, 10 );

# battery backup unit temp thresholds in Centigrade
( $bbu_warn_temp, $bbu_crit_temp ) = ( 60, 70 );

# Seagate ST3300657SS operating temp thresholds in Centigrade
# might need to come up with a hash for this based on drive
# model no. if we start getting a lot of PERC controllers, hmm...
( $drive_warn_temp, $drive_crit_temp ) = ( 45, 50 );

###################
# end config vars #
###################

# get command line options
#   -H host, -C community string, -a check adapters (and their battery backup
#   units), -d check physical and virtual drives, -v verbose (repeatable),
#   -h usage
Getopt::Long::Configure ( "bundling" );
GetOptions ( 'H=s' => \$host,
             'C=s' => \$community,
             'a+'  => \$adapters,
             'd+'  => \$drives,
             'v+'  => \$verbose,
             'h'   => \$help );
help ( ) if $help;

if ( ! $host ) {
   print "You need to enter a hostname!\n";
   exit 2;
}
if ( ! $community ) {
   print "You need to enter a community string!\n";
   exit 2;
}
if ( ! $adapters && ! $drives ) {
   print "You need to pass either the -a or -d options, or both!\n";
   exit 2;
}

# start snmp session
$session = snmp_session ( $host, $community );

# snmp_session returns a false value when the host name cannot be resolved
# or the session cannot be set up; bail out here with a plugin-style message
# instead of dying later on a method call against a non-object
if ( ! $session ) {
   print "Unable to open an SNMP session to $host\n";
   exit 3;
}

check_if_lsi ( );

if ( $adapters ) {
   check_adapters ( );
   check_bbus ( );
}

if ( $drives ) {
   check_phys_drives ( );
   check_virt_drives ( );
}

# Build the single status line and the exit code.  Nagios exit codes are
# 0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.  Critical findings always
# win; unknown findings take precedence over warnings.
$rc = 0;
$sep = '';

if ( @unknowns ) {
   $rc = 3;
   print "Unknown ", join( ", ", @unknowns );
   $sep = '; ';
}
if ( @crits ) {
   $rc = 2;
   print $sep, "CRITICAL ", join( ", ", @crits );
   $sep = '; ';
}
if ( @warns ) {
   $rc = 1 if ( $rc == 0 );
   print $sep, "Warning ", join( ", ", @warns );
   $sep = '; ';
}
if ( $rc == 0 ) {
   print "OK ", join( ", ", @oks );
}

print "\n";
exit $rc;

###############
# subroutines #
###############

#
# low intelligence sanity check: asks the queried host for the agent module
# name under the LSI section of the enterprises area; prints a message and
# exits with status 2 when nothing comes back or when the name does not
# contain "lsi"
#
sub check_if_lsi {
   my $module_name = snmp_get_one ( $session, 'agent module name',
                                    "$lsi.4.1.3.2.0" );

   unless ( $module_name ) {
      print "The LSI SNMP module does not appear to be installed on this ",
            "host.\n";
      exit 2;
   }

   print "Module name: " . $module_name . "\n" if $verbose;

   # anything carrying "lsi" in its name is accepted as the right module
   return if $module_name =~ m/lsi/;

   print "This does not appear to be the expected LSI module.\n";
   exit 2;
}

#
# checks a few things on each MegaRAID SAS controller: the degraded virtual
# device count, the predicted-failure and failed disk counts, and the RAID On
# Chip (ROC) temperature; the counts won't give us much info on what's wrong,
# but they'll hopefully alert us that there is a problem
#
# takes no arguments; findings are pushed onto the global @crits, @warns,
# @oks and @unknowns lists
#
sub check_adapters {
   # first check and make sure there are actually adapters present
   my $adpNumber_oid = "$lsi.4.1.4.1.1.0";

   my $result = snmp_get_one ( $session, 'number of adapters', $adpNumber_oid );

   # a failed or non-numeric answer is treated like a count of zero; return
   # (rather than exit) so the main program gets to print the message
   if ( ! defined $result || $result !~ m/^\d+$/ || $result == 0 ) {
      push @unknowns, "No adapters appear to be present";
      return;
   }

   my $adapterPropertiesTable_oid = "$lsi.4.1.4.1.2";

   # data[ column ][ row ]
   my @data = walk_table ( $session, 'adapter properties table',
                           $adapterPropertiesTable_oid );

   foreach my $row ( 0 .. $#{$data[1]} ) {
      my $adpID = $data[1][$row];
      my $vdDegradedCount = $data[19][$row];
      my $pdDiskPredFailureCount = $data[23][$row];
      my $pdDiskFailedCount = $data[24][$row];
      my $temperatureROC = $data[42][$row];

      # check virtual and physical failure counts
      if ( $vdDegradedCount > 0 ) {
         push @crits, "adp $adpID : $vdDegradedCount virtual devices critical";
      }
      if ( $pdDiskPredFailureCount > 0 ) {
         push @warns, "adp $adpID : $pdDiskPredFailureCount disks are failing";
      }
      if ( $pdDiskFailedCount > 0 ) {
         push @crits, "adp $adpID : $pdDiskFailedCount disks have failed";
      }

      # check ROC temp
      if ( $temperatureROC >= $roc_crit_temp ) {
         push @crits, "adp $adpID : ROC temp $temperatureROC degrees C";
      } elsif ( $temperatureROC >= $roc_warn_temp ) {
         push @warns, "adp $adpID : ROC temp $temperatureROC degrees C";
      } else {
         push @oks, "adp $adpID : ROC temp $temperatureROC degrees C";
      }
   }

   return;
}

#
# checks the battery backup units on the controllers, if there are any:
# voltage, remaining capacity, temperature and reported state
#
# takes no arguments; findings are pushed onto the global @crits, @warns,
# @oks and @unknowns lists
#
sub check_bbus {
   # get the number of bbus on this controller, if any
   my $bbuNumber_oid = "$lsi.4.1.4.1.6.1.0";

   my $bbuNumber = snmp_get_one ( $session, 'bbu number', $bbuNumber_oid );

   # nothing to do when the count is missing, non-numeric or zero
   return unless defined $bbuNumber && $bbuNumber =~ m/^\d+$/
                 && $bbuNumber > 0;

   my $bbuTable_oid = "$lsi.4.1.4.1.6.2";

   # data[ column ][ row ]
   my @data = walk_table ( $session, 'bbu table', $bbuTable_oid );

   foreach my $row ( 0 .. $#{$data[1]} ) {
      my $bbuID = $data[1][$row];
      my $voltage = $data[9][$row];
      my $relStateOfCharge = $data[11][$row];
      my $temp = $data[16][$row];
      my $adpID = $data[21][$row];
      my $bbuState = $data[29][$row];

      # check battery voltage - most of the LSI MegaRAID SAS controllers
      # have lithium-polymer rechargeable batteries that should be around
      # 3.75V

      # units for reported voltage by LSI SNMP are usually mV so convert to
      # volts first
      $voltage = $voltage / 1000;

      if ( $voltage > 4.1 || $voltage < 3.5 ) {
         push @warns, "adp $adpID : bbu $bbuID : voltage out of range, " .
                      "reporting $voltage V";
      } else {
         push @oks, "adp $adpID : bbu $bbuID : battery voltage $voltage V";
      }

      # check remaining capacity of battery
      if ( $relStateOfCharge <= $bbu_crit_cap ) {
         push @crits, "adp $adpID : bbu $bbuID : battery capacity " .
                      "$relStateOfCharge%";
      } elsif ( $relStateOfCharge <= $bbu_warn_cap ) {
         push @warns, "adp $adpID : bbu $bbuID : battery capacity " .
                      "$relStateOfCharge%";
      } else {
         push @oks, "adp $adpID : bbu $bbuID : battery capacity " .
                    "$relStateOfCharge%";
      }

      # LSI doesn't provide temperature thresholds for their batteries in
      # their MIB files, but most Li-polymer 3.75V battery SEI layers start
      # melting at around 80 degrees C, so let's use 70 degrees as our
      # critical temp

      # the temperature is reported in degrees centigrade but there's also
      # an operating status string attached to that, so keep only the leading
      # digits; the trailing text is optional so a bare number such as "35"
      # is not truncated, and a value without leading digits is reported as
      # unknown instead of reusing a stale capture
      if ( defined $temp && $temp =~ m/^(\d+)/ ) {
         $temp = $1;

         if ( $temp >= $bbu_crit_temp ) {
            push @crits, "adp $adpID : bbu $bbuID : battery temp $temp C";
         } elsif ( $temp >= $bbu_warn_temp ) {
            push @warns, "adp $adpID : bbu $bbuID : battery temp $temp C";
         } else {
            push @oks, "adp $adpID : bbu $bbuID : battery temp $temp C";
         }
      } else {
         push @unknowns, "adp $adpID : bbu $bbuID : battery temp unreadable";
      }

      # lastly, check bbu state
      # Operational|Optimal - working fine
      # Non-Operational - problem
      if ( $bbuState ne "Operational" && $bbuState ne "Optimal" ) {
         push @crits, "adp $adpID : bbu $bbuID : reporting $bbuState";
      }
   }

   return;
}

#
# checks on the physical drives assigned to each adapter: drive state,
# predictive failure / media / other error counts, and drive temperature
#
# takes no arguments; findings are pushed onto the global @crits, @warns,
# @oks and @unknowns lists
#
sub check_phys_drives {
   # check if there are any physical drives detected
   # if not, just stop here
   my $pdNumber_oid = "$lsi.4.1.4.2.1.1.0";

   my $result = snmp_get_one ( $session, 'num physical drives', $pdNumber_oid );

   # a failed or non-numeric answer is treated like a count of zero
   if ( ! defined $result || $result !~ m/^\d+$/ || $result == 0 ) {
      push @unknowns, "No physical drives detected";
      return;
   }

   # names of the drive state codes this plugin knows about
   my %drive_states = ( 0   => "unconfigured-good",
                        1   => "unconfigured-bad",
                        2   => "hot-spare",
                        16  => "offline",
                        17  => "failed",
                        20  => "rebuild",
                        24  => "online",
                        32  => "copyback",
                        64  => "system",
                        128 => "unconfigured-shielded",
                        130 => "hotspare-shielded",
                        144 => "configured-shielded" );

   # state codes that raise a warning; 17 is critical, every other known
   # code is fine
   my %is_warn_state = map { $_ => 1 } ( 1, 16, 20, 32 );

   # check all the physical drives
   my $physicalDriveTable_oid = "$lsi.4.1.4.2.1.2";

   # data[ column ][ row ]
   my @data = walk_table ( $session, 'phys drive table',
                           $physicalDriveTable_oid );

   foreach my $row ( 0 .. $#{$data[1]} ) {
      my $mediaErrCount = $data[7][$row];
      my $otherErrCount = $data[8][$row];
      my $predFailCount = $data[9][$row];
      my $pdState = $data[10][$row];
      my $slotNumber = $data[20][$row];
      my $adpID = $data[22][$row];
      my $temperature = $data[36][$row];

      # first check the state of the device; a missing or non-numeric state
      # must not be mistaken for state 0 (unconfigured-good), so only an
      # all-digit value is looked up
      my $state_code = ( defined $pdState && $pdState =~ m/^\d+$/ )
                     ? $pdState + 0
                     : undef;
      my $state_name = defined $state_code ? $drive_states{$state_code}
                                           : undef;

      if ( ! defined $state_name ) {
         push @unknowns, "adp $adpID : slot $slotNumber drive : " .
                         "state is unknown";
      } elsif ( $state_code == 17 ) {
         push @crits, "adp $adpID : slot $slotNumber drive : state is failed";
      } elsif ( $is_warn_state{$state_code} ) {
         push @warns, "adp $adpID : slot $slotNumber drive : " .
                      "state is " . $state_name;
      } else {
         push @oks, "adp $adpID : slot $slotNumber drive : " .
                    "state is " . $state_name;
      }

      # warn about any error counts above 0 - this is probably excessive;
      # sometimes shouting into the front of a drive tray can cause disk errors
      if ( $predFailCount > 0 ) {
         push @warns, "adp $adpID : slot $slotNumber drive : reporting " .
                      "$predFailCount predictive failure errors";
      }
      if ( $mediaErrCount > 0 ) {
         push @warns, "adp $adpID : slot $slotNumber drive : reporting " .
                      "$mediaErrCount media errors";
      }
      if ( $otherErrCount > 0 ) {
         push @warns, "adp $adpID : slot $slotNumber drive : reporting " .
                      "$otherErrCount other errors";
      }

      # check drive temps
      if ( $temperature >= $drive_crit_temp ) {
         push @crits, "adp $adpID : slot $slotNumber drive : temp " .
                      "$temperature C";
      } elsif ( $temperature >= $drive_warn_temp ) {
         push @warns, "adp $adpID : slot $slotNumber drive : temp " .
                      "$temperature C";
      } else {
         push @oks, "adp $adpID : slot $slotNumber drive : temp " .
                    "$temperature C";
      }
   }

   return;
}

#
# looks at every virtual drive on each adapter: its state, whether it is
# still initializing, and whether bad blocks have been detected; findings
# are pushed onto the global @crits, @warns, @oks and @unknowns lists
#
sub check_virt_drives {
   my $vd_count = snmp_get_one ( $session, 'num virt drives',
                                 "$lsi.4.1.4.3.1.1.0" );

   if ( $vd_count == 0 ) {
      push @unknowns, "No virtual drives detected";
      return;
   }

   # names for the state codes handled below
   my %vdrive_states = ( "0" => "offline",
                         "1" => "partially-degraded",
                         "2" => "degraded",
                         "3" => "optimal" );

   # table[ column ][ row ]
   my @table = walk_table ( $session, 'virt drive table',
                            "$lsi.4.1.4.3.1.2" );

   for my $idx ( 0 .. $#{$table[1]} ) {
      my ( $virtualDevID, $state, $initState, $adpID, $badBlocksExist ) =
         map { $table[$_][$idx] } ( 1, 5, 17, 20, 23 );

      my $label = "adp $adpID : vd $virtualDevID";

      # state: 3 is fine, 2 is critical, 0 and 1 warn, anything else is
      # unknown
      if ( $state == 3 ) {
         push @oks, "$label : " . $vdrive_states{$state};
      }
      elsif ( $state == 2 ) {
         push @crits, "$label : state is " . $vdrive_states{$state};
      }
      elsif ( $state == 1 || $state == 0 ) {
         push @warns, "$label : state is " . $vdrive_states{$state};
      }
      else {
         push @unknowns, "$label : state is unknown";
      }

      # any non-zero initialization state means the drive is initializing
      push @warns, "$label : initializing" unless $initState == 0;

      # any non-zero value here means bad blocks were found
      push @warns, "$label : bad blocks detected"
         unless $badBlocksExist == 0;
   }
}

#
# prints the usage summary to stdout and exits with status 0
#
sub help {
   # the <host> and <community> placeholders mark the values -H and -C expect
   print <<"END_USAGE";
Usage: $0 -H <host> -C <community> [-a] [-d] [-v]
          -H    host utilizing an LSI PERC RAID controller
          -C    SNMP community string for host
          -a    check controller adapters
          -d    check controller disk drives
          -v    verbose mode
          -h    print this message
END_USAGE
   exit 0;
}

#
# thanks to Doke Scott for his nifty SNMP walking and storage subroutines :)
#  
# Opens a Net::SNMP session to a host.
#
#   $host      - host name or address to query
#   $community - SNMP community string
#
# Returns the session object, or a false value (after a warning on stderr)
# when the host name cannot be resolved or the session cannot be set up.
sub snmp_session {
   my ( $host, $community ) = @_;

   # resolve the name ourselves and hand Net::SNMP a dotted-quad address
   my $packed_ip = scalar gethostbyname( $host );
   if ( ! $packed_ip ) {
      warn "can't lookup ip address for $host\n";
      return;
   }
   my $ipaddr = inet_ntoa( $packed_ip );

   $verbose && print "scanning $host, $ipaddr, $community\n";

   my ( $session, $error ) = Net::SNMP->session(
       -version => $use_snmpv2c ? 'snmpv2c' : 'snmpv1',
       -hostname => $ipaddr,
       -community => $community,
       -retries => 3,
       -maxmsgsize => 2000,
       #-debug => 0x02
       );
   if ( ! defined( $session ) ) {
       warn "snmp setup error: $error\n";
       return;
   }

   # hand octet strings back untranslated
   $session->translate( [ '-octetstring' => 0 ] );

   return $session;
}
   
# Fetches a single OID.
#
#   $session - SNMP session object
#   $name    - human readable label used in verbose and error output
#   $oid     - the OID to fetch
#
# Returns the value stored under $oid in the response, or undef (after a
# warning on stderr) when the request fails or the OID is missing from the
# response.
sub snmp_get_one {
    my ( $session, $name, $oid ) = @_;

    print "snmp_get $name\n" if $verbose;

    my $response = $session->get_request( -varbindlist => [ $oid ] );

    unless ( defined $response ) {
        warn sprintf "error getting %s from %s: %s\n",
            $name, $session->hostname, $session->error();
        return undef;
    }

    return $response->{ $oid } if exists $response->{ $oid };

    warn "snmp_get error: requested $name oid $oid not in response\n";
    return undef;
}

# Fetches several OIDs in one request.
#
#   $session - SNMP session object
#   $name    - human readable label used in verbose and error output
#   $oids    - array reference of OIDs to fetch
#
# Returns the response hash reference with every entry whose value is
# 'noSuchObject' removed, or undef (after a warning on stderr) when the
# request fails.
sub snmp_get {
    my ( $session, $name, $oids ) = @_;

    print "snmp_get $name\n" if $verbose;

    my $response = $session->get_request( -varbindlist => $oids );

    unless ( defined $response ) {
        warn sprintf "error getting %s from %s: %s\n",
            $name, $session->hostname, $session->error();
        return undef;
    }

    # clean the noSuchObject errors out of the response
    my @missing = grep { $response->{ $_ } eq 'noSuchObject' }
                  keys %{ $response };
    delete @{ $response }{ @missing };

    return $response;
}

# Walks the table rooted at $baseoid, retrying with a series of message
# sizes when the agent's answer does not fit in the current one.
#
#   $session - SNMP session object
#   $name    - human readable label used in verbose and error output
#   $baseoid - OID of the table to walk
#
# Returns the hash reference produced by get_table, or undef when the table
# is empty, when every message size has been tried, or (after a warning on
# stderr) on any other error.
sub snmp_walk {
    my ( $session, $name, $baseoid ) = @_;

    print "snmp_walk $name\n" if $verbose;

    my @sizes = ( $prefered_maxmsgsize, 1472, 1600, 1800, 2048, 4096, 9000 );

    SIZE:
    for my $size ( @sizes ) {
        $session->max_msg_size( $size );
        print "snmp get_entries tring maxmsgsize $size\n" if $verbose > 1;

        my $table = $session->get_table( -baseoid => $baseoid );
        return $table if defined $table;

        my $problem = $session->error();

        return undef if $problem =~ m/Requested table is empty/;

        if ( $problem =~ m/Message size exceeded buffer maxMsgSize/ ) {
            # retry with the next size in the list
            print "snmp get_entries failed with maxmsgsize $size\n"
                if $verbose;
            next SIZE;
        }

        warn sprintf "error walking %s table on %s: %s\n",
            $name, $session->hostname, $problem;
        return undef;
    }

    # ran out of message sizes to try
    return undef;
}

# for tables with simple 1 integer indices, ie most normal ones
#
#   $session - SNMP session object
#   $name    - human readable label used in verbose output and messages
#   $baseoid - OID of the table to walk
#
# Returns an array of arrays: data[ column ][ row ], where column and row are
# the last two numeric components of each OID in the walk.  Returns an empty
# list, after pushing a message onto the global @unknowns list, when the walk
# fails or yields no usable rows, so a failed walk can never pass as OK.
sub walk_table {
    my ( $session, $name, $baseoid ) = @_;
    my @data;

    $verbose && print "walk_table $name\n";

    my $result = snmp_walk( $session, $name, $baseoid );
    if ( ! $result ) {
        push @unknowns, "unable to read $name table";
        return;
    }

    foreach my $oid ( keys %{ $result } ) {
        my $val = $result->{ $oid };
        if ( $verbose ) { print "$oid = $val\n"; }

        next if ( $val eq 'endOfMibView' );

        # ...<column>.<row> at the end of the OID picks the cell
        if ( $oid =~ m/.*\.(\d+)\.(\d+)$/ ) {
            $data[$1][$2] = $val;
        }
    }

    if ( ! @data ) {
        push @unknowns, "no rows in $name table";
        return;
    }

    return @data;
}