#!/usr/local/bin/perl
#
# Check an elom equipped sun
# So far, this means x2100s and x2200s
# running sp firmware version 1.80, 2.70, 2.91, and maybe 3.09 and 3.13
# Though I don't trust 3.09.
#
# Nagios plugin.  Queries the service processor over SNMP v1 and reports
# on cpus, memory, nic macs, fans, temperatures and voltages.  Prints a
# single status line and exits with the standard plugin return codes
# (0 ok, 1 warning, 2 critical, 3 unknown).
#
# $Header: /home/doke/work/nagios/RCS/check_elom,v 1.20 2016/01/11 23:40:43 doke Exp $

use strict;     # for testing
use warnings;   # for testing

use Getopt::Long;
use Net::SNMP;
use Time::HiRes qw( usleep gettimeofday );
#use Data::Dumper;

# Nagios plugin return codes.
use constant {
    OK       => 0,
    WARNING  => 1,
    CRITICAL => 2,
    UNKNOWN  => 3,
};

my $community = 'public';
my $timeout = 10;
my $delay = 500000;   # microseconds to pause before each snmp request

my $host = '';
my $verbose = 0;
my $help = 0;

my $enterprises = '1.3.6.1.4.1';
my $sun = "$enterprises.42";
my $sp = "$sun.1";          # service processor mib for v1.8 - 2.91
my $sp309 = "$sun.2.208";   # They moved the mib between 2.91 and 3.09

$ENV{PATH} = "/usr/bin:/bin/:/usr/sbin";

# Message lists filled in by the check_* subs, and the "everything reads
# as off" flags used to detect a powered-down system.
my( @crits, @warns, @unknowns, @ignores, @oks,
    $all_fans_off, $all_temps_off, $all_voltages_off );


# Print the usage text and exit with the given return code.
sub usage {
    my( $rc ) = @_;

    print "Usage: $0 [-vh] -H <host> [-C <community>]\n",
        "    -H s   hostname\n",
        "    -C s   snmp community [public]\n",
        "    -v     verbose\n",
        "    -h     help\n";
    exit $rc;
}


Getopt::Long::Configure( "bundling" );
GetOptions(
    'H=s' => \$host,
    'C=s' => \$community,
    'v+'  => \$verbose,
    'h'   => \$help,
) or usage( UNKNOWN );

usage( OK ) if ( $help );
# A missing host is a usage error; exiting 0 would look like "OK" to nagios.
usage( UNKNOWN ) if ( ! $host );

check();

my $rc = OK;
my $sep = '';
if ( @crits ) {
    $rc = CRITICAL;
    print "CRITICAL ", join( ", ", @crits );
    $sep = '; ';
}
if ( @warns ) {
    $rc = WARNING if ( $rc == OK );
    print $sep, "Warning ", join( ", ", @warns );
    $sep = '; ';
}
if ( @unknowns ) {
    $rc = UNKNOWN if ( $rc == OK );
    print $sep, "Unknown ", join( ", ", @unknowns );
    $sep = '; ';
}
if ( $rc == OK ) {
    print "Ok ", join( ", ", @oks );
    $sep = '; ';
}
if ( @ignores ) {
    print $sep, "Ignoring ", join( ", ", @ignores );
    $sep = '; ';
}
print "\n";
exit $rc;


##################


# Return '' for undef so optional snmp cells can be printed safely.
sub text {
    my( $value ) = @_;
    return defined $value ? $value : '';
}


# True if both values are defined and $value >= $limit.
sub at_least {
    my( $value, $limit ) = @_;
    return defined $value && defined $limit && $value >= $limit;
}


# True if both values are defined and $value <= $limit.
sub at_most {
    my( $value, $limit ) = @_;
    return defined $value && defined $limit && $value <= $limit;
}


# Run a code ref and, when verbose, print how long it took.
sub timed {
    my( $code ) = @_;

    my $start = gettimeofday();
    $code->();
    $verbose && printf "elapsed %0.3f s\n", gettimeofday() - $start;
    return;
}


# Return the name => oid map of the identity scalars under a mib base.
sub identity_oids {
    my( $base ) = @_;

    return (
        FWVersion           => "$base.1.1.2.0",
        BoardProductName    => "$base.1.2.1.2.0",
        ChassisPartNumber   => "$base.1.2.6.2.0",
        ChassisSerialNumber => "$base.1.2.6.3.0",   # NOT system serial number
    );
}


# Snmp did not answer: ping the host to tell "host down" (critical) from
# "host up but snmp not answering" (unknown).
sub ping_diagnosis {
    my( $target ) = @_;

    my @ping;
    if ( -x "/bin/sun" ) {
        @ping = ( "/usr/sbin/ping", $target, 1 );
    }
    else {
        @ping = ( "ping", "-c", 2, "-W", 2, $target );
    }
    $verbose && print "+ @ping 2>&1\n";

    # List-form pipe open: the host name is passed as an argument and is
    # never parsed by a shell.  The small sh wrapper only folds the
    # command's stderr into the pipe so error text ends up in the message.
    my $ph;
    if ( ! open( $ph, '-|', '/bin/sh', '-c', 'exec "$@" 2>&1', 'sh', @ping ) ) {
        push @unknowns, "unable to run ping: $!";
        return;
    }

    my @lines;
    while ( my $line = <$ph> ) {
        $verbose && print ">$line";
        chomp $line;
        push @lines, $line;
    }
    close $ph;

    # Test the whole wait status so a signal-killed ping is not mistaken
    # for success.
    if ( $? != 0 ) {
        push @crits, "ping failed: " . join( ', ', @lines );
    }
    else {
        push @unknowns, "ping worked, snmp may not be configured";
    }
    return;
}


# Top level check: open the snmp session, identify the unit, then run
# each subsystem check.  Results are pushed onto the message lists.
sub check {
    my $before = gettimeofday();

    my( $session, $error ) = Net::SNMP->session(
        -version   => 'snmpv1',   # elom can get overloaded by 2c
        -hostname  => $host,
        -community => $community,
        -timeout   => $timeout,
        #-debug => 0x02
    );
    if ( ! defined( $session ) ) {
        push @unknowns, "snmp setup error: $error";
        return;
    }

    $session->translate( [ '-octetstring' => 0 ] );
    #$session->translate( '-all' => 0 );
    #$session->translate( '-unsigned' => 1 );

    # model and firmware version
    # make sure we know how to read this unit
    # Try the original mib location first, then the one used from 3.09 on.
    my @fields = qw( FWVersion BoardProductName ChassisPartNumber
        ChassisSerialNumber );
    my( $base, $result, %oid );
    foreach my $try ( [ $sp, 'getting' ],
            [ $sp309, 'getting alternate location for' ] ) {
        my $label;
        ( $base, $label ) = @$try;
        %oid = identity_oids( $base );
        $verbose && print "$label ",
            join( ", ", map { "$_ $oid{$_}" } @fields ), "\n";
        $result = $session->get_request( -varbindlist => [ @oid{ @fields } ] );

        my $fw = defined $result ? $result->{ $oid{FWVersion} } : undef;
        last if ( defined $result
            && ! ( defined $fw && $fw eq 'noSuchObject' ) );
    }

    if ( ! defined( $result ) ) {
        push @unknowns, "couldn't get fw version " . $session->error();
        ping_diagnosis( $host );
        return;
    }

    my $FWVersion = text( $result->{ $oid{FWVersion} } );
    my $board = text( $result->{ $oid{BoardProductName} } );
    if ( $verbose ) {
        foreach my $field ( @fields ) {
            print "$field = ", text( $result->{ $oid{$field} } ), "\n";
        }
    }

    my $model;
    if ( $board eq 'S39' ) {
        $model = 'X2200 M2';
    }
    elsif ( $board eq 'S40' ) {
        # X2100, base or m2
        $model = 'X2100';
    }
    else {
        push @unknowns, "unknown board model " . $board;
        return;
    }

    # do we know how to read this firmware version's mib?
    # 3.09 and later live under sun.products(2).208 instead of sun.1
    my %known_fw = map { $_ => 1 }
        qw( 3.23 3.20 3.15 3.13 3.09 2.91 2.70 1.80 );
    if ( ! $known_fw{ $FWVersion } ) {
        push @unknowns, "unknown firmware revision " . $FWVersion;
        return;
    }

    if ( $result->{ $oid{ChassisSerialNumber} } ) {
        push @oks, $model;
        # The chassis sn is not the system sn, so it is not reported.
        push @oks, "fw " . $FWVersion;
    }
    $verbose && printf "elapsed %0.3f s\n", gettimeofday() - $before;

    timed( sub { check_cpus( $session, $base ) } );
    timed( sub { check_memory( $session, $base ) } );

    # nic1 and nic2 mac addresses
    my %mac_oid = (
        '1a' => "$base.1.2.4.7.0",
        '1b' => "$base.1.2.4.8.0",
        '2a' => "$base.1.2.5.7.0",
        '2b' => "$base.1.2.5.8.0",
    );
    usleep( $delay );
    $result = $session->get_request(
        -varbindlist => [ @mac_oid{ qw( 1a 1b 2a 2b ) } ] );
    if ( defined( $result ) ) {
        foreach my $nic ( 1, 2 ) {
            push @oks, "nic$nic macs "
                . lc( text( $result->{ $mac_oid{ $nic . 'a' } } ) ) . ' '
                . lc( text( $result->{ $mac_oid{ $nic . 'b' } } ) );
        }
    }
    else {
        push @unknowns, "couldn't get nic macs " . $session->error();
    }

    $all_fans_off = $all_temps_off = $all_voltages_off = 0;

    timed( sub { check_fans( $session, $base, $FWVersion ) } );
    timed( sub { check_temps( $session, $base ) } );
    timed( sub { check_voltages( $session, $base ) } );

    # if they're all off, then the system is off.
    # NOTE(review): check_fans never sets $all_fans_off (fan reporting is
    # unreliable on some firmware), so this branch cannot currently fire.
    # Confirm whether unreadable fans should count as "off" here.
    if ( $all_fans_off && $all_temps_off && $all_voltages_off ) {
        @warns = ();
        @crits = ( "system is off" );
    }
    return;
}


# Warn about every cpu whose status is not 2 (enabled).
sub check_cpus {
    my( $session, $sp ) = @_;

    # cpu table
    # sp.1.2.2.1.1
    # 1 index
    # 2 Designation
    # 3 Manufacturer
    # 4 Name/model, ie Opteron
    # 5 speed in MHz
    # 6 status, 2 = enabled
    my @data = walk_table( $session, 'cpus', "$sp.1.2.2.1.1" );
    foreach my $row ( table_rows( @data ) ) {
        my $status = $data[6][$row];
        if ( ! defined $status || $status != 2 ) {
            my $name = defined $data[2][$row] ? $data[2][$row] : "cpu $row";
            push @warns, "$name not enabled";
        }
    }
    return;
}


# True for the memory status values treated as healthy.
sub memory_status_ok {
    my( $status ) = @_;
    return $status == 1 || $status == 2 || $status == 4;
}


# Warn about every memory slot whose status is not 1, 2 or 4.
sub check_memory {
    my( $session, $sp ) = @_;

    # memory table
    # sp.1.2.3.1.1
    # 1 index
    # 2 designation, ie "CPU0 DIMM 0"
    # 3 type, ie "DDR2 DRAM" or "no DIMM present"
    # 4 speed
    # 5 size in MB
    # 6 status, 1 = ok?, 2 = ok?, 4 = not present?
    #
    # Only the status column is walked up front; walking the whole table
    # (or the size column, to report total memory) takes a long time.
    my %status = walk_column( $session, 'memory status', "$sp.1.2.3.1.1.6" );
    my @bad_rows = sort { $a <=> $b }
        grep { ! memory_status_ok( $status{$_} ) } keys %status;
    return if ( ! @bad_rows );

    # we've got a memory error, get the display names
    my %desig = walk_column( $session, 'memory designation',
        "$sp.1.2.3.1.1.2" );
    if ( ! %desig ) {
        push @warns, "there are memory errors, but unable to retrieve descriptions";
        return;
    }

    foreach my $row ( @bad_rows ) {
        my $name = defined $desig{$row} ? $desig{$row} : "memory row $row";
        push @warns, "$name bad";
    }
    return;
}


# Warn about fans running below their low critical speed.  If every fan
# reports a bad status the fan table is assumed to be unusable and is
# ignored.
sub check_fans {
    my( $session, $sp, $FWVersion ) = @_;

    # fan table
    # sp.1.2.8.1.1
    # 1 index
    # 2 designation, ie "Blower Fan 0"
    # 3 status, 2 = ok, 4 = off/failed/bad?
    # 4 speed ?
    # 5 speed ?, min/max/avg speed ?
    # 6 LowCriticalValue
    #
    # The fans table is broken in 3.09; that case is caught below by the
    # "every fan is bad" test rather than by firmware version.
    my %status = walk_column( $session, 'fan status', "$sp.1.2.8.1.1.3" );
    my $n = keys %status;
    my $nbad = grep { $_ != 2 } values %status;
    $verbose && print "fans n $n, nbad $nbad \n";

    if ( $n > 0 && $n == $nbad ) {
        # If none of them were valid, assume fan sensing/reporting is broken.
        # It's definitely broken in version 3.09 and sometimes in 3.13.
        # We'll just hope the temperature table will show problems.
        push @ignores, "no fan speeds available";
        #$all_fans_off = 1;
        return;
    }
    return if ( $nbad == 0 );

    # we've got fans, and at least one of them is bad
    my %desig = walk_column( $session, 'fan designations', "$sp.1.2.8.1.1.2" );
    my %speed = walk_column( $session, 'fan speeds', "$sp.1.2.8.1.1.4" );
    my %low = walk_column( $session, 'fan lowCriticals', "$sp.1.2.8.1.1.6" );

    foreach my $row ( sort { $a <=> $b } keys %low ) {
        # a failed speed walk leaves nothing to compare against
        next if ( ! defined $speed{$row} );
        next if ( $speed{$row} >= $low{$row} );

        my $name = defined $desig{$row} ? $desig{$row} : "fan $row";
        if ( $speed{$row} == 0 ) {
            push @warns, "$name is stopped";
        }
        else {
            push @warns, "$name is slow " . $speed{$row};
        }
    }
    return;
}


# Report temperatures at or above their warning / critical levels.  If
# every sensor is unavailable, note it and set $all_temps_off.
sub check_temps {
    my( $session, $sp ) = @_;

    # temp table
    # sp.1.2.9.1.1
    # 1 index
    # 2 designation, ie "CPU 0 Temp"
    # 3 status, 3 = ok, 4 = off/na/low?
    # 4 current temp
    # 5 high warning level
    # 6 high critical level
    my %status = walk_column( $session, 'temp status', "$sp.1.2.9.1.1.3" );
    my $n = keys %status;
    my $nbad = grep { $_ != 3 } values %status;
    $verbose && print "temps n $n, nbad $nbad \n";

    if ( $n > 0 && $n == $nbad ) {
        # there are temp rows, but all are unavailable/invalid
        push @ignores, "no temps available";
        $all_temps_off = 1;
        return;
    }
    return if ( $nbad == 0 );

    # we've got temps, and at least one of them is bad
    my @data = walk_table( $session, 'temps', "$sp.1.2.9.1.1" );
    foreach my $row ( table_rows( @data ) ) {
        my( $name, $state, $temp, $warn_at, $crit_at )
            = map { $data[$_][$row] } 2 .. 6;
        $verbose && print join( ' ', $row,
            map { text( $_ ) } $name, $state, $temp, $warn_at, $crit_at ),
            " \n";
        $name = "temp $row" if ( ! defined $name );

        if ( at_least( $temp, $crit_at ) ) {
            push @crits, "$name is high " . $temp;
        }
        elsif ( at_least( $temp, $warn_at ) ) {
            push @warns, "$name is high " . $temp;
        }
        elsif ( defined $state && $state == 4 ) {
            push @warns, "$name not available";
        }
        elsif ( ! defined $state || $state != 3 ) {
            push @warns, "$name bad " . text( $temp );
        }
    }
    return;
}


# Report voltages outside their warning / critical bands.  If every
# sensor is unavailable, note it and set $all_voltages_off.
sub check_voltages {
    my( $session, $sp ) = @_;

    # voltages table
    # sp.1.2.10.1.1
    # 1 index
    # 2 designation, ie "Vcc 12V"
    # 3 status, 3 = ok, 4 = off/na/low?
    # 4 current voltage
    # 5 low warning level
    # 6 low critical level
    # 7 high warning level
    # 8 high critical level
    my %status = walk_column( $session, 'voltage status',
        "$sp.1.2.10.1.1.3" );
    my $n = keys %status;
    my $nbad = grep { $_ != 3 } values %status;
    $verbose && print "voltages n $n, nbad $nbad \n";

    if ( $n > 0 && $n == $nbad ) {
        # there are voltage rows, but none of them has an ok status
        push @ignores, "no voltages available";
        $all_voltages_off = 1;
        return;
    }
    return if ( $nbad == 0 );

    # At least one of them was invalid, so do the voltage errors.
    my @data = walk_table( $session, 'voltages', "$sp.1.2.10.1.1" );
    foreach my $row ( table_rows( @data ) ) {
        my( $name, $state, $volts, $low_warn, $low_crit, $high_warn,
            $high_crit ) = map { $data[$_][$row] } 2 .. 8;
        $verbose && print join( ' ', $row,
            map { text( $_ ) } $name, $state, $volts, $low_warn, $low_crit,
                $high_warn, $high_crit ), " \n";
        $name = "row $row" if ( ! defined $name );

        if ( at_most( $volts, $low_crit ) ) {
            push @crits, "voltage $name is low " . $volts;
        }
        elsif ( at_most( $volts, $low_warn ) ) {
            push @warns, "voltage $name is low " . $volts;
        }
        elsif ( at_least( $volts, $high_crit ) ) {
            push @crits, "voltage $name is high " . $volts;
        }
        elsif ( at_least( $volts, $high_warn ) ) {
            push @warns, "voltage $name is high " . $volts;
        }
        elsif ( defined $state && $state == 4 ) {
            push @warns, "voltage $name not available";
        }
        elsif ( ! defined $state || $state != 3 ) {
            push @warns, "voltage $name bad " . text( $volts );
        }
    }
    return;
}


# Return the sorted row numbers (>= 1) that hold a value in any column
# of a table returned by walk_table.  Empty list for an empty table.
sub table_rows {
    my( @data ) = @_;

    my %seen;
    foreach my $col ( grep { ref $_ eq 'ARRAY' } @data ) {
        foreach my $row ( 1 .. $#{ $col } ) {
            $seen{ $row } = 1 if ( defined $col->[ $row ] );
        }
    }
    return sort { $a <=> $b } keys %seen;
}


# Walk one table column and return it as a ( row number => value ) list.
# Returns an empty list if the walk failed; snmpwalk has then already
# recorded the reason in @unknowns.
sub walk_column {
    my( $session, $name, $baseoid ) = @_;

    my $result = snmpwalk( $session, $name, $baseoid );
    return if ( ! defined $result );

    my %value_of;
    foreach my $oid ( keys %$result ) {
        my $val = $result->{ $oid };
        $verbose && print "$oid $val\n";
        if ( $oid =~ m/\.(\d+)$/ ) {
            $value_of{ $1 } = $val;
        }
    }
    return %value_of;
}


# Walk a whole table and return it as a list of column array refs, so
# that $data[$col][$row] is one cell.  Column and row numbers are the
# last two components of each oid.  Returns an empty list on failure.
sub walk_table {
    my( $session, $name, $baseoid ) = @_;

    $verbose && print "snmpwalking table $name\n";
    my $result = snmpwalk( $session, $name, $baseoid );
    return if ( ! defined $result );

    my @data;
    foreach my $oid ( keys %$result ) {
        my $val = $result->{ $oid };
        $verbose > 1 && print "$oid = $val\n";
        next if ( $val eq 'endOfMibView' );
        if ( $oid =~ m/\.(\d+)\.(\d+)$/ ) {
            $data[ $1 ][ $2 ] = $val;
        }
    }
    return @data;
}


# Fetch everything under $baseoid, pausing $delay microseconds first so
# the service processor is not flooded.  Returns the oid => value hash
# ref, or undef after recording the problem in @unknowns.
sub snmpwalk {
    my( $session, $name, $baseoid ) = @_;

    $verbose && print "snmpwalking $name\n";
    usleep( $delay );
    my $result = $session->get_table( -baseoid => $baseoid );

    if ( ! defined( $result )
            && $session->error() !~ m/Requested table is empty/ ) {
        push @unknowns, sprintf "error walking %s table on %s: %s",
            $name, $session->hostname, $session->error();
        return;
    }
    if ( ! defined( $result ) || ! %$result ) {
        push @unknowns, sprintf "no rows in %s table on %s: %s",
            $name, $session->hostname, $session->error();
        return;
    }
    return $result;
}