#!/usr/local/bin/perl
#
# Check fmdump output for faults
#
# $Header: /opt/home/doke/work/nagios/RCS/check_fmdump,v 1.16 2015/03/17 20:28:19 doke Exp $



use strict;
use warnings;
use Getopt::Long;

# Fixed search path for the commands this script runs without an absolute
# path (sudo, zonename).
$ENV{PATH} = "/usr/local/bin:/opt/sfw/bin:/usr/bin:/bin:/usr/sbin:/sbin";

# Package globals shared between the main program and the check subs.
our ( $verbose, $help );                            # command line switches
our ( @crits, @warns, @unknowns, @ignores, @oks );  # messages per status
our ( $rc, $sep );                                  # exit status, output separator

$help    = 0;
$verbose = 0;


# Print the option summary on stdout and terminate the script.
#
# Arguments:
#   first argument - exit status the process terminates with
#
# Never returns.
sub usage {
    my $exit_status = shift;

    # Each element is one output line; the trailing empty element supplies
    # the final newline.
    my @text = (
        "Usage: $0 [-vh] ",
        "    -v    verbose",
        "    -h    help",
        "",
    );
    print join( "\n", @text );

    exit $exit_status;
}

# Main program: parse the command line, run the checks, then print a single
# status line assembled from the message lists the checks filled in, and exit
# with the Nagios plugin return code (0 ok, 1 warning, 2 critical, 3 unknown).

Getopt::Long::Configure( "bundling" );
GetOptions(
    'v+' => \$verbose,
    'h'  => \$help,
);
usage( 0 ) if ( $help );


# The fmdump history is only consulted when check_fmadm_faulty returned 0,
# i.e. fmadm could be started and reported no active faults.
$rc = check_fmadm_faulty();
if ( ! $rc ) {
    check_fmdump( "5d", 1 );   # report faults in last 5 days as warnings
}


# The most severe non-empty list decides the exit status; every non-empty
# list still contributes its messages to the output line.
$rc = 0;
$sep = '';
if ( @crits ) {
    $rc = 2;
    print "CRITICAL ", join( ", ", @crits );
    $sep = '; ';
}
if ( @warns ) {
    $rc = 1 if ( $rc == 0 );
    print $sep, "Warning ", join( ", ", @warns );
    $sep = '; ';
}
if ( @unknowns ) {
    # 3 is the plugin API's UNKNOWN state; a negative value would reach the
    # caller as exit status 255, which is outside the defined 0-3 range.
    $rc = 3 if ( $rc == 0 );
    print $sep, "Unknown ", join( ", ", @unknowns );
    $sep = '; ';
}
if ( $rc == 0 ) {
    print "Ok ", join( ", ", @oks );
    $sep = '; ';
}
if ( @ignores ) {
    print $sep, "Ignoring ", join( ", ", @ignores );
    $sep = '; ';
}
print "\n";
exit $rc;


##################

# Run "fmadm faulty" through sudo and record every active fault as critical.
#
# Side effects:
#   - pushes "<severity> <msg-id> <fru>" onto @crits for each FRU line, plus
#     a final "<n> faults" summary when any fault was seen
#   - pushes onto @unknowns when fmadm cannot be started, when its output
#     contains "failed to retrieve", or when sudo asks for a password or
#     rejects the user
#   - prints an "n/a" line and exits 0 when fmadm is not installed, or when
#     no header and no fault was seen and zonename reports something other
#     than "global"
#
# Returns 2 if any fault was found, 1 if fmadm could not be started,
# otherwise 0.
sub check_fmadm_faulty {
    my $fmadm = '/usr/sbin/fmadm';

    if ( ! -e $fmadm ) {
        print "n/a, fmadm not present on this os release\n";
        exit 0;
    }

    # stdin is /dev/null and stderr is merged into the pipe, so a sudo
    # password prompt or refusal shows up as ordinary output below
    # (presumably this also keeps sudo -S from waiting for input).
    my $shell_cmd = "sudo -S $fmadm faulty < /dev/null 2>&1";
    $verbose && print "cmd $shell_cmd |\n";
    my $ph;
    if ( ! open( $ph, '-|', $shell_cmd ) ) {
        my $err = $!;    # capture before the file test below can disturb it
        if ( ! -x $fmadm ) {
            print "n/a, fmadm not present on this os release\n";
            exit 0;
        }
        # No trailing newline: the message is joined into the one-line
        # status output by the main program.
        push @unknowns, "can't run fmadm: $err";
        return 1;
    }

#taxi:~62# fmadm  faulty
#--------------- ------------------------------------  -------------- ---------
#TIME            EVENT-ID                              MSG-ID         SEVERITY
#--------------- ------------------------------------  -------------- ---------
#Oct 02 20:06:36 a2a8fce9-7f6f-c949-c4c0-cf6b543815c9  AMD-8000-2F    Major
#
#Fault class : fault.memory.dimm_sb
#Affects     : mem:///motherboard=0/chip=3/memory-controller=0/dimm=0/rank=1
#                  degraded but still in service
#FRU         : "CPU 3 DIMM 0" (hc://:product-id=Sun-Fire-V40z:chassis-id=XG043885026:server-id=taxi.nss.udel.edu/motherboard=0/chip=3/memory-controller=0/dimm=0)
#
#Description : The number of errors associated with this memory module has
#              exceeded acceptable levels.  Refer to
#              http://sun.com/msg/AMD-8000-2F for more information.
#
#Response    : Pages of memory associated with this memory module are being
#              removed from service as errors are reported.
#
#Impact      : Total system memory capacity will be reduced as pages are
#              retired.
#
#Action      : Schedule a repair procedure to replace the affected memory
#              module.  Use fmdump -v -u <EVENT_ID> to identify the module.
#

#gjallarhorn:~doke/work/nagios17# fmadm faulty
#   STATE RESOURCE / UUID
#-------- ----------------------------------------------------------------------

    my ( $severity, $msg ) = ( '', '' );
    my $saw_header = 0;
    my $nfaults = 0;
    while ( my $line = <$ph> ) {
        chomp $line;
        $verbose && print ">$line\n";
        if ( $line =~ m/^[\s-]*$/ ) {
            # ignore blank and ---- lines
        }
        elsif ( $line =~ m/^TIME|^\s+STATE RESOURCE/ ) {
            $saw_header = 1;
        }
        elsif ( $line =~ m/ [\da-f-]{36} +(\S+) +(\S+) *$/i ) {
            # event line: uuid, msg-id, severity; remembered for the FRU
            # line(s) that follow
            $msg = $1;
            $severity = $2;
            $nfaults++;
            $verbose && print "fault msg $msg, severity $severity, nfaults $nfaults\n";
        }
        elsif ( $line =~ m/^FRU\s*:\s*(\S.*)/ ) {
            my $fru = $1;
            push @crits, "$severity $msg $fru";
        }
        elsif ( $line =~ m/failed to retrieve/ ) {
            push @unknowns, $line;
        }
        elsif ( $line =~ m/Password:|is not in the sudoers file/ ) {
            push @unknowns, "sudo is not configured to allow this user";
        }
    }
    close $ph;

    $verbose && print "saw_header $saw_header, nfaults $nfaults\n";
    if ( $nfaults ) {
        push @crits, "$nfaults faults";
    }
    elsif ( ! $saw_header && -x '/usr/sbin/zonename' ) {
        my $zone = `zonename`;
        $zone = '' unless defined $zone;    # backticks yield undef if the command cannot start
        chomp $zone;
        $verbose && print "zone $zone\n";
        # In the global zone output without a header is deliberately not
        # reported; anywhere else fmadm is treated as not applicable.
        if ( $zone ne 'global' ) {
            print "n/a, fmadm doesn't apply in a zone\n";
            exit 0;
        }
    }

    return ( $nfaults ? 2 : 0 );
}




# Check fmdump for faults logged within the specified time period.
#
# Arguments:
#   $period - value handed to "fmdump -t", e.g. "5d"
#   $level  - 2 reports any faults found as critical; any other value
#             reports them as warnings
#
# Side effects:
#   - pushes "<n> faults in last <period>: <msg-ids>" onto @crits or @warns
#   - pushes onto @unknowns when fmdump cannot be started, when an output
#     line contains "failed", or when no header and no fault was seen in
#     the global zone
#   - prints an "n/a" line and exits 0 when fmdump is not executable after a
#     failed open, or when no header and no fault was seen and zonename
#     reports something other than "global"
#
# Returns the number of fault lines found, or 1 if fmdump could not be
# started.
sub check_fmdump {
    my( $period, $level ) = @_;

    my $fmdump = '/usr/sbin/fmdump';
    my $shell_cmd = "$fmdump -t $period 2>&1";
    $verbose && print "cmd $shell_cmd |\n";
    my $ph;
    if ( ! open( $ph, '-|', $shell_cmd ) ) {
        my $err = $!;    # capture before the file test below can disturb it
        # "not present" only applies when the binary is NOT executable
        if ( ! -x $fmdump ) {
            print "n/a, fmdump not present on this os release\n";
            exit 0;
        }
        # No trailing newline: the message is joined into the one-line
        # status output by the main program.
        push @unknowns, "can't run fmdump: $err";
        return 1;
    }

#TIME                 UUID                                 SUNW-MSG-ID
#Mar 15 2007 16:27:08 2fb3c407-e257-6b01-bf89-c8b2a99bb705 ZFS-8000-CS
#Mar 15 2007 16:27:09 ceb644d3-5103-6c1d-b1dc-dead235584dc ZFS-8000-CS

#TIME                 UUID                                 SUNW-MSG-ID
#Aug 04 23:41:37.6455 b23284a7-563c-ceb7-8fcd-b33f3b3d5a26 SUN4U-8000-2S
#Aug 04 23:41:39.8733 7d8f124c-046a-6f62-fab5-dda6bb4a75cc SUN4U-8000-35

    my $saw_header = 0;
    my $nfaults = 0;
    my %faults;    # msg-id => number of occurrences
    while ( my $line = <$ph> ) {
        chomp $line;
        $verbose && print ">$line\n";
        if ( $line =~ m/^TIME/ ) {
            $saw_header = 1;
        }
        elsif ( $line =~ m/ [\da-f-]{36} +(\S+) *$/i ) {
            # event line: uuid followed by the msg-id
            $nfaults++;
            $faults{ $1 }++;
        }
        elsif ( $line =~ m/failed/ ) {
            push @unknowns, $line;
        }
    }
    close $ph;

    $verbose && print "saw_header $saw_header, nfaults $nfaults\n";
    if ( $nfaults ) {
        my $summary = "$nfaults faults in last $period: "
            . join( " ", sort keys %faults );
        if ( $level == 2 ) {
            push @crits, $summary;
        }
        else {
            push @warns, $summary;
        }
    }
    elsif ( ! $saw_header ) {
        my $zone = `zonename`;
        $zone = '' unless defined $zone;    # backticks yield undef if the command cannot start
        chomp $zone;
        $verbose && print "zone $zone\n";
        if ( $zone eq 'global' ) {
            push @unknowns, "can't get fmdump data";
        }
        else {
            print "n/a, fmdump doesn't apply in a zone\n";
            exit 0;
        }
    }

    return $nfaults;
}




