#!/usr/bin/perl
#
# run 'iostat -e -n' and check for excessive hard drive errors 
# for Solaris 8 - 11
#
# Doke Scott doke@udel.edu 9.Feb.2004
# added performance reporting 2016.8.5 Doke
#
# $Header: /home/doke/work/nagios/RCS/check_disk_error_rates,v 1.4 2016/08/05 19:07:56 doke Exp $
#

use strict;
use warnings;
use Getopt::Long;

# Package globals shared by the main program and the subs in this file.
# Declared with 'our' (the 'use vars' pragma is obsolete).

# Thresholds: increase in a device's total error count since the previous
# run at or above which the device is reported as warning / critical.
# Overridden by -w and -c.
our $warn_rate = 5;
our $crit_rate = 10;

# File holding the raw iostat output of the previous run (overridden by -f).
our $state_file = "/usr/local/nagios/var/check_disk_error_rates.state";

# Fixed search path for this process and anything it runs.
$ENV{PATH} = "/usr/bin";

# Command line flags: -v may be repeated, -h prints usage.
our $verbose = 0;
our $help = 0;

# Result lists filled in by check(), and the bookkeeping used while
# printing the final status line.
our( @crits, @warns, @oks, @perf );
our( $rc, $sep );

# Print the one-line command synopsis on stdout and terminate the process.
#
# Argument: the exit status to terminate with.
# Does not return.
sub usage {
    my $exit_status = shift;
    print "Usage: $0 [-v] [-w <warn rate>] [-c <critical rate>] [-f <state file>]\n";
    exit $exit_status;
    }

# Parse the command line.  "bundling" lets single-letter flags be combined
# (e.g. -vv).  GetOptions returns false on an unknown option or a value of
# the wrong type; treat that as a usage error (exit status 3) instead of
# silently running with the defaults.
Getopt::Long::Configure( "bundling" );
GetOptions(
    'w=i' => \$warn_rate,
    'c=i' => \$crit_rate,
    'f=s' => \$state_file,
    'v+' => \$verbose,
    'h' => \$help,
    ) or usage( 3 );
usage( 0 ) if ( $help );

# The thresholds only make sense when warning <= critical.
if ( $warn_rate > $crit_rate ) {
    print "warn rate is higher than critical rate\n";
    usage( 3 );
    }

# Run the check, which fills @crits, @warns, @oks and @perf.
check();

# Build the status line from up to three sections, most severe first.
# The exit status is 2 if anything is critical, else 1 if anything is a
# warning, else 0.
my @sections;
$rc = 0;
if ( @crits ) {
    push @sections, "CRITICAL: " . join( ", ", @crits );
    $rc = 2;
    }
if ( @warns ) {
    push @sections, "Warning: " . join( ", ", @warns );
    $rc = 1 if ( $rc == 0 );
    }
# The OK devices are listed only when nothing is wrong, or when asked for.
if ( $rc == 0 || $verbose ) {
    push @sections, "OK: " . join( ", ", @oks );
    }
print join( '; ', @sections );

# Performance data, if any, follows the status text after a ' | '.
if ( @perf ) {
    print ' | ', join( " ", @perf );
    }
print "\n";
exit $rc;


############################


# Compare the per-device error totals reported by 'iostat -e -n' with the
# totals saved by the previous run, and classify every device.
#
# Reads the globals $state_file, $warn_rate, $crit_rate and $verbose.
# Appends to the globals @crits, @warns, @oks and @perf.
# Side effect: replaces $state_file with the raw iostat output of this run
# (written to "$state_file.new" first, then renamed over the old file).
# Exits with status 3 (Nagios UNKNOWN) if iostat cannot be started.
# Returns nothing.
sub check {
    my( $sw, $hw, $trn, $tot, $device, $delta, %saved_tot );
    my $to_save = '';

    if ( ! -x "/bin/sun" ) {
        # not solaris
        # this plugin only works on solaris 8 through 11
        push @oks, "not applicable, not a solaris system";
        return;
        }

    # Load the totals saved by the previous run.  A missing or unreadable
    # state file just means no device has a saved total.
    if ( -f $state_file ) {
        my $state_fh;
        # three-argument open: the -f argument is only ever a file name
        if ( open( $state_fh, '<', $state_file ) ) {
            while ( my $line = <$state_fh> ) {
                chomp $line;
                # software errors, hardware errors, transport errors, total errors, device
                ( $sw, $hw, $trn, $tot, $device ) = split ' ', $line;
                next if ( ! defined $device );
                next if ( $device =~ m!^rmt/\d+$! );
                next if ( $tot !~ m!^\d+$! );   # also skips the header lines
                print "saved: $sw, $hw, $trn, $tot, $device\n" if ( $verbose );
                $saved_tot{ $device } = $tot;
                }
            close $state_fh;
            }
        else {
            # no trailing newline: the message becomes part of the single
            # status line printed by the caller
            push @warns, "can't read-open state file $state_file: $!";
            }
        }

    my $iostat_fh;
    if ( ! open( $iostat_fh, '-|', '/usr/bin/iostat -e -n' ) ) {
        print "can't run iostat: $!\n";
        exit 3;  # nagios UNKNOWN
        }
    while ( my $line = <$iostat_fh> ) {
        # keep the raw output, it becomes the next state file
        $to_save .= $line;
        chomp $line;
        # software errors, hardware errors, transport errors, total errors, device
        ( $sw, $hw, $trn, $tot, $device ) = split ' ', $line;
        next if ( ! defined $device );
        next if ( $device =~ m!:vold\(pid\d+\)! );
        next if ( $device =~ m!^rmt/\d+$! );
        next if ( $tot !~ m!^\d+$! );   # also skips the header lines
        print "current: $sw, $hw, $trn, $tot, $device\n" if ( $verbose );

        # A device without a saved total is compared against zero, but is
        # never alerted on: its whole history would look like new errors.
        my $known = exists $saved_tot{ $device };
        $delta = $tot - ( $known ? $saved_tot{ $device } : 0 );
        push @perf, "delta_$device=$delta total_$device=$tot";

        # NOTE: a total lower than the saved one gives a negative delta,
        # which is below any non-negative threshold and is reported as OK.
        if ( $known && $delta >= $crit_rate ) {
            push @crits, "$device +$delta=$tot";
            }
        elsif ( $known && $delta >= $warn_rate ) {
            push @warns, "$device +$delta=$tot";
            }
        else {
            push @oks, "$device +$delta=$tot";
            }
        }
    # close on a pipe also reports the exit status of the command
    if ( ! close $iostat_fh ) {
        push @warns, $!
            ? "error closing iostat pipe: $!"
            : "iostat exited with status " . ( $? >> 8 );
        }

    # Write the new state next to the old one and rename it into place, so
    # the existing state file is only replaced by a completely written one.
    my $new_file = "${state_file}.new";
    my $new_fh;
    unlink $new_file;
    if ( ! open( $new_fh, '>', $new_file ) ) {
        push @warns, "can't write-open state file $new_file: $!";
        }
    else {
        print {$new_fh} $to_save;
        # buffered write errors only show up at close
        if ( ! close $new_fh ) {
            push @warns, "can't write to state file $new_file: $!";
            }
        elsif ( ! rename $new_file, $state_file ) {
            push @warns, "can't rename state file $new_file to ${state_file}: $!";
            }
        }
    chmod 0644, $state_file;
    return;
    }


