#!/usr/bin/perl
#
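# Nagios check: count 'Corrected memory' lines in the system log
# (default /var/adm/messages) and alert on how many new ones have
# appeared since the previous run.
# for Solaris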
#
# Doke Scott doke@udel.edu 2006.1.30
#
# $Id: check_memory_error_rates,v 1.2 2017/07/10 18:36:32 doke Exp $

# Default thresholds on the number of new errors since the previous run;
# overridable with -w and -c.
my ( $warn_rate, $crit_rate ) = ( 1, 5 );

# Default log to scan (-l) and file holding the remembered count (-f).
my $log_file   = '/var/adm/messages';
my $state_file = '/usr/local/nagios/var/check_memory_error_rates.state';

# Alternate state file, used when the primary one sits on a read-only
# file system.
my $state_file_backup = '/var/cache/nagios/check_memory_error_rates.state';

# Run with a fixed, minimal search path.
$ENV{PATH} = '/usr/bin';

#use strict;
#use warnings;
use Getopt::Long;

# Error counts: the one remembered from the state file and the one found
# in the log on this run.
my ( $recalled_errors, $errors ) = ( 0, 0 );

# True when the comparison against the remembered count must be skipped.
my $suppress_message = 0;

# Command-line flags (-v may be given more than once, -h asks for help).
my ( $verbose, $help ) = ( 0, 0 );

# Messages collected for the final report, by severity.
my ( @crit_errors, @warn_errors );

# Exit status being built up, and the number of new errors since last run.
my ( $rc, $delta );


# Print the usage summary on STDERR and terminate the script.
#
#   $exit_code   exit status to terminate with
#
# Never returns.  The threshold wording says "n or more" because the
# comparisons against -w / -c are inclusive (>=).
sub usage { 
    my( $exit_code ) = @_;
    warn qq{Usage: $0 [-v] [-h] [-w n] [-c n] [-f <file>] [-l <file>]
    -w n      warn if n or more new errors since last test
    -c n      critical if n or more new errors since last test
    -f file   file to store error counts
    -l file   log file to check for errors
    -v        verbose, may be repeated
    -h        show this help
};
    exit $exit_code;
    }

# Parse the command line.  Single-letter options may be bundled (-vv).
# GetOptions returns false when it meets an unknown option or a bad
# value; in that case show the usage text and exit 3 (Nagios UNKNOWN)
# instead of running the check with settings the caller did not intend.
Getopt::Long::Configure( "bundling" );
GetOptions( 
    "w=i" => \$warn_rate,
    "c=i" => \$crit_rate,
    "f=s" => \$state_file,
    "l=s" => \$log_file,
    "v+" => \$verbose,
    "h" => \$help
    ) or usage( 3 );
usage( 0 ) if $help;


# Choose the state file and recall the error count saved by the last run.
#
# The append-mode open is only a writability probe: it does not change an
# existing file's contents, but it does create the file when it is missing.
# If the probe fails because the file system is read-only, fall back to the
# backup location.  %! is used instead of matching the text of $! so the
# test does not depend on the message wording.
if ( open( my $probe_fh, '>>', $state_file ) ) { 
    close $probe_fh;
    }
elsif ( $!{EROFS} ) { 
    $state_file = $state_file_backup; 
    }

if ( ! -f $state_file ) { 
    # no state file, suppress message
    $suppress_message = 1;
    }
elsif ( ! open( my $state_fh, '<', $state_file ) ) { 
    # it's there, but we can't open it.  assume zeros
    push @warn_errors, "can't read-open state file $state_file: $!";
    }
else { 
    my $found_count = 0;
    while ( my $line = <$state_fh> ) { 
        chomp $line;
        next unless ( $line =~ m!^\d+$! );
        $verbose && print "found recalled errors: $line\n"; 
        $recalled_errors = 0 + $line;
        $found_count = 1;
        last;
        }
    close $state_fh;
    # A state file without a count (for instance the empty file the probe
    # above creates on a first run) is no baseline at all: treat it like a
    # missing file rather than reporting the whole log history as new.
    $suppress_message = 1 if ( ! $found_count );
    }
$verbose && print "recalled: $recalled_errors\n"; 
    


# Count the 'Corrected memory' lines currently present in the log file.
# If the log cannot be read there is nothing to evaluate, so report
# UNKNOWN and exit with status 3, the Nagios code for that state.
$rc = 0;
$errors = 0;
my $log_fh;
if ( ! open( $log_fh, '<', $log_file ) ) { 
    print "UNKNOWN: can't open log file '$log_file': $!\n";
    exit 3;
    }
while ( my $entry = <$log_fh> ) { 
    $errors++ if ( $entry =~ m/Corrected memory/ );
    }
close $log_fh;

$verbose && print "current: $errors\n"; 

# Compare the current count with the remembered one and queue a message
# when the number of new errors reaches a threshold (inclusive).
if ( ! $suppress_message ) { 
    # A current count below the remembered one presumably means the log was
    # rotated or truncated since the last run; count from zero in that case
    # so errors logged since then are not hidden behind a negative delta.
    my $baseline = ( $errors < $recalled_errors ) ? 0 : $recalled_errors;
    $delta = $errors - $baseline;
    if ( $delta >= $crit_rate ) { 
        push @crit_errors, "+$delta=$errors";
        $rc = 2;
        }
    elsif ( $delta >= $warn_rate ) { 
        push @warn_errors, "+$delta=$errors";
        $rc = 1 if ( $rc == 0 );
        }
    }

# Save the current count for the next run.  The value is written to a
# ".new" file first and then renamed over the state file.  Any failure on
# the way is reported as a warning.
my $new_state_file = "${state_file}.new";
my $write_problem;

unlink $new_state_file;
if ( open( my $new_fh, '>', $new_state_file ) ) { 
    print {$new_fh} "$errors\n";
    if ( ! close $new_fh ) { 
        $write_problem = "can't write to state file $new_state_file: $!";
        }
    elsif ( ! rename $new_state_file, $state_file ) { 
        $write_problem = "can't rename state file $new_state_file to $state_file: $!";
        }
    }
else { 
    $write_problem = "can't write-open state file $new_state_file: $!";
    }

if ( defined $write_problem ) { 
    push @warn_errors, $write_problem;
    $rc = 1 if ( $rc == 0 );
    }
chmod 0644, $state_file;
    


# Build the one-line report and exit with the matching status:
# 2 if any critical message was queued, else 1 if any warning was queued,
# else 0 with "OK".  Messages of one severity are joined with ", ".
my $exit_status = 0;
my $summary = '';

if ( @crit_errors ) { 
    $summary .= 'CRITICAL: ' . join( ', ', @crit_errors ) . ' ';
    $exit_status = 2;
    }
if ( @warn_errors ) { 
    $summary .= 'Warning: ' . join( ', ', @warn_errors ) . ' ';
    $exit_status = 1 if ( $exit_status == 0 );
    }
$summary .= 'OK' if ( $exit_status == 0 );

print "$summary\n";
exit $exit_status;

