#!/usr/bin/perl
#
# multiple checks of ps output
#
# by Doke Scott, doke@udel.edu, 4 Feb 2004
#
# $Id: check_ps_multi.pl,v 1.31 2017/07/10 19:26:50 doke Exp $


use warnings;
use strict;
use Fcntl;
#use NDBM_File;  # not on all systems, require below
#use Time::HiRes qw( usleep );  # not on many systems
use Getopt::Long;


# Tunables.  -f and -z on the command line replace $config_file and
# $warn_zombies; -u (or use_ucb_ps in the config file) sets $use_ucb_ps.
my $config_file             = "/usr/local/nagios/etc/check_ps_multi.cfg";
my $last_runtimes_db        = "/usr/local/nagios/var/ps_multi";
my $last_runtimes_db_backup = "/var/cache/nagios/ps_multi";
my $warn_zombies            = 50;
my $use_ucb_ps              = 0;  # default to /bin/ps, much faster, but truncs cmd line args
my $verbose                 = 0;
my $help                    = 0;

# What we learn about the host and its process table.
my( $sysname, $sysrel, $zonename, $zone_count, $need_zone_count,
    $zombie_count );

# Parsed config, the ps snapshot, and the (possibly tied) cpu time history.
my( %config, %processes, %last_runtimes );

# Result messages, one list per nagios state.
my( @crits, @warns, @unknowns, @oks );

# Miscellaneous bookkeeping.
my( $has_ndbm, $ntags, $rc, $sep );

# Fixed search path for the external commands run below.
$ENV{PATH} = "/usr/sbin:/sbin:/usr/bin:/bin";

###########################

# usage()
#
# Print the command line synopsis, showing the current values of the
# config file path and the zombie threshold, then exit.
# Any argument passed in is ignored; the exit status is always -1.
sub usage {
    print qq{Usage: $0 [-uvh] [-f <configfile>] 
    -f s   config file [$config_file]
    -u     try to use /usr/ucb/ps
    -z n   warn at n zombies [$warn_zombies]
    -v     verbose
    -h     help
};
    exit -1;
    }

# Parse the command line.  "bundling" lets single letter options be
# combined (-uv).  GetOptions returns false when it sees an unknown or
# malformed option; show the usage and stop rather than running the checks
# with settings the caller did not ask for.
Getopt::Long::Configure( "bundling" );
GetOptions(
    'f=s' => \$config_file,
    'u' => \$use_ucb_ps,
    'z=i' => \$warn_zombies,
    'v+' => \$verbose,
    'h' => \$help,
    ) or usage( 0 );
usage( 0 ) if ( $help );

# Load the tag definitions, then take a snapshot of the process table with
# whichever ps flavor suits this system.
read_config( $config_file );

chomp( $sysname = `uname -s` );
chomp( $sysrel = `uname -r` );

$zombie_count = 0;
$zone_count = 1;

# First executable SysV style ps, /usr/bin/ps preferred over /bin/ps.
my( $ps_bin ) = grep { -x $_ } '/usr/bin/ps', '/bin/ps';
my $ps_args;   # stays undef when no SysV ps run is wanted

if ( $sysname =~ m/SunOS/ ) {
    if ( $use_ucb_ps && -x '/usr/ucb/ps' ) {
        run_ucb_ps();
        }
    elsif ( $sysrel =~ m/^5\.1[0-9]/ ) {
        # solaris 10 and above have zones
        chomp( $zonename = `zonename` );

        if ( $need_zone_count ) {
            # get count of running zones, not including global
            chomp( $zone_count = `zoneadm list -p | grep -v global | grep -c running` );
            $zone_count ||= 1;
            }

        $ps_args = '-efZ';
        }
    else {
        $ps_args = '-ef';
        }
    }
elsif ( $sysname =~ m/Linux/ ) {
    # On linux -Z gives selinux stuff
    $ps_args = '-ef';
    }
else {
    # Without a process list every tag would look like "too few running";
    # say why instead.
    push @unknowns, "unsupported system '$sysname', no ps run";
    }

if ( defined $ps_args ) {
    if ( defined $ps_bin ) {
        run_ps( "$ps_bin $ps_args" );
        }
    else {
        push @unknowns, "no executable ps found";
        }
    }

# The cpu rate checks need a dbm file to remember the previous run; they
# are skipped when NDBM_File can't be loaded.
$has_ndbm = eval "require NDBM_File";
if ( ! $has_ndbm ) {
    print "doesn't have NDBM, skipping rate checking\n" if ( $verbose );
    }
else {
    tie_last_runtimes_db();
    }

check_tags();

untie %last_runtimes if ( $has_ndbm );

# The zombie count is reported either way; only the list it lands in
# depends on the threshold.
print "$zombie_count zombies\n" if ( $verbose );
push @{ $zombie_count >= $warn_zombies ? \@warns : \@oks },
    "$zombie_count zombies";

# Exit status: 2 when anything is critical, else 1 for warnings, else -1
# when there are only unknowns, else 0 (nagios ok).
$rc = scalar( @crits )    ? 2
    : scalar( @warns )    ? 1
    : scalar( @unknowns ) ? -1
    :                       0;

# One section per non-empty state, worst first.  The ok section is only
# shown when everything is ok or when running verbose.
my @report;
push @report, "CRITICAL: " . join( ", ", @crits ) . " "
    if ( scalar( @crits ) );
push @report, "Warning: " . join( ", ", @warns ) . " "
    if ( scalar( @warns ) );
push @report, "Unknown: " . join( ", ", @unknowns ) . " "
    if ( scalar( @unknowns ) );
push @report, "Ok: " . join( ", ", @oks ) . " "
    if ( ( $rc == 0 || $verbose ) && scalar( @oks ) );

print join( '; ', @report ), "\n";
exit $rc;




##################################


# read_config( $config_file )
#
# Parse the config file into the global %config, keyed by tag.  Each entry
# gets: pattern, cntlocrit, cntlowarn, cnthiwarn, cnthicrit, secswarn,
# secscrit and options.  Also sets the globals $need_zone_count (1 when any
# tag has the per-zone option) and $use_ucb_ps (when the file says so).
#
# Recognized lines:
#   # comment
#   use_ucb_ps <digit>
#   <tag> <pattern> <locrit>:<lowarn>:<hiwarn>:<hicrit> <secswarn>:<secscrit> [options]
#   <tag> <pattern> <cntlo>[:<cnthi>] <secshi> <level>      (old format)
# where the old format <level> is 1 (warning) or 2 (critical).
#
# A repeated tag adds a message to @warns, an unparsable line adds one to
# @unknowns; in both cases the line is otherwise ignored.
#
# Prints a message and exits -1 when the file is missing, is not a plain
# file, has permission bits in 0133 set, or can't be opened.
sub read_config {
    my( $config_file ) = @_;
    my( $mode, $fh );

    if ( ! -e $config_file ) {
        print "no config file!\n";
        exit -1;   # unknown return code
        }
    elsif ( ! -f _ ) {
        print "config file is not a plain file: $config_file\n";
        exit -1;   # unknown return code
        }
    # "_" reuses the stat buffer filled by the -e test above
    $mode = ( stat( _ ) )[2];
    # An ownership check is deliberately left off; if it is wanted, compare
    # ( stat( _ ) )[4] against 0 here and exit -1 when it is not root.
    if ( $mode & 0133 ) {
        # any execute bit, or write access for group or other
        print "config file permissions are too open: $config_file\n";
        exit -1;   # unknown return code
        }

    $need_zone_count = 0;

    # three arg open, so the file name is never interpreted as a mode or pipe
    if ( ! open( $fh, '<', $config_file ) ) {
        print "can't read config file $config_file: $!\n";
        exit -1;   # unknown return code
        }
    while ( <$fh> ) {
        if ( m/^\s*#/ || m/^\s*$/ ) {
            next;
            }
        elsif ( m/^ps_command\s*(\S.*)$/ ) {
            # Ignore it for now.  I'm not convinced it's useful enough to
            # outweigh the security risk.  If you turn this on, also turn
            # on checking that the config file is owned by root above.
            }
        elsif ( m/^use_ucb_ps \s+ (\d)/ix ) {
            $use_ucb_ps = $1;   # global
            }
        # new format; the options column is optional, and so is any
        # whitespace (including the newline) after the last column
        elsif ( m/^(\S+) \s+ (\S+) \s+ (\d+):(\d+):(\d+):(\d+) \s+ (\d+):(\d+) (?: \s+ (\S+) )? \s*$/x ) {
            my( $tag, $pat, $cntlocrit, $cntlowarn, $cnthiwarn, $cnthicrit,
                $secswarn, $secscrit, $options )
                = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );
            if ( defined( $config{ $tag } ) ) {
                push @warns, "Warning: $config_file line $. repeats tag $tag";
                next;
                }
            $options ||= '';
            $config{ $tag } = {
                pattern   => $pat,         # perl regexp to search for
                cntlocrit => $cntlocrit,   # low limits for proc count
                cntlowarn => $cntlowarn,
                cnthiwarn => $cnthiwarn,   # upper limits for proc count
                cnthicrit => $cnthicrit,
                secswarn  => $secswarn,    # upper limits of cpu seconds
                secscrit  => $secscrit,
                options   => $options,
                };

            if ( $options =~ m/\bper-zone\b/ ) {
                $need_zone_count = 1;
                }
            }
        # old format
        elsif ( m/^(\S+) \s+ (\S+) \s+ (\d+)(:(\d+))? \s+ (\d+) \s+ (\d+) \s*$/x ) {
            my( $tag, $pat, $cntlo, $cnthi, $secshi, $level )
                = ( $1, $2, $3, $5, $6, $7 );
            if ( defined( $config{ $tag } ) ) {
                push @warns, "Warning: $config_file line $. repeats tag $tag";
                next;
                }
            # Validate the level before storing anything, so a bad line
            # never leaves a half filled entry behind for check_tags().
            if ( $level != 1 && $level != 2 ) {
                push @unknowns, "Warning: can't parse $config_file line $.";
                next;
                }
            if ( ! defined $cnthi ) {
                $cnthi = $cntlo;
                }
            if ( $level == 2 ) {
                # critical: every limit is both the warn and crit limit
                $config{ $tag } = {
                    pattern   => $pat,
                    cntlocrit => $cntlo,
                    cntlowarn => $cntlo,
                    cnthiwarn => $cnthi,
                    cnthicrit => $cnthi,
                    secswarn  => $secshi,
                    secscrit  => $secshi,
                    options   => '',
                    };
                }
            else {
                # warning: the crit limits are set out of reach, so a
                # breach of any limit can only ever raise a warning
                $config{ $tag } = {
                    pattern   => $pat,
                    cntlocrit => 0,
                    cntlowarn => $cntlo,
                    cnthiwarn => $cnthi,
                    cnthicrit => 999999999,
                    secswarn  => $secshi,
                    secscrit  => 999999999,
                    options   => '',
                    };
                }
            }
        else {
            push @unknowns, "Warning: can't parse $config_file line $.";
            next;
            }
        }
    close $fh;
    }
	



# run_ps( $ps_command )
#
# Run a SysV style "ps -ef" (or "ps -efZ" on solaris with zones) and record
# every process in the global %processes as
#     $processes{ command line }{ pid } = cpu time string
# Zombies are counted in the global $zombie_count.  When the global
# $zonename is set and the line has a zone column, processes belonging to
# other zones are skipped.  Lines that can't be parsed are added to
# @unknowns.
#
# Prints a message and exits -1 when ps can't be started.
sub run_ps {
    my( $ps_command ) = @_;
    my( $zonename8, $ps_fh );

    $verbose && print "run_ps( $ps_command )\n";

    # solaris ps truncates the zonename to 8 characters, sigh
    if ( $zonename ) {
        $zonename8 = substr( $zonename, 0, 8 );
        }

    if ( ! open( $ps_fh, '-|', $ps_command ) ) {
        print "UNKNOWN -- unable to run $ps_command: $!";
        exit -1;
        }
    while ( <$ps_fh> ) {
        $verbose && print "<$_";
        chomp;

        # ignore column headers, with or without the leading ZONE column
        next if ( m/^\s* (?:ZONE \s+)? UID \s+ PID \b/x );

        # www-data 33538  8080  0 17:25 ?        00:00:00 /usr/sbin/apache2 -k start
        # linux ps truncates usernames more than 8 characters, with a '+'
        # libstor+   1759      1  0 Aug19 ?        00:00:09 /usr/bin/lsmd -d
        # solaris 10 and 11 have zones
        # sc1dm1     root 36481 38647   0   Jun 14 ?           0:00 /usr/lib/ssh/sshd
        #    www 13618 27961  0                   0:02 <defunct>
        #    kate 14371 14353   0        - ?           0:00 <defunct>
        #  global    root   717   716   0        - ?           0:00 <defunct>
        #    root      4343  4331  0 Jun13 pts/0    00:00:00 [sh] <defunct>
        #  global     root  3131  3028   0        - ?           0:00 <defunct>

        if ( m/^\s* (?:(\w[\w\d-]*) \s+)? (\w[\w\d-]*\+?) \s+ (\d+) \s+ (\d+) \s+ (\d+) \s+ 
                (\w\w\w \s* \d+|\d\d:\d\d(?::\d\d)?|20\d\d|-) \s+ 
                (\?+|pts\/\d+|console|zoneconsole|syscon|term\/\w|tty\d+|ttyS\d+|vt\/\d+) \s+ 
                ((?:\d+-)?\d+:\d\d(?::\d\d)?) \s+ (\S.*) $/ix ) {
            my( $zone, $user, $pid, $ppid, $c, $stime, $tty, $time, $cmdline )
                = ( $1, $2, $3, $4, $5, $6, $7, $8, $9 );
            $zone ||= '';
            $verbose && print "> $zone, $pid, $time, '$cmdline'\n";

            # when we know our zone and the line names one, only count ours
            next if ( $zonename && $zone && $zonename8 ne $zone );

            # a zombie shows up as "<defunct>" (solaris) or
            # "[name] <defunct>" (linux); the whole command line must be
            # just that, so "<defunct>" as an argument doesn't count
            if ( $cmdline =~ m/^ (?: \[ .* \] \s* )? <defunct> \s* $/ix ) {
                $zombie_count++;
                }
            $processes{ $cmdline }{ $pid } = $time;
            }
        else {
            $verbose && print "can't parse '$_'\n";
            push @unknowns, "can't parse: $_";
            }
        }
    close $ps_fh;
    }




# run_ucb_ps()
#
# Same job as run_ps(), but reads the BSD style "/usr/ucb/ps -axww":
# fills the global %processes as
#     $processes{ command line }{ pid } = cpu time string
# and counts processes in state Z in the global $zombie_count.  A zombie
# line without a tty column is counted but not recorded.  Lines matching
# neither layout are added to @unknowns.
#
# Prints a message and exits -1 when ps can't be started.
sub run_ucb_ps {
    my $ps_out;

    unless ( open( $ps_out, "/usr/ucb/ps -axww |" ) ) {
        print "UNKNOWN -- unable to run /usr/ucb/ps: $!";
        exit -1;
        }

    while ( my $line = <$ps_out> ) {
        print $line if ( $verbose );
        chomp $line;

        # column headers
        next if ( $line =~ m/^\s* PID \s+ TT \s+/ix );

        # pid, tty, one letter state, cpu time, command line
        if ( $line =~ m/^\s* (\d+) \s+ 
                (\?+|pts\/\d+|console|zoneconsole|syscon|term\/\w) \s+
                (\S) \s+ (\d+:\d\d) \s+ (\S.*)$/ix ) {
            my( $proc_id, $state, $cpu_time, $command ) = ( $1, $3, $4, $5 );
            print "cmdline = '$command'\n" if ( $verbose );
            $processes{ $command }{ $proc_id } = $cpu_time;
            $zombie_count++ if ( $state eq 'Z' );
            next;
            }

        # zombie without a tty column
        if ( $line =~ m/^\s* (\d+) \s+ Z \s+ (\d+:\d\d) \s+ (\S.*)$/ix ) {
            $zombie_count++;
            next;
            }

        print "can't parse '$line'\n" if ( $verbose );
        push @unknowns, "can't parse: $line";
        }
    close $ps_out;
    }






# tie_last_runtimes_db()
#
# Tie the global %last_runtimes to the NDBM database named by the global
# $last_runtimes_db, trying up to 20 times with a short random sleep
# between attempts.
#
#   - When the tie fails with "no such file" and there is no .dir file, the
#     parent directory and empty .dir/.pag files are created before the
#     next attempt.
#   - When it fails with "read-only file system", $last_runtimes_db is
#     switched to $last_runtimes_db_backup and the tie is retried at once.
#
# Returns 0 on success.  On failure warns and returns -1.
sub tie_last_runtimes_db {
    my( $tied, $last_err );

    $verbose && print "tie_last_runtimes_db()<br>\n";

    # try to tie the NDBM database, retry a few times with a small sleep
    $tied = 0;
    $last_err = '';
    for ( 1 .. 20 ) {
        $verbose && print "tring to tie $last_runtimes_db\n";
        # Use the Fcntl constants: the numeric value of O_CREAT is not the
        # same on every system.
        # 0644 = permissions on new dbm files, if created
        if ( tie( %last_runtimes, 'NDBM_File', $last_runtimes_db,
                O_RDWR() | O_CREAT(), 0644 ) ) {
            $tied = 1;
            last;
            }

        # Take a snapshot of the error right away; the file tests and opens
        # below overwrite $!.  %! tests the errno itself, so this does not
        # depend on the language of the error message.
        my $no_such_file = $!{ENOENT};
        my $read_only_fs = $!{EROFS};
        $last_err = "$!";

        if ( $no_such_file && ! -f "$last_runtimes_db.dir" ) {
            # make sure the directory holding the db exists
            my $dir = $last_runtimes_db;
            $dir =~ s!/[^/]*$!!;    # strip the last path component
            if ( $dir ne '' && ! -d $dir ) {
                mkdir $dir;
                chmod 0775, $dir;
                }
            # create empty db files for the next attempt to open
            foreach my $ext ( 'dir', 'pag' ) {
                if ( open( my $fh, '>', "$last_runtimes_db.$ext" ) ) {
                    close $fh;
                    }
                }
            }
        elsif ( $read_only_fs ) {
            $last_runtimes_db = $last_runtimes_db_backup;
            next;
            }
        elsif ( $verbose ) {
            print "can't tie $last_runtimes_db: $last_err\n";
            }
        # some systems don't have Time::HiRes, so sleep 10 to 110 ms with
        # a four argument select
        select( undef, undef, undef, ( 10000 + int( rand 100000 ) ) / 1000000 );
        }
    if ( ! $tied ) {
        warn "internal plugin error, can't tie dbm: $last_err\n";
        return -1;
        }

    return 0;
    }







# _cpu_time_to_secs( $runtime )
#
# Convert a ps cpu time field (dd-hh:mm:ss, hh:mm:ss or mm:ss) to seconds.
# Returns undef when the string has none of those shapes.
sub _cpu_time_to_secs {
    my( $runtime ) = @_;

    # splunk   10127     1 27 Jan19 ?        40-10:37:50 splunkd -h 10.10.24.220 -p 8089 start
    if ( $runtime =~ m/^(\d+)-(\d\d):(\d\d):(\d\d)$/ ) {
        return $1 * 86400 + $2 * 3600 + $3 * 60 + $4;
        }
    if ( $runtime =~ m/^(\d+):(\d\d):(\d\d)$/ ) {
        return $1 * 3600 + $2 * 60 + $3;
        }
    if ( $runtime =~ m/^(\d+):(\d\d)$/ ) {
        return $1 * 60 + $2;
        }
    return undef;
    }


# check_tags()
#
# Evaluate every tag in the global %config against the global %processes:
#
#   - counts the processes whose command line matches the tag's pattern
#     and compares the count to the tag's four count limits (multiplied by
#     $zone_count first for per-zone tags when $zone_count > 1)
#   - when $has_ndbm is set, compares each matching process's cpu time to
#     the value stored in %last_runtimes on a previous run, less than an
#     hour ago, and checks the resulting cpu seconds per hour against
#     secswarn / secscrit
#
# Findings go to @crits, @warns and @unknowns; @oks gets a single summary
# entry with the number of tags whose count was within limits.  Finally,
# %last_runtimes entries for processes that no longer exist are deleted.
sub check_tags {
    my( $now, $nok );

    $now = time();
    $nok = 0;

    foreach my $tag ( sort keys %config ) {
        my $pattern = $config{ $tag }{ 'pattern' };
        $verbose >= 2 && print "tag $tag, pattern $pattern\n";

        # Compile the pattern once per tag.  It comes from the config file,
        # so report a broken one instead of letting it kill the plugin.
        my $pattern_re = eval { qr/$pattern/ };
        if ( ! defined $pattern_re ) {
            push @unknowns, "bad pattern for tag $tag";
            next;
            }

        my $count = 0;
        foreach my $cmdline ( keys %processes ) {
            $verbose >= 2 && print "tag $tag, pattern $pattern, cmdline $cmdline\n";
            next if ( $cmdline !~ $pattern_re );
            $verbose && print "tag $tag, cmdline '$cmdline'\n";

            foreach my $pid ( keys %{ $processes{ $cmdline } } ) {
                $count++;
                my $runtime = $processes{ $cmdline }{ $pid };
                $verbose && print "tag $tag, cmdline '$cmdline', pid $pid, runtime $runtime\n";

                my $runsecs = _cpu_time_to_secs( $runtime );
                if ( ! defined $runsecs ) {
                    warn "can't parse seconds from $runtime\n";
                    next;
                    }
                $verbose && print STDERR "$cmdline, $pid, $runtime -> $runsecs\n";

                next if ( ! $has_ndbm );

                my $key = "$cmdline,$pid";
                my $last_data = $last_runtimes{ $key };
                if ( $last_data ) {
                    my( $last_runsecs, $last_epoch_time ) = split( m/,/, $last_data );
                    if ( defined $last_epoch_time && $runsecs > $last_runsecs ) {
                        my $delta_runsecs = $runsecs - $last_runsecs;
                        # elapsed wall clock time since the stored sample
                        my $delta_epoch_time = $now - $last_epoch_time;
                        if ( $delta_epoch_time > 0 && $delta_epoch_time < 3600 ) {
                            # scale to cpu seconds per hour
                            my $rate = $delta_runsecs * 3600 / $delta_epoch_time;
                            $verbose && print STDERR "$cmdline, $pid, $last_runsecs, $runsecs, $last_epoch_time, $now, $rate\n";
                            if ( $rate > $config{ $tag }{ 'secscrit' } ) {
                                push @crits, sprintf( "%s is using %d s of cpu time / hour, gt %d",
                                    $tag, $rate, $config{ $tag }{ 'secscrit' } );
                                }
                            elsif ( $rate > $config{ $tag }{ 'secswarn' } ) {
                                push @warns, sprintf( "%s is using %d s of cpu time / hour, gt %d",
                                    $tag, $rate, $config{ $tag }{ 'secswarn' } );
                                }
                            }
                        }
                    }
                $last_runtimes{ $key } = "$runsecs,$now";
                }
            }

        $verbose && print "$tag count $count\n";
        if ( $zone_count > 1 && $config{ $tag }{ 'options' } =~ m/\bper-zone\b/ ) {
            $config{ $tag }{ 'cntlocrit' } *= $zone_count;
            $config{ $tag }{ 'cntlowarn' } *= $zone_count;
            $config{ $tag }{ 'cnthiwarn' } *= $zone_count;
            $config{ $tag }{ 'cnthicrit' } *= $zone_count;
            }

        if ( $count < $config{ $tag }{ 'cntlocrit' } ) {
            push @crits, sprintf( "%s too few running, %d lt %d",
                $tag, $count, $config{ $tag }{ 'cntlocrit' } );
            }
        elsif ( $count > $config{ $tag }{ 'cnthicrit' } ) {
            push @crits, sprintf( "%s too many running, %d gt %d",
                $tag, $count, $config{ $tag }{ 'cnthicrit' } );
            }
        elsif ( $count < $config{ $tag }{ 'cntlowarn' } ) {
            push @warns, sprintf( "%s too few running, %d lt %d",
                $tag, $count, $config{ $tag }{ 'cntlowarn' } );
            }
        elsif ( $count > $config{ $tag }{ 'cnthiwarn' } ) {
            push @warns, sprintf( "%s too many running, %d gt %d",
                $tag, $count, $config{ $tag }{ 'cnthiwarn' } );
            }
        else {
            $verbose && printf( "ok %s %d running\n", $tag, $count );
            $nok++;
            }
        }

    push @oks, "$nok items running";

    # clean the database: drop samples for processes that are gone
    if ( $has_ndbm ) {
        foreach my $key ( keys %last_runtimes ) {
            if ( $key =~ m/(.*),(\d+)$/ ) {
                my( $cmdline, $pid ) = ( $1, $2 );
                # test the outer key first so the lookup doesn't create
                # empty entries in %processes
                if ( ! exists $processes{ $cmdline }
                        || ! defined $processes{ $cmdline }{ $pid } ) {
                    delete $last_runtimes{ $key };
                    }
                }
            }
        }
    }




    
    


