#!/usr/bin/perl
#
# Check sun 'hdadm smart' or 'hd -r' status
# This is primarily designed for Sun X4500 thumpers.
#
# $Header: /opt/home/doke/work/nagios/RCS/check_hdadm,v 1.3 2010/07/14 22:04:49 doke Exp $
#
# Nagios plugin: prints a one-line summary and exits with
#   0 = OK, 1 = WARNING, 2 = CRITICAL, 3 = UNKNOWN.

use strict;
use warnings;
use Getopt::Long;
#use Data::Dumper;

# from http://en.wikipedia.org/wiki/S.M.A.R.T.
#   05 05 Reallocated Sectors Count
#       Count of reallocated sectors. When the hard drive finds a
#       read/write/verification error, it marks this sector as "reallocated" and
#       transfers data to a special reserved area (spare area). This process is
#       also known as remapping, and "reallocated" sectors are called remaps.
#       This is why, on modern hard disks, "bad blocks" cannot be found while
#       testing the surface: all bad blocks are hidden in reallocated sectors.
#       However, as the number of reallocated sectors increases, the read/write
#       speed tends to decrease. The raw value normally represents a count of
#       the number of bad sectors that have been found and remapped. Thus, the
#       higher the attribute value, the more sectors the drive has had to
#       reallocate.
#   194 C2 Temperature
#       Current internal temperature.
#   196 C4 Reallocation Event Count
#       Count of remap operations. The raw value of
#       this attribute shows the total number of attempts to transfer
#       data from reallocated sectors to a spare area. Both successful &
#       unsuccessful attempts are counted.
#   197 C5 Current Pending Sector Count
#       Number of "unstable" sectors (waiting to be remapped, because of
#       read errors). If an unstable sector is subsequently written or
#       read successfully, this value is decreased and the sector is not
#       remapped. Read errors on a sector will not remap the sector (since
#       it might be readable later); instead, the drive firmware remembers
#       that the sector needs to be remapped, and remaps it the next time
#       it's written.
#   198 C6 Uncorrectable Sector Count
#       The total number of uncorrectable errors when reading/writing a sector.
#       A rise in the value of this attribute indicates defects of the disk
#       surface and/or problems in the mechanical subsystem. (or Off-Line Scan
#       Uncorrectable Sector Count: Fujitsu)

# I'm making these limits up
#
my $crit_reallocated   = 100;   # critical reallocated sector count
my $warn_reallocated   = 20;    # warning reallocated sector count
my $crit_event         = 100;   # critical sector reallocation events
my $warn_event         = 20;    # warning sector reallocation events
my $crit_pending       = 10;    # critical pending sector reallocations
my $warn_pending       = 5;     # warning pending sector reallocations
my $crit_uncorrectable = 100;   # critical uncorrectable sector count
my $warn_uncorrectable = 20;    # warning uncorrectable sector count
my $crit_temp          = 39;    # critical temperature
my $warn_temp          = 34;    # warning temperature
#
#
# 39C is from Deskstar 7K500 and Deskstar E7K500 Hard Disk Drive Specification, page 51.
# http://www.hitachigst.com/tech/techlib.nsf/techdocs/CE3F5756C827F35A86256F4F006B8AD4/$file/7K500v1.5.pdf
#
# It actually says:
#   Temperature                     5C to 55oC (See note below)
#   Relative humidity               8 to 90%, non-condensing
#   Maximum wet bulb temperature    29.4oC, non-condensing
#
# The wet bulb temperature is the max operating temperature at max operating humidity, 90%.
# When you reduce the humidity, you can have a higher temperature, up to 55C at 10%.
# There's a graph that shows at 50% humidity, the limit should be about 39C.
#

my $expected_num_disks = 48;    # 48 drives fit in an X4500.

$ENV{PATH} = "/opt/sfw/bin:/usr/sbin:/sbin:$ENV{PATH}";

# Result accumulators filled in by the check_* subs and reported by the main
# section below.  Nothing in this file currently adds to @ignores.
our( $verbose, $help, @crits, @warns, @unknowns, @oks, @ignores );

$verbose = 0;
$help    = 0;

# usage( $rc )
# Print the option summary, showing the current limit values as defaults,
# then exit the process with status $rc.
sub usage {
    my( $rc ) = @_;
    print <<"EOT";
Usage: $0 [options]
    --cr n    critical reallocated sector count [$crit_reallocated]
    --wr n    warning reallocated sector count [$warn_reallocated]
    --ce n    critical sector reallocation events [$crit_event]
    --we n    warning sector reallocation events [$warn_event]
    --cp n    critical pending sector reallocations [$crit_pending]
    --wp n    warning pending sector reallocations [$warn_pending]
    --cu n    critical uncorrectable sector count [$crit_uncorrectable]
    --wu n    warning uncorrectable sector count [$warn_uncorrectable]
    --ct n    critical temperature in C [$crit_temp]
    --wt n    warning temperature in C [$warn_temp]
    -n n      expected number of disks [$expected_num_disks]
    -v        verbose
    -h        help
EOT
    exit $rc;
}

Getopt::Long::Configure( "bundling" );
GetOptions(
    'cr=i' => \$crit_reallocated,
    'wr=i' => \$warn_reallocated,
    'ce=i' => \$crit_event,
    'we=i' => \$warn_event,
    'cp=i' => \$crit_pending,
    'wp=i' => \$warn_pending,
    'cu=i' => \$crit_uncorrectable,
    'wu=i' => \$warn_uncorrectable,
    'ct=i' => \$crit_temp,
    'wt=i' => \$warn_temp,
    'n=i'  => \$expected_num_disks,
    'v+'   => \$verbose,
    'h'    => \$help,
) or usage( 3 );    # unparsable options: report UNKNOWN rather than run with defaults
usage( 0 ) if ( $help );

# Just in case of problems, let's not hang
$SIG{'ALRM'} = sub {
    print( "ERROR: timed out\n" );
    exit 2;
};
alarm( 300 );

#check_hdadm();
check_hd();

# Build the single-line report.  The most severe non-empty category decides
# the exit status; less severe categories are still appended to the line.
my $rc  = 0;
my $sep = '';
if ( @crits ) {
    $rc = 2;
    printf "%d CRITICAL errors: %s", scalar( @crits ), join( ", ", @crits );
    $sep = '; ';
}
if ( @warns ) {
    $rc = 1 if ( $rc == 0 );
    printf "%s%d Warnings: %s", $sep, scalar( @warns ), join( ", ", @warns );
    $sep = '; ';
}
if ( @unknowns ) {
    # 3 is the plugin status for UNKNOWN; a negative value would be
    # truncated to 255 by exit().
    $rc = 3 if ( $rc == 0 );
    printf "%s%d Unknowns: %s", $sep, scalar( @unknowns ), join( ", ", @unknowns );
    $sep = '; ';
}
if ( $rc == 0 && @oks ) {
    printf "Ok: %s", join( ", ", @oks );
    $sep = '; ';
}
if ( @ignores ) {
    printf "%sIgnoring: %s", $sep, join( ", ", @ignores );
}
print "\n";
exit $rc;


##################

#clio:~4# /usr/bin/hdadm smart
#Reallocated sector count (id 5)
#-----------SunFireX4500---Rear-------------
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#---*-----*-SunFireX4500-*-Front*------*----
#
#Reallocation event count (id 196)
#-----------SunFireX4500---Rear-------------
#1 0 0 0 8 1465 3 0 108 0 0 0
#0 5 0 0 0 0 0 0 1465 0 0 0
#0 0 0 0 0 0 0 0 0 0 1459 0
#0 0 3 0 0 0 0 0 0 0 0 0
#---*-----*-SunFireX4500-*-Front*------*----
#
#Current pending sector count (id 197)
#-----------SunFireX4500---Rear-------------
#1 0 3 0 8 0 7 0 8 0 0 0
#0 5 0 2 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 2 0 0 0 0 0 0 0 0 0
#---*-----*-SunFireX4500-*-Front*------*----
#
#Scan uncorrected sector count (id 198)
#-----------SunFireX4500---Rear-------------
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#---*-----*-SunFireX4500-*-Front*------*----
#
# grr, turns out it just drops numbers out of the grid when drives are missing
#clio:/opt/home/doke/tmp23# hdadm smart
#Reallocated sector count (id 5)
#-----------SunFireX4500---Rear-------------
#0 0 0 0 1 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#0 0 0 0 0 0 0 0 0 0 0 0
#---*-----*-SunFireX4500-*-Front*------*----
#
# now what?

# check_hdadm()
# Run 'hdadm smart' through sudo, parse its 4x12 grids and push a message on
# @warns / @crits for each disk whose reallocation-event or pending-sector
# count exceeds the configured limits.  The reallocated and uncorrected grids
# are parsed but not compared against any limit.
# Currently not called (see the commented-out call above): as the sample
# output shows, a row that lost a number because of a missing drive no longer
# matches the 12-number row pattern, so disk positions cannot be trusted.
# Takes no arguments and returns nothing.
sub check_hdadm {
    if ( ! -e "/usr/bin/hdadm" ) {
        push @crits, "/usr/bin/hdadm not installed";
        return;
    }

    # The redirections need a shell, so the command stays a single string.
    my $cmd = "sudo -S /usr/bin/hdadm smart < /dev/null 2>&1";
    $verbose && print "+ $cmd\n";
    my $ph;
    if ( ! open( $ph, '-|', $cmd ) ) {
        push @unknowns, "can't run $cmd: $!";
        return;
    }

    my $map = '';    # which grid we are inside, '' when between grids
    my $row;         # grid row index, counts down 3..0 from the Rear header
    my %data;        # $data{ $map }[ $row * 12 + $column ] = raw count
    while ( my $line = <$ph> ) {
        chomp $line;
        $verbose && print "> $line\n";
        if ( $line =~ m/^\s*$/ ) {
            # ignore it
        }
        elsif ( $line =~ m/Reallocated sector count/i ) {
            $map = 'sector';
        }
        elsif ( $line =~ m/Reallocation event count/i ) {
            $map = 'event';
        }
        elsif ( $line =~ m/Current pending sector count/i ) {
            $map = 'pending';
        }
        elsif ( $line =~ m/Scan uncorrected sector count/i ) {
            $map = 'uncorrected';
        }
        elsif ( $line =~ m/-+SunFireX4500-+Rear-/i ) {
            # start result rows
            $row = 3;
        }
        elsif ( $line =~ m/^\s*(\d+\s+){11}(\d+)\s*$/i ) {
            # Skip rows seen outside a grid or beyond its four rows.
            next if ( $map eq '' || ! defined $row || $row < 0 );
            # split ' ' drops leading whitespace, so an indented row does
            # not produce an empty first field.
            my @vals = split( ' ', $line );
            foreach my $i ( 0 .. 11 ) {
                $data{ $map }[ $row * 12 + $i ] = $vals[ $i ];
            }
            $row--;
        }
        elsif ( $line =~ m/-[\*-]+SunFireX4500[\*-]+Front[\*-]+/i ) {
            # done rows
            $map = '';
        }
        elsif ( $line =~ m/is not in the sudoers file|Password:/i ) {
            push @crits, "this user is not permited to run '/usr/bin/hdadm smart' in the sudoers file";
            close $ph;
            return;
        }
    }
    close $ph;

    # [ grid name, warning limit, critical limit, message format ]
    my @checks = (
        [ 'event',   $warn_event,   $crit_event,   "disk %d has %d reallocation events" ],
        [ 'pending', $warn_pending, $crit_pending, "disk %d has %d pending remaps" ],
    );
    foreach my $check ( @checks ) {
        my( $key, $warn, $crit, $fmt ) = @{ $check };
        foreach my $i ( 0 .. 47 ) {
            my $count = $data{ $key }[ $i ];
            next if ( ! defined $count );    # grid or cell missing from the output
            next if ( $count <= $warn );
            my $msg = sprintf( $fmt, $i, $count );
            if ( $count > $crit ) {
                push @crits, $msg;
            }
            else {
                push @warns, $msg;
            }
        }
    }
    return;
}



# clio# /usr/bin/hd -r
#0 c3t0
#======
#Revision: 16
#Offline status 132
#Selftest status 0
#Seconds to collect 10419
#Time in minutes to run short selftest 1
#Time in minutes to run extended selftest 174
#Offline capability 91
#SMART capability 3
#Error logging capability 1
#Checksum 0xe7
#Identification                    Status  Current  Worst  Raw data
#  1 Raw read error rate             0xb     100     100   0
#  2 Throughput performance          0x5     159     159   205
#  3 Spin up time                    0x7     107     107   55877501580
#  4 Start/Stop count                0x12    100     100   151
#  5 Reallocated sector count        0x33    100     100   0
#  7 Seek error rate                 0xb     100     100   0
#  8 Seek time performance           0x5     136     136   31
#  9 Power on hours count            0x12    100     100   3733
# 10 Spin retry count                0x13    100     100   0
# 12 Device power cycle count        0x32    100     100   151
#192 Power off retract count         0x32    100     100   295
#193 Load cycle count                0x12    100     100   295
#194 Temperature                     0x2     229     229   24/ 13/ 37 (degrees C cur/min/max)
#196 Reallocation event count        0x32    100     100   0
#197 Current pending sector count    0x22    100     100   0
#198 Scan uncorrected sector count   0x8     100     100   0
#199 Ultra DMA CRC error count       0xa     200     253   0
#200 Write/Multi-Zone Error Rate     0x8     200     200   0

# check_hd()
# Run 'hd -r' through sudo and parse the per-disk SMART attribute tables.
# For attributes 5, 196, 197 and 198 the raw value is compared against the
# matching warning/critical limits; for attribute 194 the current temperature
# is compared against the temperature limits.  Every disk number from 0 to
# $expected_num_disks - 1 that has no header line in the output is reported
# as critical.  Results go to @crits / @warns / @unknowns / @oks.
# Takes no arguments and returns nothing.
sub check_hd {
    if ( ! -e "/usr/bin/hd" ) {
        push @crits, "/usr/bin/hd not installed";
        return;
    }

    # The redirections need a shell, so the command stays a single string.
    my $cmd = "sudo -S /usr/bin/hd -r < /dev/null 2>&1";
    $verbose && print "+ $cmd\n";
    my $ph;
    if ( ! open( $ph, '-|', $cmd ) ) {
        push @unknowns, "can't run $cmd: $!";
        return;
    }

    # SMART attribute id => [ warning limit, critical limit, message format ]
    my %limits_for = (
        5   => [ $warn_reallocated, $crit_reallocated,
                 "disk %d %s has %d reallocated sectors" ],
        196 => [ $warn_event, $crit_event,
                 "disk %d %s has had %d reallocation events" ],
        197 => [ $warn_pending, $crit_pending,
                 "disk %d %s has %d pending sector reallocations" ],
        198 => [ $warn_uncorrectable, $crit_uncorrectable,
                 "disk %d %s has had %d uncorrectable sector errors" ],
    );

    my $num_disks = 0;
    my $disknum;        # number of the disk whose table is being read
    my $diskct;         # its controller/target name, e.g. c3t0
    my @disks_seen;     # $disks_seen[ $disknum ] is true once its header was read
    while ( my $line = <$ph> ) {
        chomp $line;
        $verbose && print "> $line\n";

        if ( $line =~ m/^ \s* (\d+) \s+ (c\d+t\d+)/ix ) {
            # disk header, e.g. "0 c3t0"
            $disknum = $1;
            $diskct  = $2;
            $num_disks++;
            $disks_seen[ $disknum ] = 1;
        }
        elsif ( $line =~ m/^ \s* (\d+) \s+ (\S.*) \s+ (0x[\da-f]+) \s+
                (\d+) \s+ (\d+) \s+ (\d+) \s* $/ix ) {
            # attribute row: id, name, status, current, worst, raw
            my( $id, $raw ) = ( $1, $6 );
            # An attribute row before any disk header cannot be attributed.
            next if ( ! defined $disknum );
            my $limit = $limits_for{ $id + 0 } or next;
            my( $warn, $crit, $fmt ) = @{ $limit };
            my $msg = sprintf( $fmt, $disknum, $diskct, $raw );
            if ( $raw > $crit ) {
                push @crits, $msg;
            }
            elsif ( $raw > $warn ) {
                push @warns, $msg;
            }
        }
        elsif ( $line =~ m{^ \s* 194 \s+ \S.* \s+ 0x[\da-f]+ \s+ \d+ \s+ \d+ \s+
                (\d+) / \s* \d+ / \s* \d+ }ix ) {
            # temperature row; the raw column is "cur/ min/ max", which is
            # why the generic attribute pattern above does not match it
            my $curtemp = $1;
            next if ( ! defined $disknum );
            if ( $curtemp > $warn_temp ) {
                my $msg = sprintf( "disk %d %s at %dC", $disknum, $diskct, $curtemp );
                if ( $curtemp > $crit_temp ) {
                    push @crits, $msg;
                }
                else {
                    push @warns, $msg;
                }
            }
        }
        elsif ( $line =~ m/is not in the sudoers file|Password:/i ) {
            # getpwuid returns the whole passwd record in list context;
            # only the login name is wanted here.
            my $user = scalar getpwuid( $> );
            $user = "uid $>" if ( ! defined $user );
            push @crits, sprintf( "%s is not permited to sudo '/usr/bin/hd -r' without password", $user );
            close $ph;
            return;
        }
    }
    close $ph;

    foreach my $expected ( 0 .. ( $expected_num_disks - 1 ) ) {
        if ( ! $disks_seen[ $expected ] ) {
            push @crits, "disk $expected missing";
        }
    }

    push @oks, "$num_disks disks";
    return;
}