Define Service[edit]
This adds a new service and service group (the actual check definition). Add it to the local services folder on the Nagios server:
'/opt/nagios/etc/local/services'
#cat solaris-cam.cfg define service { service_description solaris_cam display_name solaris common array manager # the CAM software runs on host hundsglump.muc check_command check_nrpe!check_sstcam is_volatile 1 max_check_attempts 1 servicegroups solaris_CAM hostgroup_name solaris_CAM }
(a service group to look at all CAM monitoring)
Define a Service group[edit]
/opt/nagios/etc/local/servicegroups
- cat solaris-cam.cfg
define servicegroup { servicegroup_name solaris_CAM alias solaris common array manager }
Define a host-template[edit]
/opt/nagios/etc/local/host-templates
cat solaris-cam.cfg define host{ name solaris_CAM hostgroups +solaris_CAM register 0 }
/opt/nagios/etc/local/hostgroups cat solaris-cam.cfg
define hostgroup { hostgroup_name solaris_CAM alias solaris common array manager }
Add check to host[edit]
define host { use solaris_CAM x x x }
Check config[edit]
/opt/nagios/bin/nagios -v /opt/nagios/etc/nagios.cfg
'
NRPE Command definition[edit]
Add this on the client, in the commands folder '/opt/nagios/nrpe/nrpe-commands'
#cat > sstcam.cfg command[check_sstcam]=/opt/nagios/plugins/libexec/check_sstcam
Add the check script to the plugins folder
'/opt/nagios/plugins/libexec/'
Then make it executable
chmod u+x /opt/nagios/plugins/libexec/check_sstcam
and a quick `chown -R nagios:nagios /opt/nagios` never did any harm
then restart nrpe
'/etc/init.d/nrpe restart'
Check script [check_sstcam][edit]
# cat check_sstcam #! /usr/bin/perl -w # # check_sstcam - nagios plugin which checks for alerts generated by the # solaris storage tek common array manager # # # Copyright (C) 2007 Gerhard Lausser, gerhard.lausser@consol.de # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # # Report bugs to: gerhard.lausser@consol.de # # 2007-02-13 1.0 initial release # 2007-02-19 1.1 added -n and -f options to check only certain devices. # 2008-01-17 1.2 added -p for persistency. bugfix with multiline descr. # # # # check_sstcam watches the alertfiles generated by the common array manager. # any changes will be reported immediately. in the service definition # is_volatile and max_check_attempts must be set to 1. # if -p is used, the usual way with is_volatile 0 and max_check_attempts > 1 # is also possible. 
#
# (end of the header comment; the program starts here)
#

use strict;
use warnings;
use Data::Dumper;
use File::Spec;
use IO::File;
use Getopt::Long qw(:config no_ignore_case getopt_compat);

# Package globals (kept as package variables, not lexicals, so that files
# loaded with require -- the device config file and the saved status file --
# still see the same names in package main).
our ($PROGNAME, $REVISION, $TIMEOUT, $ALARMDIR, $STATUSDIR);
our ($opt_V, $opt_h, $opt_t, $opt_v, $opt_n, $opt_f, $opt_p);

# Nagios plugin return codes, by name and by number.
my %ERRORS = (
    OK       => 0,
    WARNING  => 1,
    CRITICAL => 2,
    UNKNOWN  => 3,
);
my %ERRORCODES = (
    0 => 'OK',
    1 => 'WARNING',
    2 => 'CRITICAL',
    3 => 'UNKNOWN',
);

$PROGNAME  = "check_sstcam";
$REVISION  = '$Revision: 1.2 $';
$TIMEOUT   = 10;
$STATUSDIR = "/opt/nagios/etc/logchecks";
$ALARMDIR  = "/var/opt/SUNWsefms/store/Alarms";

# Print the short usage summary and the list of options to STDOUT.
sub print_usage {
    print "Usage:\n";
    print " $PROGNAME [-t <timeout>] [-n <device> | -f <configfile>]\n";
    print " $PROGNAME [-h | --help]\n";
    print " $PROGNAME [-V | --version]\n";
    print "\n\nOptions:\n";
    print " -t, --timeout\n";
    print " The number of seconds after which the plugin will abort\n";
    print " -n, --name\n";
    print " Check only the device with this name\n";
    print " -f, --config\n";
    print " Read the names of the devices to check from this config file\n";
    print " -p, --persistent\n";
    print " Stay critical as long as there are alarm files\n";
    print " -v, --verbose\n";
    print " Print details about known, cleared and new events to STDERR\n";
    print " -h, --help\n";
    print " Print detailed help screen\n";
    print " -V, --version\n";
    print " Print version information\n\n";
    return;
}

# Print the detailed help screen: copyright, usage, description and the
# support notice.
sub print_help {
    print "Copyright (c) 2007 Gerhard Lausser\n\n";
    print_usage();
    print "\n";
    print " Check the solaris storage tek common array manager alert files\n";
    print "\n";
    print "The default is to check all devices managed by the cam.\n";
    print "By providing a certain device's name with the -n option, you can\n";
    print "limit checking for this device only.\n";
    print "By using a config file you can name more than just one devices to\n";
    print "monitor. Provide the filename with the -f option and specify the\n";
    print "device names in the config file with a line like:\n";
    print "\@devicenames = qw(devicename-1 devicename-2 devicename-3);\n";
    print "\n";
    support();
    return;
}

# Print "<command> <version>" followed by the warranty notice.
#   $commandName    - name of the plugin
#   $pluginRevision - RCS-style revision string, e.g. '$Revision: 1.2 $';
#                     the '$Revision: ' prefix and ' $' suffix are stripped.
sub print_revision {
    my ($commandName, $pluginRevision) = @_;
    $pluginRevision =~ s/^\$Revision: //;
    $pluginRevision =~ s/ \$\s*$//;
    print "$commandName $pluginRevision\n";
    print "This nagios plugin comes with ABSOLUTELY NO WARRANTY. You may redistribute\n"
        . "copies of this plugin under the terms of the GNU General Public License.\n";
    return;
}

# Print the support/contact notice.
sub support {
    print "Send email to gerhard.lausser\@consol.de if you have questions\n"
        . "regarding use of this software. \n"
        . "Please include version information with all correspondence (when possible,\n"
        . "use output from the --version option of the plugin itself).\n";
    return;
}

# Append one timestamped line to /tmp/<PROGNAME>.trace.
# Tracing is only active when that file already exists; otherwise this is a
# no-op.
#   $format, @args - printf-style format and its arguments. Always pass
#                    variable data as arguments, never pre-formatted into
#                    $format, so that a '%' in the data cannot be mistaken
#                    for a conversion.
sub trace {
    my ($format, @args) = @_;
    my $tracefile = "/tmp/$PROGNAME.trace";
    return if !-f $tracefile;
    my $logfh = IO::File->new;
    if ($logfh->open($tracefile, "a")) {
        $logfh->printf("%s: ", scalar localtime);
        $logfh->printf($format, @args);
        $logfh->printf("\n");
        $logfh->close;
    }
    return;
}

# Read every file named alarm<digits> in $ALARMDIR and extract the event
# fields from its contents.
# Returns a hash reference: event id => { field name => value }.
# Files that cannot be opened, or that yield no event id, are skipped (and
# traced).
sub getcurrentevents {
    my %events;

    # NOTE(review): every pattern below is the bare '(.*?)', which always
    # matches the empty string. The patterns presumably lost their XML tag
    # delimiters when this listing was extracted from its web page -- restore
    # them from the original plugin before using this script. They are kept
    # here exactly as found. The /s on 'description' lets '.' match newlines.
    my %pattern_for = (
        eventid     => qr#(.*?)#,
        devicename  => qr#(.*?)#,
        id          => qr#(.*?)#,
        devicetype  => qr#(.*?)#,
        description => qr#(.*?)#s,
        severity    => qr#(.*?)#,
        devicekey   => qr#(.*?)#,
        datecreated => qr#(.*?)#,
        state       => qr#(.*?)#,
    );

    trace("looking for alarm files");
    ALARMFILE:
    foreach my $alarmfile (glob $ALARMDIR.'/alarm*') {
        next ALARMFILE if $alarmfile !~ /^.*\/alarm\d+$/;

        my $fh;
        if (!open($fh, '<', $alarmfile)) {
            trace("cannot open alarm file %s", $alarmfile);
            next ALARMFILE;
        }
        trace("opened alarm file %s", $alarmfile);
        my $xml = do { local $/; <$fh> };    # slurp the whole file
        close $fh;
        $xml = '' if !defined $xml;

        my %event;
        foreach my $field (keys %pattern_for) {
            next if $xml !~ $pattern_for{$field};
            my $value = $1;
            $value =~ s#\n# #g;              # description may be multiline
            $event{$field} = $value;
        }

        # Without an event id the event cannot be tracked between runs.
        if (!defined $event{eventid}) {
            trace("no event id found in alarm file %s", $alarmfile);
            next ALARMFILE;
        }
        $events{ $event{eventid} } = \%event;
    }
    return \%events;
}

# Load the events saved by the previous run from
# $STATUSDIR/<PROGNAME>.status (a Data::Dumper dump that assigns $events).
# Returns a hash reference; it is empty when there is no status file or the
# file cannot be loaded.
sub loadevents {
    my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
    our $events = {};
    if (-f $statusfile) {
        trace("loading saved events");
        eval { require $statusfile; };
        trace("cannot load saved events from %s: %s", $statusfile, $@) if $@;
    }
    # A damaged status file must not make the caller dereference a non-hash.
    return ref $events eq 'HASH' ? $events : {};
}

# Save the given events (hash reference) to $STATUSDIR/<PROGNAME>.status in
# the format loadevents() reads back. Saving is best effort: failures are
# traced but do not abort the check.
sub saveevents {
    my ($events) = @_;
    my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
    trace("saving current events");
    local $Data::Dumper::Indent = 1;
    my $dump = Data::Dumper->Dump([$events], [qw(events)]);
    my $fh;
    if (!open($fh, '>', $statusfile)) {
        trace("cannot write status file %s: %s", $statusfile, $!);
        return;
    }
    print {$fh} "$dump\n";
    close $fh
        or trace("cannot close status file %s: %s", $statusfile, $!);
    return;
}

#
# main program
#
my $exitcode    = $ERRORS{UNKNOWN};
my $exitmessage = "you should never see this message";
my @warnings;
my @criticals;

# chdir without an argument changes to $HOME (or $LOGDIR) when set, so a
# relative config file name given with -f is resolved from there.
chdir;

if (! GetOptions(
    "t|timeout=i"  => \$opt_t,
    "n|name=s"     => \$opt_n,
    "f|config=s"   => \$opt_f,
    "p|persistent" => \$opt_p,
    "V|version"    => \$opt_V,
    "h|help"       => \$opt_h,
    "v|verbose"    => \$opt_v,
)) {
    print_help();
    exit $ERRORS{UNKNOWN};
}

if ($opt_t) {
    $TIMEOUT = $opt_t;
}

# Abort with UNKNOWN when the check takes longer than $TIMEOUT seconds.
$SIG{ALRM} = sub {
    printf "UNKNOWN - %s timed out after %d seconds\n", $PROGNAME, $TIMEOUT;
    exit $ERRORS{UNKNOWN};
};
alarm($TIMEOUT);

if ($opt_V) {
    print_revision($PROGNAME, $REVISION);
    exit $ERRORS{OK};
}

if ($opt_h) {
    print_help();
    exit $ERRORS{OK};
}

#
# read a configfile with whitelisted devices
#
our @devicenames;
if ($opt_n) {
    @devicenames = ($opt_n);
} elsif ($opt_f) {
    $opt_f = (-f $opt_f.'.cfg') ? $opt_f.'.cfg' : $opt_f;
    if (-f $opt_f) {
        # require looks up a relative name in @INC, which does not contain
        # the current directory on Perl 5.26 and later, so load the file by
        # its absolute path.
        my $configfile = File::Spec->rel2abs($opt_f);
        eval { require $configfile; };
        if ($@) {
            printf "syntax errors in config file %s: %s\n", $opt_f, $@;
            exit $ERRORS{UNKNOWN};
        }
    } else {
        printf STDERR "cannot open config file %s\n", $opt_f;
        exit $ERRORS{UNKNOWN};
    }
}

#
# todo: decide wether this is the active node of a clustered installation
#

#
# todo: check for processes and alert if cam is not running
#

#
# check for alarm files
#
if (-d $ALARMDIR) {
    my $currentevents = getcurrentevents();
    # In persistent mode nothing counts as already known, so every existing
    # alarm file is reported on every run.
    my $savedevents = $opt_p ? {} : loadevents();
    my @neweventids;

    if (!@devicenames) {
        # empty devicenames means: monitor all known devices
        my %seen;
        my @known = grep { defined $_ && !$seen{$_}++ }
                    map  { $_->{devicename} } values %$currentevents;
        @devicenames = sort @known;
    }
    #printf "%s\n", Data::Dumper::Dumper($currentevents);

    # Compare with the previous run (only used for tracing/verbose output).
    foreach my $eventid (sort keys %{$savedevents}) {
        if (exists $currentevents->{$eventid}) {
            trace("already known event %s", $eventid);
            printf STDERR "already known event %s\n", $eventid if $opt_v;
            # TODO: the severity may have increased in the meantime; re-check.
        } else {
            trace("cleared event %s", $eventid);
            printf STDERR "cleared event %s\n", $eventid if $opt_v;
        }
    }

    # Only events that were not seen in the previous run raise an alert.
    foreach my $eventid (sort keys %{$currentevents}) {
        next if exists $savedevents->{$eventid};
        push(@neweventids, $eventid);
        trace("new event %s", $eventid);
        printf STDERR "new event %s\n", $eventid if $opt_v;
    }

    foreach my $device (@devicenames) {
        my @devcriticals;
        my @devwarnings;

        foreach my $eventid (@neweventids) {
            my $event = $currentevents->{$eventid};
            next if !defined $event->{devicename};
            next if $device ne $event->{devicename};

            my $severity    = defined $event->{severity}
                            ? $event->{severity} : 0;
            my $description = defined $event->{description}
                            ? $event->{description} : '';

            # Severity 2 is reported as warning, 3 and 4 as critical; any
            # other value is ignored. A non-numeric severity compares as 0.
            no warnings 'numeric';
            if ($severity == 2) {
                push(@devwarnings, $description);
            } elsif ($severity == 3 || $severity == 4) {
                push(@devcriticals, $description);
            }
        }

        my $statistics = join(", ", (
            @devcriticals
                ? sprintf("%d error%s", scalar(@devcriticals),
                          scalar(@devcriticals) == 1 ? "" : "s")
                : (),
            @devwarnings
                ? sprintf("%d warning%s", scalar(@devwarnings),
                          scalar(@devwarnings) == 1 ? "" : "s")
                : ()));

        if (@devcriticals) {
            push(@criticals, sprintf "Storage %s (%s): %s", $device,
                $statistics, join(", ", (@devcriticals, @devwarnings)));
        } elsif (@devwarnings) {
            push(@warnings, sprintf "Storage %s (%s): %s", $device,
                $statistics, join(", ", @devwarnings));
        }
    }

    saveevents($currentevents);

    if (@criticals) {
        $exitmessage = join(" // ", @criticals, @warnings);
        $exitcode    = $ERRORS{CRITICAL};
    } elsif (@warnings) {
        $exitmessage = join(" // ", @warnings);
        $exitcode    = $ERRORS{WARNING};
    } else {
        $exitmessage = "cam detected no new errors";
        $exitcode    = $ERRORS{OK};
    }
} else {
    $exitmessage = "Alarm directory does not exist";
    $exitcode    = $ERRORS{UNKNOWN};
}

printf "%s - %s\n", $ERRORCODES{$exitcode}, $exitmessage;
exit $exitcode;