×
Create a new article
Write your page title here:
We currently have 3,189 articles on s23. Type your article name above or create one of the articles listed here!



    s23
    3,189Articles

    Nagios/checks/solaris CAM

    < Nagios‎ | checks

    Define Service[edit]

    This adds a new service and group (the actual check definition). Add it to the local services folder on the Nagios server:

    '/opt/nagios/etc/local/services'

    #cat solaris-cam.cfg 
    
    # Service that runs the check_sstcam command through check_nrpe. It is
    # attached to the solaris_CAM hostgroup and placed in the solaris_CAM
    # servicegroup.
    define service {
      service_description   solaris_cam
      display_name            solaris common array manager
      # the CAM software runs on host hundsglump.muc
      check_command         check_nrpe!check_sstcam
      # The check_sstcam script header states that is_volatile and
      # max_check_attempts must both be 1, because the plugin reports a change
      # only once (unless it is run with -p).
      is_volatile           1
      max_check_attempts    1
      servicegroups           solaris_CAM
      hostgroup_name          solaris_CAM
    }
    
    
    

    (a service group to look at all CAM monitoring)

    Define a Service group[edit]

    /opt/nagios/etc/local/servicegroups

    # cat solaris-cam.cfg
    # Servicegroup referenced by the solaris_cam service definition
    # (servicegroups solaris_CAM).
    define servicegroup {
            servicegroup_name       solaris_CAM
            alias                   solaris common array manager
    }
    



    Define a host-template[edit]

    /opt/nagios/etc/local/host-templates

    cat solaris-cam.cfg 
    # Host template: hosts that "use solaris_CAM" are put into the solaris_CAM
    # hostgroup.
    define host{
            name            solaris_CAM
            # NOTE(review): the leading "+" presumably appends to hostgroups
            # inherited from other templates instead of replacing them -
            # confirm against the Nagios object inheritance documentation.
            hostgroups      +solaris_CAM
            # not registered as a real host; this definition is only a template
            register        0
    }
    

    /opt/nagios/etc/local/hostgroups: cat solaris-cam.cfg

    # Hostgroup named by the solaris_CAM host template and by the solaris_cam
    # service definition (hostgroup_name solaris_CAM).
    define hostgroup {
           hostgroup_name  solaris_CAM
           alias           solaris common array manager
    }
    


    Add check to host[edit]

    define host {
           use             solaris_CAM
    x
    x
    x
    }
    



    Check config[edit]

    /opt/nagios/bin/nagios -v /opt/nagios/etc/nagios.cfg
    

    '


    NRPE Command definition[edit]

    Add this to the client in the commands folder '/opt/nagios/nrpe/nrpe-commands'

    #cat > sstcam.cfg
    command[check_sstcam]=/opt/nagios/plugins/libexec/check_sstcam
    


    add the check script to plugins folder

    '/opt/nagios/plugins/libexec/'

    then make it executable

    chmod u+x /opt/nagios/plugins/libexec/check_sstcam
    

    and a quick `chown -R nagios:nagios /opt/nagios` never did any harm


    then restart nrpe

    '/etc/init.d/nrpe restart'
    


    Check script [check_sstcam][edit]

    # cat check_sstcam
    
    #! /usr/bin/perl -w
    #
    # check_sstcam - nagios plugin which checks for alerts generated by the 
    #                solaris storage tek common array manager
    #                
    #
    # Copyright (C) 2007 Gerhard Lausser, gerhard.lausser@consol.de
    #
    # This program is free software; you can redistribute it and/or
    # modify it under the terms of the GNU General Public License
    # as published by the Free Software Foundation; either version 2
    # of the License, or (at your option) any later version.
    #
    # This program is distributed in the hope that it will be useful,
    # but WITHOUT ANY WARRANTY; without even the implied warranty of
    # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    # GNU General Public License for more details.
    #
    # You should have received a copy of the GNU General Public License
    # along with this program; if not, write to the Free Software
    # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
    #
    #
    # Report bugs to:  gerhard.lausser@consol.de
    #
    # 2007-02-13 1.0        initial release
    # 2007-02-19 1.1        added -n and -f options to check only certain devices.
    # 2008-01-17 1.2        added -p for persistency. bugfix with multiline descr.
    #
    #
     
    #
    # check_sstcam watches the alertfiles generated by the common array manager.
    # any changes will be reported immediately. in the service definition
    # is_volatile and max_check_attempts must be set to 1.
    # if -p is used, the usual way with is_volatile 0 and max_check_attempts > 1
    # is also possible.
    # 
    #
    use strict;
    use Data::Dumper;
    use IO::File;
    use Getopt::Long qw(:config no_ignore_case getopt_compat);
    use vars qw($PROGNAME $REVISION $TIMEOUT $ALARMDIR $STATUSDIR
        $opt_V $opt_h $opt_t $opt_v $opt_n $opt_f $opt_p);
    
    my %ERRORS=( OK => 0, WARNING => 1, CRITICAL => 2, UNKNOWN => 3 );
    my %ERRORCODES=( 0 => 'OK', 1 => 'WARNING', 2 => 'CRITICAL', 3 => 'UNKNOWN' );
     
    $PROGNAME = "check_sstcam";
    $REVISION = '$Revision: 1.2 $';
    $TIMEOUT = 10;
    
    $STATUSDIR = "/opt/nagios/etc/logchecks";
    $ALARMDIR = "/var/opt/SUNWsefms/store/Alarms";
     
    # Print the command line synopsis and a short description of every option
    # that the GetOptions call of this script accepts to STDOUT.
    # Takes no arguments.
    sub print_usage () {
      print "Usage:\n";
      print "  $PROGNAME [-t <timeout>] [-p] [-v] [-n <device> | -f <configfile>]\n";
      print "  $PROGNAME [-h | --help]\n";
      print "  $PROGNAME [-V | --version]\n";
      print "\n\nOptions:\n";
      print "  -t, --timeout\n";
      print "     The number of seconds after which the plugin will abort\n";
      print "  -n, --name\n";
      print "     Check only the device with this name\n";
      print "  -f, --config\n";
      print "     Check only the devices listed in this config file\n";
      print "  -p, --persistent\n";
      print "     Stay critical as long as there are alarm files\n";
      print "  -v, --verbose\n";
      print "     Report known, cleared and new events on STDERR\n";
      print "  -h, --help\n";
      print "     Print detailed help screen\n";
      print "  -V, --version\n";
      print "     Print version information\n\n";
    }
     
    # Print the complete help screen to STDOUT: copyright line, the usage
    # synopsis (print_usage), an explanation of how the monitored devices are
    # selected, and the support contact (support). Takes no arguments.
    sub print_help () {
      print "Copyright (c) 2007 Gerhard Lausser\n\n";
      print_usage();
      # one print with a list of lines; the default output separators are
      # empty, so the lines are emitted back to back
      print "\n",
            "  Check the solaris storage tek common array manager alert files\n",
            "\n",
            "The default is to check all devices managed by the cam.\n",
            "By providing a certain device's name with the -n option, you can\n",
            "limit checking for this device only.\n",
            "By using a config file you can name more than just one devices to\n",
            "monitor. Provide the filename with the -f option and specify the\n",
            "device names in the config file with a line like:\n",
            "\@devicenames = qw(devicename-1 devicename-2 devicename-3);\n",
            "\n";
      support();
    }
     
    # Print "<command name> <version>" followed by the warranty/licence notice
    # to STDOUT.
    #   first argument   name of the command
    #   second argument  revision string in RCS keyword form; the leading
    #                    "$Revision: " and the trailing " $" are removed
    sub print_revision ($$) {
      my ($name, $revision) = @_;
      # strip the keyword wrapper so that only the version number remains
      for ($revision) {
        s/^\$Revision: //;
        s/ \$\s*$//;
      }
      print "$name $revision\n";
      print "This nagios plugin comes with ABSOLUTELY NO WARRANTY. You may redistribute\ncopies of this plugin under the terms of the GNU General Public License.\n";
    }
    
    # Print the support contact notice to STDOUT. Takes no arguments.
    #
    # Declared without a prototype on purpose: print_help() calls support()
    # before this definition has been compiled, so an empty prototype could
    # not be checked at that call and only produced perl's "called too early
    # to check prototype" warning under -w.
    sub support {
      # The text used to be a single-quoted string that was patched up with two
      # substitutions (one of them replaced "@" by itself); it is now written
      # directly with real newlines. The output is unchanged.
      print "Send email to gerhard.lausser\@consol.de if you have questions\n",
            "regarding use of this software. \n",
            "Please include version information with all correspondence (when possible,\n",
            "use output from the --version option of the plugin itself).\n";
    }
    
    # Append one timestamped line to /tmp/$PROGNAME.trace.
    #
    # Nothing is written unless that file already exists as a plain file, so
    # creating the file is what switches tracing on. Failure to open the file
    # is ignored.
    #
    #   trace($message)         $message is written verbatim
    #   trace($format, @args)   printf-style formatting of @args
    #
    # Most callers in this script hand over a string that was already built
    # with sprintf. Such a string must not be interpreted as a format a second
    # time, otherwise a "%" in a file name or event id garbles the line, so
    # formatting is only applied when arguments are present.
    sub trace {
      my $format = shift;
      my $tracefile = "/tmp/$PROGNAME.trace";
      return unless -f $tracefile;
      my $logfh = IO::File->new;
      return unless $logfh->open($tracefile, "a");
      $logfh->printf("%s: ", scalar localtime);
      if (@_) {
        $logfh->printf($format, @_);
      } else {
        $logfh->print($format);
      }
      $logfh->print("\n");
      $logfh->close;
      return;
    }
    
    
    # Read the alarm files below $ALARMDIR and return the alarms found in them.
    #
    # Only files whose name is "alarm" followed by digits are looked at.
    # Returns a hash reference keyed by event id. Every value is a hash
    # reference with those of the fields eventid, devicename, id, devicetype,
    # description, severity, devicekey, datecreated and state that could be
    # extracted from the file. Newlines inside a field are replaced by blanks.
    # Files that cannot be read or that yield no event id are skipped (and
    # mentioned in the trace file).
    #
    # NOTE(review): as this source stands, every extraction pattern is a bare
    # "(.*?)", which matches the empty string at the start of any input, so
    # each field comes out empty. It looks as if delimiters around the capture
    # group (presumably XML tags) were lost when this page was extracted -
    # compare with the upstream check_sstcam before relying on this function.
    # The patterns are left untouched here because the missing text cannot be
    # recovered from this file.
    sub getcurrentevents {
      my $tmpevents = {};
      my $patterns = {
        eventid     => qr#(.*?)#,
        devicename  => qr#(.*?)#,
        id          => qr#(.*?)#,
        devicetype  => qr#(.*?)#,
        description => qr#(.*?)#s,
        severity    => qr#(.*?)#,
        devicekey   => qr#(.*?)#,
        datecreated => qr#(.*?)#,
        state       => qr#(.*?)#,
      };
      trace("looking for alarm files");
      foreach my $alarmfile (glob $ALARMDIR.'/alarm*') {
        next if $alarmfile !~ /^.*\/alarm\d+$/;
        # slurp the whole file through an explicitly opened, checked handle
        my $xml;
        if (open(my $fh, '<', $alarmfile)) {
          trace(sprintf "opened alarm file %s", $alarmfile);
          local $/;
          $xml = <$fh>;
          close($fh);
        }
        if (! defined $xml) {
          trace(sprintf "cannot open alarm file %s", $alarmfile);
          next;
        }
        my $tmpevent = {};
        foreach my $field (keys %$patterns) {
          if ($xml =~ $patterns->{$field}) {
            $tmpevent->{$field} = $1;
            $tmpevent->{$field} =~ s#\n# #g; # Description may be multiline
          }
        }
        # without an event id the alarm cannot be told apart from others;
        # storing it would make all such alarms collide under one key
        if (! defined $tmpevent->{eventid}) {
          trace(sprintf "no event id found in alarm file %s", $alarmfile);
          next;
        }
        $tmpevents->{$tmpevent->{eventid}} = $tmpevent;
      }
      return $tmpevents;
    }
    
    
    # Load the events saved by the previous run from
    # $STATUSDIR/$PROGNAME.status.
    #
    # Returns a hash reference. It is empty when there is no status file, when
    # the file cannot be loaded, or when loading it did not leave a hash
    # reference in $events; the reason is written to the trace file. An empty
    # result makes the caller treat every current alarm as new.
    #
    # The status file is the Data::Dumper output written by saveevents(), i.e.
    # perl code that assigns to the package variable $events.
    # NOTE(review): require executes whatever the status file contains as perl
    # code, so the file and $STATUSDIR must only be writable by the user that
    # runs this plugin.
    sub loadevents {
      my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
      our $events = {};
      if (-f $statusfile) {
        trace("loading saved events");
        eval {
          require $statusfile;
        };
        if ($@) {
          trace("cannot load saved events from %s: %s", $statusfile, $@);
          $events = {};
        } elsif (ref($events) ne 'HASH') {
          # a damaged file may leave something in $events that the caller
          # could not use as a hash reference
          trace("ignoring malformed status file %s", $statusfile);
          $events = {};
        }
      }
      return $events;
    }
    
    
    # Write the given events to $STATUSDIR/$PROGNAME.status so that the next
    # run can tell new alarms from known ones.
    #
    #   first argument  hash reference of events (as built by
    #                   getcurrentevents)
    #
    # The file holds Data::Dumper output that assigns the structure to a
    # variable named $events, which is the form loadevents() reads back.
    # Saving is best effort: when the file cannot be written the plugin goes
    # on, but the failure is recorded in the trace file, because an outdated
    # status file means the same alarms are reported as new again.
    sub saveevents {
      my $events = shift;
      my $statusfile = $STATUSDIR.'/'.$PROGNAME.'.status';
      trace("saving current events");
      # change the dumper layout for this call only
      local $Data::Dumper::Indent = 1;
      my $dump = Data::Dumper->Dump([$events], [qw(events)]);
      my $snap;
      if (! open($snap, '>', $statusfile)) {
        trace("cannot write status file %s: %s", $statusfile, $!);
        return;
      }
      print $snap "$dump\n";
      # buffered write errors only show up when the handle is closed
      if (! close($snap)) {
        trace("cannot write status file %s: %s", $statusfile, $!);
      }
      return;
    }
    
    
    # Final result of the plugin. These placeholders are replaced further down
    # once the alarm directory has been examined.
    my ($exitcode, $exitmessage) =
        ($ERRORS{UNKNOWN}, "you should never see this message");
    # Result lines collected while the events are evaluated.
    my (@warnings, @criticals, @unknowns);
    
    # chdir without an argument: perl changes to the directory named in
    # $ENV{HOME} (or $ENV{LOGDIR}) if one of them is set.
    chdir;
    # Unknown or malformed options: show the help screen and leave as UNKNOWN.
    GetOptions(
        "t|timeout=i" => \$opt_t,
        "n|name=s" => \$opt_n,
        "f|config=s" => \$opt_f,
        "p|persistent" => \$opt_p,
        "V|version" => \$opt_V,
        "h|help" => \$opt_h,
        "v|verbose" => \$opt_v,
    ) or do {
      print_help();
      exit $ERRORS{UNKNOWN};
    };
    
    # -t replaces the default timeout (a value of 0 keeps the default).
    $TIMEOUT = $opt_t if $opt_t;
    
    # Give up with UNKNOWN when the check runs longer than $TIMEOUT seconds.
    $SIG{ALRM} = sub {
      printf "UNKNOWN - %s timed out after %d seconds\n", $PROGNAME, $TIMEOUT;
      exit $ERRORS{UNKNOWN};
    };
    alarm($TIMEOUT);
    
    # -V and -h only print information and end with OK; -V is looked at first.
    if ($opt_V) {
      print_revision($PROGNAME, $REVISION);
      exit $ERRORS{OK};
    } elsif ($opt_h) {
      print_help();
      exit $ERRORS{OK};
    }
    
    #
    # read a configfile with whitelisted devices
    #
    # -n names exactly one device and takes precedence over -f. -f names a file
    # with perl code that is expected to fill @devicenames; the name may be
    # given with or without its ".cfg" extension. With neither option
    # @devicenames stays empty here.
    #
    our @devicenames;
    if ($opt_n) {
      @devicenames = ($opt_n);
    } elsif ($opt_f) {
      $opt_f = (-f $opt_f.'.cfg') ? $opt_f.'.cfg' : $opt_f;
      if (-f $opt_f) {
        # require looks a file name up in @INC unless it starts with "/", "./"
        # or "../", and perl 5.26 and later no longer have "." in @INC. A plain
        # relative name that passed the -f test above would therefore not be
        # found, so anchor it to the current directory.
        my $configfile = ($opt_f =~ m{^\.{0,2}/}) ? $opt_f : './'.$opt_f;
        eval {
          require $configfile;
        };
        if ($@) {
          printf "syntax errors in config file %s: %s\n", $opt_f, $@;
          exit $ERRORS{UNKNOWN};
        }
      } else {
        # report on STDOUT like every other result of this plugin; text on
        # STDERR does not show up in the plugin output
        printf "cannot open config file %s\n", $opt_f;
        exit $ERRORS{UNKNOWN};
      }
    }
    
    #
    # todo: decide whether this is the active node of a clustered installation
    #
    
    #
    # todo: check for processes and alert if cam is not running
    #
    
    #
    # check for alarm files
    #
    # Compares the alarms found on disk with the ones saved by the previous
    # run, reports the new ones per device, saves the current set and exits
    # with the matching nagios status.
    #
    if (! -d $ALARMDIR) {
      $exitmessage = "Alarm directory does not exist";
      $exitcode = $ERRORS{UNKNOWN};
    } else {
      my $current = getcurrentevents();
      # with -p no event counts as already seen, so every alarm that still
      # exists is reported again on each run
      my $saved = $opt_p ? {} : loadevents();
      my (@neweventids, @oldeventids, @deleventids);
      unless (@devicenames) {
        # empty devicenames means: monitor all known devices
        my %seen;
        foreach my $event (values %$current) {
          my $name = $event->{devicename};
          push(@devicenames, $name) unless $seen{$name}++;
        }
      }
      #printf "%s\n", Data::Dumper::Dumper($current);
      # saved events are either still present or have been cleared
      foreach my $eventid (keys %$saved) {
        if (! exists $current->{$eventid}) {
          push(@deleventids, $eventid);
          trace(sprintf "cleared event %s", $eventid);
          printf STDERR "cleared event %s\n", $eventid if $opt_v;
          next;
        }
        push(@oldeventids, $eventid);
        trace(sprintf "already known event %s", $eventid);
        printf STDERR "already known event %s\n", $eventid if $opt_v;
        # the severity may have increased in the meantime. to be clarified.
      }
      # everything on disk that was not saved before is new
      foreach my $eventid (grep { ! exists $saved->{$_} } keys %$current) {
        push(@neweventids, $eventid);
        trace(sprintf "new event %s", $eventid);
        printf STDERR "new event %s\n", $eventid if $opt_v;
      }
      # one result line per device that has new events
      foreach my $device (@devicenames) {
        my (@devcriticals, @devwarnings);
        foreach my $eventid (@neweventids) {
          my $event = $current->{$eventid};
          next unless $device eq $event->{devicename};
          # severity 2 counts as warning, 3 and 4 as critical, any other
          # value is not reported
          my $severity = $event->{severity};
          if ($severity == 2) {
            push(@devwarnings, $event->{description});
          } elsif ($severity == 3 || $severity == 4) {
            push(@devcriticals, $event->{description});
          }
        }
        my @counts;
        if (@devcriticals) {
          push(@counts, sprintf("%d error%s", scalar(@devcriticals),
              scalar(@devcriticals) == 1 ? "" : "s"));
        }
        if (@devwarnings) {
          push(@counts, sprintf("%d warning%s", scalar(@devwarnings),
              scalar(@devwarnings) == 1 ? "" : "s"));
        }
        my $statistics = join(", ", @counts);
        if (@devcriticals) {
          push(@criticals, sprintf "Storage %s (%s): %s", $device,
              $statistics, join(", ", (@devcriticals, @devwarnings)));
        } elsif (@devwarnings) {
          push(@warnings, sprintf "Storage %s (%s): %s", $device,
              $statistics, join(", ", @devwarnings));
        }
      }
      saveevents($current);
      # the worst status found decides the exit code
      if (@criticals) {
        $exitmessage = join(" // ", @criticals, @warnings);
        $exitcode = $ERRORS{CRITICAL};
      } elsif (@warnings) {
        $exitmessage = join(" // ", @warnings);
        $exitcode = $ERRORS{WARNING};
      } else {
        $exitmessage = "cam detected no new errors";
        $exitcode = $ERRORS{OK};
      }
    }
    
    printf "%s - %s\n", $ERRORCODES{$exitcode}, $exitmessage;
    exit $exitcode;
    
    Cookies help us deliver our services. By using our services, you agree to our use of cookies.
    Cookies help us deliver our services. By using our services, you agree to our use of cookies.