# SEC (Simple Event Correlator) rules that aggregate per-server nagios service
# events into one passive check result per application / datacenter / service.
#
# Every SERVICE rule below uses the same pattern; its capture groups are:
#   $1 = server name (word-word-word)
#   $2 = full service description ("$3 $4 $5")
#   $3, $4, $5 = the three parts of the service description; the original
#                comments call the contexts built from them "app", "dc"
#                (datacenter) and "service"
#   $6 = state (OK, or CRITICAL|WARNING)
#   $7 = remainder of the line
#
# Contexts used:
#   $3_$1_$5_PROBLEM_CONTEXT                     - this server currently has a problem
#   $3_$4_$5_PROBLEM_CONTEXT                     - at least one server in the dc has a problem
#   $3_$4_$5_PROBLEM_DELAY_NOTIFICATION_CONTEXT  - grace period before nagios is told
#   $3_$4_$5_OK_CONTEXT                          - throttles passive OK submissions
#
# The Perl hash entry $h{"$3"}{"$4"}{"$5"} counts the servers currently in a
# problem state; the action variable %h holds the latest value of that counter.
#
# In "write <file> (...)" and "create <ctx> <time> (...)" the parentheses mask
# the semicolons of the nagios external command so that they are not treated
# as action separators.
#
# An action list must end on its last action: a trailing "\" would join the
# next line (typically a comment) onto the action list.

# startup task to invoke nagios_tail.pl to monitor nagios service changes
type=Single
ptype=SubStr
pattern=SEC_STARTUP
context=SEC_INTERNAL_EVENT
desc=monitor new nagios events
action=spawn /apps/nagios23/sec/nagios_tail.pl /apps/nagios23/var/status.dat

#
#_* check if a service that was previously flagged down comes back up
#
# if service is ok and app_server_service_PROBLEM_CONTEXT is currently set
#   decrement the problem counter for this app/dc/service
#   delete app_server_service_PROBLEM_CONTEXT
#   continue processing additional rules (continue=TakeNext)
#   don't update nagios - done in a later rule
# NOTE(review): %i is computed here but is not referenced by any rule in this
# file - confirm whether it is still needed.
type=Single
ptype=RegExp
continue=TakeNext
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(OK),\s(.*)
context=$3_$1_$5_PROBLEM_CONTEXT
desc=$0
action=write - Resolved: $3 $1 $5; \
       eval %h ( $h{"$3"}{"$4"}{"$5"} = $h{"$3"}{"$4"}{"$5"} - 1 ); \
       eval %i ( %h ? 2 : 0 ); \
       write - TOTAL REMAINING PROBLEMS: %h; \
       delete $3_$1_$5_PROBLEM_CONTEXT

#
#_* if all problems for this app-dc-service have ended, remove the problem context for the app and datacenter
#
# if service is OK and app_dc_service_PROBLEM_CONTEXT exists
# and the number of servers having a problem is currently 0
#   remove the app_dc_service_PROBLEM_CONTEXT
#   remove the PROBLEM_DELAY_NOTIFICATION context if it hasn't already expired
#   update nagios - all ok (status 0)
type=Single
ptype=RegExp
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(OK),\s(.*)
context=$3_$4_$5_PROBLEM_CONTEXT && =( defined $h{"$3"}{"$4"}{"$5"} && $h{"$3"}{"$4"}{"$5"} eq "0" )
desc=Delete datacenter-service problem context if no hosts having problems
action=delete $3_$4_$5_PROBLEM_CONTEXT; \
       delete $3_$4_$5_PROBLEM_DELAY_NOTIFICATION_CONTEXT; \
       write - DC Problems Ending: $3 $4 $5; \
       write /apps/nagios23/var/rw/nagios.cmd ([%u] PROCESS_SERVICE_CHECK_RESULT;EventCorrelator;SEC $2;0;No problems found)

#
#_* check if an additional server (that is not one of the servers that is already down) is in critical state
#
# there are two versions of this rule, one where
# app_dc_service_PROBLEM_DELAY_NOTIFICATION_CONTEXT is set, and one
# where it is not set.

# if the PROBLEM_DELAY_NOTIFICATION context has expired, it's safe to update nagios now
# (status 2, with the current problem count in the message)
type=Single
ptype=RegExp
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(CRITICAL|WARNING),\s(.*)
context=$3_$4_$5_PROBLEM_CONTEXT && ! $3_$1_$5_PROBLEM_CONTEXT && ! $3_$4_$5_PROBLEM_DELAY_NOTIFICATION_CONTEXT
desc=$0
action=create $3_$1_$5_PROBLEM_CONTEXT; \
       add $3_$4_$5_PROBLEM_CONTEXT $0 at %t; \
       eval %h ( $h{"$3"}{"$4"}{"$5"} = $h{"$3"}{"$4"}{"$5"} + 1 ); \
       write - TOTAL CURRENT $3 $4 $5 problems: %h; \
       write /apps/nagios23/var/rw/nagios.cmd ([%u] PROCESS_SERVICE_CHECK_RESULT;EventCorrelator;SEC $2;2;%h server problems with $5)

# if the PROBLEM_DELAY_NOTIFICATION context is still in effect, don't flag nagios yet
# (this rule is only reached when the rule above did not match, i.e. the
# delay context still exists)
type=Single
ptype=RegExp
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(CRITICAL|WARNING),\s(.*)
context=$3_$4_$5_PROBLEM_CONTEXT && ! $3_$1_$5_PROBLEM_CONTEXT
desc=$0
action=create $3_$1_$5_PROBLEM_CONTEXT; \
       add $3_$4_$5_PROBLEM_CONTEXT $0 at %t; \
       eval %h ( $h{"$3"}{"$4"}{"$5"} = $h{"$3"}{"$4"}{"$5"} + 1 ); \
       write - TOTAL CURRENT $3 $4 $5 problems: %h

#
#_* check for any service in a problem state in a dc that was previously not in a problem state
#
# create app_dc_service_PROBLEM_CONTEXT
# create app_server_service_PROBLEM_CONTEXT
# set the problem counter to 1
# create a PROBLEM_DELAY_NOTIFICATION context that will expire in 3 minutes (180 s)
#   when this context expires, the failure will be submitted to nagios
type=Single
ptype=RegExp
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(CRITICAL|WARNING),\s(.*)
context=! $3_$1_$5_PROBLEM_CONTEXT
desc=$0
action=write - SINGLE PROBLEM: $3 $4 $5 : $0; \
       create $3_$4_$5_PROBLEM_CONTEXT; \
       create $3_$1_$5_PROBLEM_CONTEXT; \
       eval %h ( $h{"$3"}{"$4"}{"$5"} = 1 ); \
       create $3_$4_$5_PROBLEM_DELAY_NOTIFICATION_CONTEXT 180 (write /apps/nagios23/var/rw/nagios.cmd ([%u] PROCESS_SERVICE_CHECK_RESULT;EventCorrelator;SEC $2;2;Server problems observed))

# keep passive OK checks fresh
# - if there is no PROBLEM_CONTEXT or OK_CONTEXT for this service
#   - create an OK_CONTEXT that will last for 5 minutes (300 s)
#   - submit a passive check to nagios (status 0)
type=Single
ptype=RegExp
continue=TakeNext
pattern=SERVICE, (\w+\-\w+\-[\w\d]+),\s((\w+)\s(\w+)\s([^,]+)),\s(OK),\s(.*)
context=! $3_$4_$5_PROBLEM_CONTEXT && ! $3_$4_$5_OK_CONTEXT
desc=$0
action=write - $3 $4 $5 OK - freshness check; \
       create $3_$4_$5_OK_CONTEXT 300; \
       write /apps/nagios23/var/rw/nagios.cmd ([%u] PROCESS_SERVICE_CHECK_RESULT;EventCorrelator;SEC $2;0;No problems found)

# exit - used for automated testing purposes
# on the _TEST_ENDS_NOW_ marker, run "kill" against $PID (from Perl's English
# module), i.e. the process evaluating this action
type=Single
ptype=RegExp
pattern=_TEST_ENDS_NOW_
desc=$0
action=eval %d ( use English; system "kill $PID"; )