Icinga-core 1.4.0
next gen monitoring
00001 /***************************************************************************** 00002 * 00003 * CHECKS.C - Service and host check functions for Icinga 00004 * 00005 * Copyright (c) 1999-2010 Ethan Galstad (egalstad@nagios.org) 00006 * Copyright (c) 2009-2011 Nagios Core Development Team and Community Contributors 00007 * Copyright (c) 2009-2011 Icinga Development Team (http://www.icinga.org) 00008 * 00009 * License: 00010 * 00011 * This program is free software; you can redistribute it and/or modify 00012 * it under the terms of the GNU General Public License version 2 as 00013 * published by the Free Software Foundation. 00014 * 00015 * This program is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00018 * GNU General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU General Public License 00021 * along with this program; if not, write to the Free Software 00022 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
00023 * 00024 *****************************************************************************/ 00025 00026 #include "../include/config.h" 00027 #include "../include/comments.h" 00028 #include "../include/common.h" 00029 #include "../include/statusdata.h" 00030 #include "../include/downtime.h" 00031 #include "../include/macros.h" 00032 #include "../include/icinga.h" 00033 #include "../include/broker.h" 00034 #include "../include/perfdata.h" 00035 00036 /*#define DEBUG_CHECKS*/ 00037 /*#define DEBUG_HOST_CHECKS 1*/ 00038 00039 00040 #ifdef EMBEDDEDPERL 00041 #include "../include/epn_icinga.h" 00042 #endif 00043 00044 #ifdef USE_EVENT_BROKER 00045 #include "../include/neberrors.h" 00046 #endif 00047 00048 extern int sigshutdown; 00049 extern int sigrestart; 00050 00051 extern char *temp_file; 00052 extern char *temp_path; 00053 extern char *check_result_path; 00054 00055 extern int interval_length; 00056 00057 extern int command_check_interval; 00058 00059 extern int log_initial_states; 00060 extern int log_passive_checks; 00061 00062 extern int service_check_timeout; 00063 extern int host_check_timeout; 00064 00065 extern int check_reaper_interval; 00066 extern int max_check_reaper_time; 00067 00068 extern int use_aggressive_host_checking; 00069 extern unsigned long cached_host_check_horizon; 00070 extern unsigned long cached_service_check_horizon; 00071 extern int enable_predictive_host_dependency_checks; 00072 extern int enable_predictive_service_dependency_checks; 00073 00074 extern int soft_state_dependencies; 00075 00076 extern int currently_running_service_checks; 00077 extern int currently_running_host_checks; 00078 00079 extern int accept_passive_service_checks; 00080 extern int execute_service_checks; 00081 extern int accept_passive_host_checks; 00082 extern int execute_host_checks; 00083 extern int obsess_over_services; 00084 extern int obsess_over_hosts; 00085 00086 extern int translate_passive_host_checks; 00087 extern int passive_host_checks_are_soft; 
00088 00089 extern int check_service_freshness; 00090 extern int check_host_freshness; 00091 extern int additional_freshness_latency; 00092 00093 extern int max_host_check_spread; 00094 extern int max_service_check_spread; 00095 00096 extern int use_large_installation_tweaks; 00097 extern int free_child_process_memory; 00098 extern int child_processes_fork_twice; 00099 00100 extern int stalking_event_handlers_for_hosts; 00101 extern int stalking_event_handlers_for_services; 00102 00103 extern time_t program_start; 00104 extern time_t event_start; 00105 00106 extern timed_event *event_list_low; 00107 extern timed_event *event_list_low_tail; 00108 00109 extern host *host_list; 00110 extern service *service_list; 00111 extern servicedependency *servicedependency_list; 00112 extern hostdependency *hostdependency_list; 00113 00114 extern unsigned long next_event_id; 00115 extern unsigned long next_problem_id; 00116 00117 extern check_result check_result_info; 00118 extern check_result *check_result_list; 00119 00120 extern pthread_t worker_threads[TOTAL_WORKER_THREADS]; 00121 00122 extern unsigned long max_debug_file_size; 00123 00124 #ifdef EMBEDDEDPERL 00125 extern int use_embedded_perl; 00126 #endif 00127 00128 int dummy; /* reduce compiler warnings */ 00129 00130 /******************************************************************/ 00131 /********************* MISCELLANEOUS FUNCTIONS ********************/ 00132 /******************************************************************/ 00133 00134 /* extract check result */ 00135 static void extract_check_result(FILE *fp,dbuf *checkresult_dbuf){ 00136 char output_buffer[MAX_INPUT_BUFFER]=""; 00137 char *temp_buffer; 00138 00139 /* initialize buffer */ 00140 strcpy(output_buffer,""); 00141 00142 /* get all lines of plugin output - escape newlines */ 00143 while(fgets(output_buffer,sizeof(output_buffer)-1,fp)){ 00144 temp_buffer=escape_newlines(output_buffer); 00145 dbuf_strcat(checkresult_dbuf,temp_buffer); 00146 
my_free(temp_buffer); 00147 } 00148 } 00149 00150 /* convert a command line to an array of arguments, suitable for exec* functions */ 00151 static int parse_command_line(char *cmd, char *argv[MAX_CMD_ARGS]){ 00152 unsigned int argc=0; 00153 char *parsed_cmd; 00154 00155 /* Skip initial white-space characters. */ 00156 for(parsed_cmd=cmd;isspace(*cmd);++cmd) 00157 ; 00158 00159 /* Parse command line. */ 00160 while(*cmd&&(argc<MAX_CMD_ARGS-1)){ 00161 argv[argc++]=parsed_cmd; 00162 00163 switch(*cmd){ 00164 case '\'': 00165 while((*cmd)&&(*cmd!='\'')) 00166 *(parsed_cmd++)=*(cmd++); 00167 if(*cmd) 00168 ++cmd; 00169 break; 00170 case '"': 00171 while((*cmd)&&(*cmd!='"')){ 00172 if((*cmd=='\\')&&cmd[1]&&strchr("\"\\\n",cmd[1])) 00173 ++cmd; 00174 *(parsed_cmd++)=*(cmd++); 00175 } 00176 if(*cmd) 00177 ++cmd; 00178 break; 00179 default: 00180 while((*cmd)&&!isspace(*cmd)){ 00181 if((*cmd=='\\')&&cmd[1]) 00182 ++cmd; 00183 *(parsed_cmd++)=*(cmd++); 00184 } 00185 } 00186 00187 while(isspace(*cmd)) 00188 ++cmd; 00189 00190 if(argc>=MAX_CMD_ARGS-1){ 00191 logit(NSLOG_RUNTIME_WARNING,TRUE,"overlimit args for command %s\n",argv[0]); 00192 _exit(STATE_UNKNOWN); 00193 } 00194 else 00195 *(parsed_cmd++)='\0'; 00196 } 00197 00198 argv[argc]=NULL; 00199 00200 return OK; 00201 } 00202 00203 /* run a check */ 00204 static int run_check(char *processed_command,dbuf *checkresult_dbuf){ 00205 char *argv[MAX_CMD_ARGS]; 00206 FILE *fp; 00207 pid_t pid; 00208 int pipefds[2]; 00209 int retval; 00210 00211 /* check for check execution method (shell or execvp) */ 00212 if(!has_shell_metachars(processed_command)){ 00213 00214 if(pipe(pipefds)<0){ 00215 logit(NSLOG_RUNTIME_WARNING,TRUE,"error creating pipe: %s\n", strerror(errno)); 00216 _exit(STATE_UNKNOWN); 00217 } 00218 if((pid=fork())<0){ 00219 logit(NSLOG_RUNTIME_WARNING,TRUE,"fork error\n"); 00220 _exit(STATE_UNKNOWN); 00221 } 00222 else if(!pid){ 00223 /* child replaces stdout/stderr with output of the pipe */ 00224 
if((dup2(pipefds[1],STDOUT_FILENO)<0)||(dup2(pipefds[1],STDERR_FILENO)<0)){ 00225 logit(NSLOG_RUNTIME_WARNING,TRUE,"dup2 error\n"); 00226 _exit(STATE_UNKNOWN); 00227 } 00228 00229 /* close unused half of pipe */ 00230 close(pipefds[1]); 00231 00232 /* extract command args for execv */ 00233 parse_command_line(processed_command,argv); 00234 00235 if(!argv[0]){ 00236 logit(NSLOG_RUNTIME_WARNING,TRUE,"plugin command definition empty\n"); 00237 _exit(STATE_UNKNOWN); 00238 } 00239 00240 log_debug_info(DEBUGL_CHECKS,0,"running command %s via execvp\n",processed_command); 00241 00242 if(execvp(argv[0], argv)<0){ /* execvp only returns in case of an error */ 00243 logit(NSLOG_RUNTIME_WARNING,TRUE,"error executing command '%s': %s. Make sure that the file actually exists (in PATH, if set) and is executable!\n",processed_command, strerror(errno)); 00244 _exit(STATE_UNKNOWN); 00245 } 00246 _exit(STATE_UNKNOWN); 00247 } 00248 00249 /* prepare pipe reading */ 00250 close(pipefds[1]); 00251 fp=fdopen(pipefds[0],"r"); 00252 if(!fp){ 00253 logit(NSLOG_RUNTIME_WARNING,TRUE,"fdopen error\n"); 00254 _exit(STATE_UNKNOWN); 00255 } 00256 00257 /* extract check result */ 00258 extract_check_result(fp,checkresult_dbuf); 00259 00260 /* close the process */ 00261 fclose(fp); 00262 close(pipefds[0]); 00263 00264 if(waitpid(pid,&retval,0)!=pid) 00265 retval=-1; 00266 } 00267 else{ 00268 log_debug_info(DEBUGL_CHECKS,0,"running command %s via popen\n",processed_command); 00269 fp=popen(processed_command,"r"); 00270 00271 if(fp==NULL) 00272 _exit(STATE_UNKNOWN); 00273 00274 /* extract check result */ 00275 extract_check_result(fp,checkresult_dbuf); 00276 00277 /* close the process */ 00278 retval=pclose(fp); 00279 } 00280 00281 return retval; 00282 } 00283 00284 00285 /******************************************************************/ 00286 /********************** CHECK REAPER FUNCTIONS ********************/ 00287 /******************************************************************/ 00288 00289 
/* reaps host and service check results */ 00290 int reap_check_results(void){ 00291 check_result *queued_check_result=NULL; 00292 service *temp_service=NULL; 00293 host *temp_host=NULL; 00294 time_t current_time=0L; 00295 time_t reaper_start_time=0L; 00296 int reaped_checks=0; 00297 00298 log_debug_info(DEBUGL_FUNCTIONS,0,"reap_check_results() start\n"); 00299 log_debug_info(DEBUGL_CHECKS,0,"Starting to reap check results.\n"); 00300 00301 /* get the start time */ 00302 time(&reaper_start_time); 00303 00304 /* process files in the check result queue */ 00305 process_check_result_queue(check_result_path); 00306 00307 /* read all check results that have come in... */ 00308 while((queued_check_result=read_check_result())){ 00309 00310 reaped_checks++; 00311 00312 log_debug_info(DEBUGL_CHECKS,2,"Found a check result (#%d) to handle...\n",reaped_checks); 00313 00314 /* service check */ 00315 if(queued_check_result->object_check_type==SERVICE_CHECK){ 00316 00317 /* make sure the service exists */ 00318 if((temp_service=find_service(queued_check_result->host_name,queued_check_result->service_description))==NULL){ 00319 00320 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check result queue contained results for service '%s' on host '%s', but the service could not be found! 
Perhaps you forgot to define the service in your config files?\n",queued_check_result->service_description,queued_check_result->host_name); 00321 00322 /* delete the file that contains the check results, as well as the ok-to-go file */ 00323 delete_check_result_file(queued_check_result->output_file); 00324 00325 /* free memory */ 00326 free_check_result(queued_check_result); 00327 my_free(queued_check_result); 00328 00329 /* TODO - add new service definition automatically */ 00330 00331 continue; 00332 } 00333 00334 log_debug_info(DEBUGL_CHECKS,1,"Handling check result for service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name); 00335 00336 /* process the check result */ 00337 handle_async_service_check_result(temp_service,queued_check_result); 00338 } 00339 00340 /* host check */ 00341 else{ 00342 if((temp_host=find_host(queued_check_result->host_name))==NULL){ 00343 00344 /* make sure the host exists */ 00345 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check result queue contained results for host '%s', but the host could not be found! 
Perhaps you forgot to define the host in your config files?\n",queued_check_result->host_name); 00346 00347 /* delete the file that contains the check results, as well as the ok-to-go file */ 00348 delete_check_result_file(queued_check_result->output_file); 00349 00350 /* free memory */ 00351 free_check_result(queued_check_result); 00352 my_free(queued_check_result); 00353 00354 /* TODO - add new host definition automatically */ 00355 00356 continue; 00357 } 00358 00359 log_debug_info(DEBUGL_CHECKS,1,"Handling check result for host '%s'...\n",temp_host->name); 00360 00361 /* process the check result */ 00362 handle_async_host_check_result_3x(temp_host,queued_check_result); 00363 } 00364 00365 /* delete the file that contains the check results, as well as the ok-to-go file */ 00366 /* files can contain multiple check results - in this case, the file will be removed when the first check result is processed */ 00367 delete_check_result_file(queued_check_result->output_file); 00368 00369 log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Deleted check result file '%s'\n",queued_check_result->output_file); 00370 00371 /* free allocated memory */ 00372 free_check_result(queued_check_result); 00373 my_free(queued_check_result); 00374 00375 /* break out if we've been here too long (max_check_reaper_time seconds) */ 00376 time(¤t_time); 00377 if((int)(current_time-reaper_start_time)>max_check_reaper_time){ 00378 log_debug_info(DEBUGL_CHECKS,0,"Breaking out of check result reaper: max reaper time exceeded\n"); 00379 break; 00380 } 00381 00382 /* bail out if we encountered a signal */ 00383 if(sigshutdown==TRUE || sigrestart==TRUE){ 00384 log_debug_info(DEBUGL_CHECKS,0,"Breaking out of check result reaper: signal encountered\n"); 00385 break; 00386 } 00387 } 00388 00389 log_debug_info(DEBUGL_CHECKS,0,"Finished reaping %d check results\n",reaped_checks); 00390 log_debug_info(DEBUGL_FUNCTIONS,0,"reap_check_results() end\n"); 00391 00392 return OK; 00393 } 00394 00395 00396 00397 00398 
/******************************************************************/ 00399 /****************** SERVICE MONITORING FUNCTIONS ******************/ 00400 /******************************************************************/ 00401 00402 /* executes a scheduled service check */ 00403 int run_scheduled_service_check(service *svc, int check_options, double latency){ 00404 int result=OK; 00405 time_t current_time=0L; 00406 time_t preferred_time=0L; 00407 time_t next_valid_time=0L; 00408 int time_is_valid=TRUE; 00409 00410 if(svc==NULL) 00411 return ERROR; 00412 00413 log_debug_info(DEBUGL_FUNCTIONS,0,"run_scheduled_service_check() start\n"); 00414 log_debug_info(DEBUGL_CHECKS,0,"Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n",svc->description,svc->host_name,check_options,latency); 00415 00416 /* attempt to run the check */ 00417 result=run_async_service_check(svc,check_options,latency,TRUE,TRUE,&time_is_valid,&preferred_time); 00418 00419 /* an error occurred, so reschedule the check */ 00420 if(result==ERROR){ 00421 00422 log_debug_info(DEBUGL_CHECKS,1,"Unable to run scheduled service check at this time\n"); 00423 00424 /* only attempt to (re)schedule checks that should get checked... 
*/ 00425 if(svc->should_be_scheduled==TRUE){ 00426 00427 /* get current time */ 00428 time(¤t_time); 00429 00430 /* determine next time we should check the service if needed */ 00431 /* if service has no check interval, schedule it again for 5 minutes from now */ 00432 if(current_time>=preferred_time) 00433 preferred_time=current_time+((svc->check_interval<=0)?300:(svc->check_interval*interval_length)); 00434 00435 /* make sure we rescheduled the next service check at a valid time */ 00436 get_next_valid_time(preferred_time,&next_valid_time,svc->check_period_ptr); 00437 00438 /* 00439 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Service '%s' on host '%s' timeperiod check failed...\n",svc->description,svc->host_name); 00440 logit(NSLOG_RUNTIME_WARNING,TRUE,"Current time: %s",ctime(¤t_time)); 00441 logit(NSLOG_RUNTIME_WARNING,TRUE,"Preferred time: %s",ctime(&preferred_time)); 00442 logit(NSLOG_RUNTIME_WARNING,TRUE,"Next valid time: %s",ctime(&next_valid_time)); 00443 */ 00444 00445 /* the service could not be rescheduled properly - set the next check time for next week */ 00446 /*if(time_is_valid==FALSE && next_valid_time==preferred_time){*/ 00447 /* UPDATED 08/12/09 EG to reflect proper timeperod check logic */ 00448 if(time_is_valid==FALSE && check_time_against_period(next_valid_time,svc->check_period_ptr)==ERROR){ 00449 00450 /* 00451 svc->next_check=(time_t)(next_valid_time+(60*60*24*365)); 00452 svc->should_be_scheduled=FALSE; 00453 */ 00454 00455 svc->next_check=(time_t)(next_valid_time+(60*60*24*7)); 00456 00457 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of service '%s' on host '%s' could not be rescheduled properly. Scheduling check for next week...\n",svc->description,svc->host_name); 00458 00459 log_debug_info(DEBUGL_CHECKS,1,"Unable to find any valid times to reschedule the next service check!\n"); 00460 } 00461 00462 /* this service could be rescheduled... 
*/ 00463 else{ 00464 svc->next_check=next_valid_time; 00465 svc->should_be_scheduled=TRUE; 00466 00467 log_debug_info(DEBUGL_CHECKS,1,"Rescheduled next service check for %s",ctime(&next_valid_time)); 00468 } 00469 } 00470 00471 /* reschedule the next service check - unless we couldn't find a valid next check time */ 00472 /* 10/19/07 EG - keep original check options */ 00473 if(svc->should_be_scheduled==TRUE) 00474 schedule_service_check(svc,svc->next_check,check_options); 00475 00476 /* update the status log */ 00477 update_service_status(svc,FALSE); 00478 00479 return ERROR; 00480 } 00481 00482 return OK; 00483 } 00484 00485 00486 /* forks a child process to run a service check, but does not wait for the service check result */ 00487 int run_async_service_check(service *svc, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time){ 00488 icinga_macros mac; 00489 char *raw_command=NULL; 00490 char *processed_command=NULL; 00491 struct timeval start_time,end_time; 00492 pid_t pid=0; 00493 int fork_error=FALSE; 00494 int wait_result=0; 00495 host *temp_host=NULL; 00496 int pclose_result=0; 00497 mode_t new_umask=077; 00498 mode_t old_umask; 00499 char *output_file=NULL; 00500 double old_latency=0.0; 00501 dbuf checkresult_dbuf; 00502 int dbuf_chunk=1024; 00503 #ifdef USE_EVENT_BROKER 00504 int neb_result=OK; 00505 #endif 00506 #ifdef EMBEDDEDPERL 00507 char fname[512]=""; 00508 char *args[5]={"",DO_CLEAN, "", "", NULL }; 00509 char *perl_plugin_output=NULL; 00510 char *temp_buffer=NULL; 00511 char *args3=NULL; 00512 SV *plugin_hndlr_cr=NULL; /* perl.h holds typedef struct */ 00513 int count; 00514 int use_epn=FALSE; 00515 #ifdef aTHX 00516 dTHX; 00517 #endif 00518 dSP; 00519 #endif 00520 00521 log_debug_info(DEBUGL_FUNCTIONS,0,"run_async_service_check()\n"); 00522 00523 /* make sure we have something */ 00524 if(svc==NULL) 00525 return ERROR; 00526 00527 /* is the service check viable at this time? 
*/ 00528 if(check_service_check_viability(svc,check_options,time_is_valid,preferred_time)==ERROR) 00529 return ERROR; 00530 00531 /* find the host associated with this service */ 00532 if((temp_host=svc->host_ptr)==NULL) 00533 return ERROR; 00534 00535 /******** GOOD TO GO FOR A REAL SERVICE CHECK AT THIS POINT ********/ 00536 00537 #ifdef USE_EVENT_BROKER 00538 /* initialize start/end times */ 00539 start_time.tv_sec=0L; 00540 start_time.tv_usec=0L; 00541 end_time.tv_sec=0L; 00542 end_time.tv_usec=0L; 00543 00544 /* send data to event broker */ 00545 neb_result=broker_service_check(NEBTYPE_SERVICECHECK_ASYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,svc,SERVICE_CHECK_ACTIVE,start_time,end_time,svc->service_check_command,svc->latency,0.0,0,FALSE,0,NULL,NULL); 00546 00547 /* neb module wants to cancel the service check - the check will be rescheduled for a later time by the scheduling logic */ 00548 if(neb_result==NEBERROR_CALLBACKCANCEL){ 00549 if(preferred_time) 00550 *preferred_time+=(svc->check_interval*interval_length); 00551 return ERROR; 00552 } 00553 00554 /* neb module wants to override (or cancel) the service check - perhaps it will check the service itself */ 00555 /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! 
*/ 00556 /* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */ 00557 if(neb_result==NEBERROR_CALLBACKOVERRIDE) 00558 return OK; 00559 #endif 00560 00561 00562 log_debug_info(DEBUGL_CHECKS,0,"Checking service '%s' on host '%s'...\n",svc->description,svc->host_name); 00563 00564 /* clear check options - we don't want old check options retained */ 00565 /* only clear check options for scheduled checks - ondemand checks shouldn't affected retained check options */ 00566 if(scheduled_check==TRUE) 00567 svc->check_options=CHECK_OPTION_NONE; 00568 00569 /* update latency for macros, event broker, save old value for later */ 00570 old_latency=svc->latency; 00571 svc->latency=latency; 00572 00573 /* grab the host and service macro variables */ 00574 memset(&mac, 0, sizeof(mac)); 00575 grab_host_macros_r(&mac, temp_host); 00576 grab_service_macros_r(&mac, svc); 00577 00578 /* get the raw command line */ 00579 get_raw_command_line_r(&mac, svc->check_command_ptr,svc->service_check_command,&raw_command,0); 00580 if(raw_command==NULL){ 00581 clear_volatile_macros_r(&mac); 00582 log_debug_info(DEBUGL_CHECKS,0,"Raw check command for service '%s' on host '%s' was NULL - aborting.\n",svc->description,svc->host_name); 00583 if(preferred_time) 00584 *preferred_time+=(svc->check_interval*interval_length); 00585 svc->latency=old_latency; 00586 return ERROR; 00587 } 00588 00589 /* process any macros contained in the argument */ 00590 process_macros_r(&mac, raw_command,&processed_command,0); 00591 if(processed_command==NULL){ 00592 clear_volatile_macros_r(&mac); 00593 log_debug_info(DEBUGL_CHECKS,0,"Processed check command for service '%s' on host '%s' was NULL - aborting.\n",svc->description,svc->host_name); 00594 if(preferred_time) 00595 *preferred_time+=(svc->check_interval*interval_length); 00596 svc->latency=old_latency; 00597 my_free(raw_command); 00598 return ERROR; 00599 } 00600 00601 /* get the command start 
time */ 00602 gettimeofday(&start_time,NULL); 00603 00604 /* increment number of service checks that are currently running... */ 00605 currently_running_service_checks++; 00606 00607 /* set the execution flag */ 00608 svc->is_executing=TRUE; 00609 00610 /* start save check info */ 00611 check_result_info.object_check_type=SERVICE_CHECK; 00612 check_result_info.check_type=SERVICE_CHECK_ACTIVE; 00613 check_result_info.check_options=check_options; 00614 check_result_info.scheduled_check=scheduled_check; 00615 check_result_info.reschedule_check=reschedule_check; 00616 check_result_info.start_time=start_time; 00617 check_result_info.finish_time=start_time; 00618 check_result_info.early_timeout=FALSE; 00619 check_result_info.exited_ok=TRUE; 00620 check_result_info.return_code=STATE_OK; 00621 check_result_info.output=NULL; 00622 00623 #ifdef USE_EVENT_BROKER 00624 /* send data to event broker */ 00625 neb_result=broker_service_check(NEBTYPE_SERVICECHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,svc,SERVICE_CHECK_ACTIVE,start_time,end_time,svc->service_check_command,svc->latency,0.0,service_check_timeout,FALSE,0,processed_command,NULL); 00626 00627 my_free(svc->processed_command); 00628 svc->processed_command=strdup(processed_command); 00629 00630 /* neb module wants to override the service check - perhaps it will check the service itself */ 00631 if(neb_result==NEBERROR_CALLBACKOVERRIDE){ 00632 clear_volatile_macros_r(&mac); 00633 svc->latency=old_latency; 00634 my_free(processed_command); 00635 my_free(raw_command); 00636 return OK; 00637 } 00638 #endif 00639 00640 /* open a temp file for storing check output */ 00641 old_umask=umask(new_umask); 00642 dummy=asprintf(&output_file,"%s/checkXXXXXX",temp_path); 00643 check_result_info.output_file_fd=mkstemp(output_file); 00644 if(check_result_info.output_file_fd>=0) 00645 check_result_info.output_file_fp=fdopen(check_result_info.output_file_fd,"w"); 00646 else{ 00647 check_result_info.output_file_fp=NULL; 00648 
check_result_info.output_file_fd=-1; 00649 } 00650 umask(old_umask); 00651 00652 log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Check result output will be written to '%s' (fd=%d)\n",output_file,check_result_info.output_file_fd); 00653 00654 00655 /* finish save check info */ 00656 check_result_info.host_name=(char *)strdup(svc->host_name); 00657 check_result_info.service_description=(char *)strdup(svc->description); 00658 check_result_info.output_file=(check_result_info.output_file_fd<0 || output_file==NULL)?NULL:strdup(output_file); 00659 00660 /* free memory */ 00661 my_free(output_file); 00662 00663 /* write start of check result file */ 00664 /* if things go really bad later on down the line, the user will at least have a partial file to help debug missing output results */ 00665 if(check_result_info.output_file_fp){ 00666 00667 fprintf(check_result_info.output_file_fp,"### Active Check Result File ###\n"); 00668 fprintf(check_result_info.output_file_fp,"file_time=%lu\n",(unsigned long)check_result_info.start_time.tv_sec); 00669 fprintf(check_result_info.output_file_fp,"\n"); 00670 00671 fprintf(check_result_info.output_file_fp,"### Icinga Service Check Result ###\n"); 00672 fprintf(check_result_info.output_file_fp,"# Time: %s",ctime(&check_result_info.start_time.tv_sec)); 00673 fprintf(check_result_info.output_file_fp,"host_name=%s\n",check_result_info.host_name); 00674 fprintf(check_result_info.output_file_fp,"service_description=%s\n",check_result_info.service_description); 00675 fprintf(check_result_info.output_file_fp,"check_type=%d\n",check_result_info.check_type); 00676 fprintf(check_result_info.output_file_fp,"check_options=%d\n",check_result_info.check_options); 00677 fprintf(check_result_info.output_file_fp,"scheduled_check=%d\n",check_result_info.scheduled_check); 00678 fprintf(check_result_info.output_file_fp,"reschedule_check=%d\n",check_result_info.reschedule_check); 00679 fprintf(check_result_info.output_file_fp,"latency=%f\n",svc->latency); 00680 
fprintf(check_result_info.output_file_fp,"start_time=%lu.%lu\n",check_result_info.start_time.tv_sec,check_result_info.start_time.tv_usec); 00681 00682 /* flush output or it'll get written again when we fork() */ 00683 fflush(check_result_info.output_file_fp); 00684 } 00685 00686 /* initialize dynamic buffer for storing plugin output */ 00687 dbuf_init(&checkresult_dbuf,dbuf_chunk); 00688 00689 00690 /* reset latency (permanent value will be set later) */ 00691 svc->latency=old_latency; 00692 00693 /* update check statistics */ 00694 update_check_stats((scheduled_check==TRUE)?ACTIVE_SCHEDULED_SERVICE_CHECK_STATS:ACTIVE_ONDEMAND_SERVICE_CHECK_STATS,start_time.tv_sec); 00695 00696 #ifdef EMBEDDEDPERL 00697 00698 /* get"filename" component of command */ 00699 strncpy(fname,processed_command,strcspn(processed_command," ")); 00700 fname[strcspn(processed_command," ")]='\x0'; 00701 00702 /* should we use the embedded Perl interpreter to run this script? */ 00703 use_epn=file_uses_embedded_perl(fname); 00704 00705 /* if yes, do some initialization */ 00706 if(use_epn==TRUE){ 00707 00708 log_debug_info(DEBUGL_CHECKS,1,"** Using Embedded Perl interpreter to run service check...\n"); 00709 00710 args[0]=fname; 00711 args[2]=""; 00712 00713 if(strchr(processed_command,' ')==NULL){ 00714 args[3]=""; 00715 } else { 00716 /* make sure to strip leading whitespaces from args */ 00717 args3=processed_command+strlen(fname)+1; 00718 for (;isspace(*args3);args3++); 00719 args[3]=args3; 00720 } 00721 00722 ENTER; 00723 SAVETMPS; 00724 PUSHMARK(SP); 00725 XPUSHs(sv_2mortal(newSVpv(args[0],0))); 00726 XPUSHs(sv_2mortal(newSVpv(args[1],0))); 00727 XPUSHs(sv_2mortal(newSVpv(args[2],0))); 00728 XPUSHs(sv_2mortal(newSVpv(args[3],0))); 00729 PUTBACK; 00730 00731 /* call our perl interpreter to compile and optionally cache the command */ 00732 00733 call_pv("Embed::Persistent::eval_file", G_SCALAR | G_EVAL); 00734 00735 SPAGAIN ; 00736 00737 if( SvTRUE(ERRSV) ){ 00738 00739 /* 00740 * if 
SvTRUE(ERRSV) 00741 * write failure to IPC pipe 00742 * return 00743 */ 00744 00745 /* remove the top element of the Perl stack (undef) */ 00746 (void) POPs ; 00747 00748 pclose_result=STATE_UNKNOWN; 00749 perl_plugin_output=SvPVX(ERRSV); 00750 00751 log_debug_info(DEBUGL_CHECKS,0,"Embedded Perl failed to compile %s, compile error %s - skipping plugin\n",fname,perl_plugin_output); 00752 00753 /* save plugin output */ 00754 if(perl_plugin_output!=NULL){ 00755 temp_buffer=escape_newlines(perl_plugin_output); 00756 dbuf_strcat(&checkresult_dbuf,temp_buffer); 00757 my_free(temp_buffer); 00758 } 00759 00760 /* get the check finish time */ 00761 gettimeofday(&end_time,NULL); 00762 00763 /* record check result info */ 00764 check_result_info.exited_ok=FALSE; 00765 check_result_info.return_code=pclose_result; 00766 check_result_info.finish_time=end_time; 00767 00768 /* write check result to file */ 00769 if(check_result_info.output_file_fp){ 00770 00771 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec); 00772 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout); 00773 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok); 00774 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code); 00775 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf); 00776 00777 /* close the temp file */ 00778 fclose(check_result_info.output_file_fp); 00779 00780 /* move check result to queue directory */ 00781 move_check_result_to_queue(check_result_info.output_file); 00782 } 00783 00784 /* free memory */ 00785 dbuf_free(&checkresult_dbuf); 00786 00787 /* free check result memory */ 00788 free_check_result(&check_result_info); 00789 00790 return OK; 00791 } 00792 else{ 00793 00794 plugin_hndlr_cr=newSVsv(POPs); 00795 00796 
log_debug_info(DEBUGL_CHECKS,1,"Embedded Perl successfully compiled %s and returned code ref to plugin handler\n",fname); 00797 00798 PUTBACK ; 00799 FREETMPS ; 00800 LEAVE ; 00801 } 00802 } 00803 #endif 00804 00805 /* plugin is a C plugin or a Perl plugin _without_ compilation errors */ 00806 00807 /* fork a child process */ 00808 pid=fork(); 00809 00810 /* an error occurred while trying to fork */ 00811 if(pid==-1){ 00812 00813 fork_error=TRUE; 00814 00815 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'. The check will be rescheduled.\n",svc->description,svc->host_name,strerror(errno)); 00816 00817 log_debug_info(DEBUGL_CHECKS,0,"Check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'!\n",svc->description,svc->host_name,strerror(errno)); 00818 } 00819 00820 /* if we are in the child process... */ 00821 else if(pid==0){ 00822 00823 /* set environment variables */ 00824 set_all_macro_environment_vars_r(&mac, TRUE); 00825 00826 /* ADDED 11/12/07 EG */ 00827 /* close external command file and shut down worker thread */ 00828 close_command_file(); 00829 00830 /* fork again if we're not in a large installation */ 00831 if(child_processes_fork_twice==TRUE){ 00832 00833 /* fork again... */ 00834 pid=fork(); 00835 00836 /* an error occurred while trying to fork again */ 00837 if(pid==-1) 00838 exit(STATE_UNKNOWN); 00839 } 00840 00841 /* the grandchild (or child if large install tweaks are enabled) process should run the service check... 
*/ 00842 if(pid==0 || child_processes_fork_twice==FALSE){ 00843 00844 /* reset signal handling */ 00845 reset_sighandler(); 00846 00847 /* become the process group leader */ 00848 setpgid(0,0); 00849 00850 /* catch term signals at this process level */ 00851 signal(SIGTERM,service_check_sighandler); 00852 00853 /* catch plugins that don't finish in a timely manner */ 00854 signal(SIGALRM,service_check_sighandler); 00855 alarm(service_check_timeout); 00856 00857 /* disable rotation of the debug file */ 00858 max_debug_file_size=0L; 00859 00860 /******** BEGIN EMBEDDED PERL INTERPRETER EXECUTION ********/ 00861 #ifdef EMBEDDEDPERL 00862 if(use_epn==TRUE){ 00863 00864 /* execute our previously compiled script - from call_pv("Embed::Persistent::eval_file",..) */ 00865 /* NB. args[2] is _now_ a code ref (to the Perl subroutine corresp to the plugin) returned by eval_file() */ 00866 00867 ENTER; 00868 SAVETMPS; 00869 PUSHMARK(SP); 00870 00871 XPUSHs(sv_2mortal(newSVpv(args[0],0))); 00872 XPUSHs(sv_2mortal(newSVpv(args[1],0))); 00873 XPUSHs(plugin_hndlr_cr); 00874 XPUSHs(sv_2mortal(newSVpv(args[3],0))); 00875 00876 PUTBACK; 00877 00878 count=call_pv("Embed::Persistent::run_package", G_ARRAY); 00879 00880 SPAGAIN; 00881 00882 perl_plugin_output = POPpx ; 00883 pclose_result = POPi ; 00884 00885 /* NOTE: 07/16/07 This has to be done before FREETMPS statement below, or the POPpx pointer will be invalid (Hendrik B.) 
*/ 00886 /* get perl plugin output - escape newlines */ 00887 if(perl_plugin_output!=NULL){ 00888 temp_buffer=escape_newlines(perl_plugin_output); 00889 dbuf_strcat(&checkresult_dbuf,temp_buffer); 00890 my_free(temp_buffer); 00891 } 00892 00893 PUTBACK; 00894 FREETMPS; 00895 LEAVE; 00896 00897 log_debug_info(DEBUGL_CHECKS,1,"Embedded Perl ran %s: return code=%d, plugin output=%s\n",fname,pclose_result,(perl_plugin_output==NULL)?"NULL":checkresult_dbuf.buf); 00898 00899 /* reset the alarm */ 00900 alarm(0); 00901 00902 /* get the check finish time */ 00903 gettimeofday(&end_time,NULL); 00904 00905 /* record check result info */ 00906 check_result_info.return_code=pclose_result; 00907 check_result_info.finish_time=end_time; 00908 00909 /* write check result to file */ 00910 if(check_result_info.output_file_fp){ 00911 00912 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec); 00913 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout); 00914 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok); 00915 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code); 00916 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf); 00917 00918 /* close the temp file */ 00919 fclose(check_result_info.output_file_fp); 00920 00921 /* move check result to queue directory */ 00922 move_check_result_to_queue(check_result_info.output_file); 00923 } 00924 00925 /* free memory */ 00926 dbuf_free(&checkresult_dbuf); 00927 00928 /* free check result memory */ 00929 free_check_result(&check_result_info); 00930 00931 /* return with plugin exit status - not really necessary... 
*/ 00932 _exit(pclose_result); 00933 } 00934 #endif 00935 /******** END EMBEDDED PERL INTERPRETER EXECUTION ********/ 00936 00937 00938 /* run the plugin check command */ 00939 pclose_result=run_check(processed_command,&checkresult_dbuf); 00940 00941 /* reset the alarm */ 00942 alarm(0); 00943 00944 /* get the check finish time */ 00945 gettimeofday(&end_time,NULL); 00946 00947 /* record check result info */ 00948 check_result_info.finish_time=end_time; 00949 check_result_info.early_timeout=FALSE; 00950 00951 /* test for execution error */ 00952 if(pclose_result==-1){ 00953 pclose_result=STATE_UNKNOWN; 00954 check_result_info.return_code=STATE_CRITICAL; 00955 check_result_info.exited_ok=FALSE; 00956 } 00957 else{ 00958 if(WEXITSTATUS(pclose_result)==0 && WIFSIGNALED(pclose_result)) 00959 check_result_info.return_code=128+WTERMSIG(pclose_result); 00960 else 00961 check_result_info.return_code=WEXITSTATUS(pclose_result); 00962 } 00963 00964 /* write check result to file */ 00965 if(check_result_info.output_file_fp){ 00966 00967 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec); 00968 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout); 00969 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok); 00970 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code); 00971 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf); 00972 00973 /* close the temp file */ 00974 fclose(check_result_info.output_file_fp); 00975 00976 /* move check result to queue directory */ 00977 move_check_result_to_queue(check_result_info.output_file); 00978 } 00979 00980 /* free memory */ 00981 dbuf_free(&checkresult_dbuf); 00982 my_free(raw_command); 00983 my_free(processed_command); 00984 00985 /* free check result memory */ 00986 
free_check_result(&check_result_info); 00987 00988 /* return with plugin exit status - not really necessary... */ 00989 _exit(pclose_result); 00990 } 00991 00992 /* NOTE: this code is never reached if large install tweaks are enabled... */ 00993 00994 /* unset environment variables */ 00995 set_all_macro_environment_vars_r(&mac, FALSE); 00996 00997 /* free allocated memory */ 00998 /* this needs to be done last, so we don't free memory for variables before they're used above */ 00999 if(free_child_process_memory==TRUE) 01000 free_memory(&mac); 01001 01002 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */ 01003 _exit(STATE_OK); 01004 } 01005 01006 /* else the parent should wait for the first child to return... */ 01007 else if(pid>0){ 01008 clear_volatile_macros_r(&mac); 01009 01010 log_debug_info(DEBUGL_CHECKS,2,"Service check is executing in child process (pid=%lu)\n",(unsigned long)pid); 01011 01012 /* parent should close output file */ 01013 if(check_result_info.output_file_fp) 01014 fclose(check_result_info.output_file_fp); 01015 01016 /* should this be done in first child process (after spawning grandchild) as well? */ 01017 /* free memory allocated for IPC functionality */ 01018 free_check_result(&check_result_info); 01019 01020 /* free memory */ 01021 my_free(raw_command); 01022 my_free(processed_command); 01023 01024 /* wait for the first child to return */ 01025 /* don't do this if large install tweaks are enabled - we'll clean up children in event loop */ 01026 if(child_processes_fork_twice==TRUE) 01027 wait_result=waitpid(pid,NULL,0); 01028 } 01029 01030 /* see if we were able to run the check... 
*/ 01031 if(fork_error==TRUE) 01032 return ERROR; 01033 01034 return OK; 01035 } 01036 01037 01038 01039 /* handles asynchronous service check results */ 01040 int handle_async_service_check_result(service *temp_service, check_result *queued_check_result){ 01041 host *temp_host=NULL; 01042 time_t next_service_check=0L; 01043 time_t preferred_time=0L; 01044 time_t next_valid_time=0L; 01045 int reschedule_check=FALSE; 01046 int state_change=FALSE; 01047 int hard_state_change=FALSE; 01048 int first_host_check_initiated=FALSE; 01049 int route_result=HOST_UP; 01050 time_t current_time=0L; 01051 int state_was_logged=FALSE; 01052 char *old_plugin_output=NULL; 01053 char *temp_plugin_output=NULL; 01054 char *temp_ptr=NULL; 01055 servicedependency *temp_dependency=NULL; 01056 objectlist *check_servicelist=NULL; 01057 objectlist *servicelist_item=NULL; 01058 service *master_service=NULL; 01059 int run_async_check=TRUE; 01060 int state_changes_use_cached_state=TRUE; /* TODO - 09/23/07 move this to a global variable */ 01061 int flapping_check_done=FALSE; 01062 void *ptr=NULL; 01063 01064 01065 log_debug_info(DEBUGL_FUNCTIONS,0,"handle_async_service_check_result()\n"); 01066 01067 /* make sure we have what we need */ 01068 if(temp_service==NULL || queued_check_result==NULL) 01069 return ERROR; 01070 01071 /* get the current time */ 01072 time(&current_time); 01073 01074 log_debug_info(DEBUGL_CHECKS,0,"** Handling check result for service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name); 01075 log_debug_info(DEBUGL_CHECKS,1,"HOST: %s, SERVICE: %s, CHECK TYPE: %s, OPTIONS: %d, SCHEDULED: %s, RESCHEDULE: %s, EXITED OK: %s, RETURN CODE: %d, OUTPUT:
%s\n",temp_service->host_name,temp_service->description,(queued_check_result->check_type==SERVICE_CHECK_ACTIVE)?"Active":"Passive",queued_check_result->check_options,(queued_check_result->scheduled_check==TRUE)?"Yes":"No",(queued_check_result->reschedule_check==TRUE)?"Yes":"No",(queued_check_result->exited_ok==TRUE)?"Yes":"No",queued_check_result->return_code,queued_check_result->output); 01076 01077 /* decrement the number of service checks still out there... */ 01078 if(queued_check_result->check_type==SERVICE_CHECK_ACTIVE && currently_running_service_checks>0) 01079 currently_running_service_checks--; 01080 01081 /* skip this service check results if its passive and we aren't accepting passive check results */ 01082 if(queued_check_result->check_type==SERVICE_CHECK_PASSIVE){ 01083 if(accept_passive_service_checks==FALSE){ 01084 log_debug_info(DEBUGL_CHECKS,0,"Discarding passive service check result because passive service checks are disabled globally.\n"); 01085 return ERROR; 01086 } 01087 if(temp_service->accept_passive_service_checks==FALSE){ 01088 log_debug_info(DEBUGL_CHECKS,0,"Discarding passive service check result because passive checks are disabled for this service.\n"); 01089 return ERROR; 01090 } 01091 } 01092 01093 /* clear the freshening flag (it would have been set if this service was determined to be stale) */ 01094 if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) 01095 temp_service->is_being_freshened=FALSE; 01096 01097 /* clear the execution flag if this was an active check */ 01098 if(queued_check_result->check_type==SERVICE_CHECK_ACTIVE) 01099 temp_service->is_executing=FALSE; 01100 01101 /* DISCARD INVALID FRESHNESS CHECK RESULTS */ 01102 /* If a services goes stale, Icinga will initiate a forced check in order to freshen it. There is a race condition whereby a passive check 01103 could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. 
This would 01104 make the service fresh again, so we do a quick check to make sure the service is still stale before we accept the check result. */ 01105 if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_service_result_fresh(temp_service,current_time,FALSE)==TRUE){ 01106 log_debug_info(DEBUGL_CHECKS,0,"Discarding service freshness check result because the service is currently fresh (race condition avoided).\n"); 01107 return OK; 01108 } 01109 01110 /* check latency is passed to us */ 01111 temp_service->latency=queued_check_result->latency; 01112 01113 /* update the execution time for this check (millisecond resolution) */ 01114 temp_service->execution_time=(double)((double)(queued_check_result->finish_time.tv_sec-queued_check_result->start_time.tv_sec)+(double)((queued_check_result->finish_time.tv_usec-queued_check_result->start_time.tv_usec)/1000.0)/1000.0); 01115 if(temp_service->execution_time<0.0) 01116 temp_service->execution_time=0.0; 01117 01118 /* get the last check time */ 01119 temp_service->last_check=queued_check_result->start_time.tv_sec; 01120 01121 /* was this check passive or active? */ 01122 temp_service->check_type=(queued_check_result->check_type==SERVICE_CHECK_ACTIVE)?SERVICE_CHECK_ACTIVE:SERVICE_CHECK_PASSIVE; 01123 01124 /* update check statistics for passive checks */ 01125 if(queued_check_result->check_type==SERVICE_CHECK_PASSIVE) 01126 update_check_stats(PASSIVE_SERVICE_CHECK_STATS,queued_check_result->start_time.tv_sec); 01127 01128 /* should we reschedule the next service check? NOTE: This may be overridden later... 
*/ 01129 reschedule_check=queued_check_result->reschedule_check; 01130 01131 /* save the old service status info */ 01132 temp_service->last_state=temp_service->current_state; 01133 01134 /* save old plugin output */ 01135 if(temp_service->plugin_output) 01136 old_plugin_output=(char *)strdup(temp_service->plugin_output); 01137 01138 /* clear the old plugin output and perf data buffers */ 01139 my_free(temp_service->plugin_output); 01140 my_free(temp_service->long_plugin_output); 01141 my_free(temp_service->perf_data); 01142 01143 /* if there was some error running the command, just skip it (this shouldn't be happening) */ 01144 if(queued_check_result->exited_ok==FALSE){ 01145 01146 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of service '%s' on host '%s' did not exit properly!\n",temp_service->description,temp_service->host_name); 01147 01148 temp_service->plugin_output=(char *)strdup("(Service check did not exit properly)"); 01149 01150 temp_service->current_state=STATE_CRITICAL; 01151 } 01152 01153 /* make sure the return code is within bounds */ 01154 else if(queued_check_result->return_code<0 || queued_check_result->return_code>3){ 01155 01156 if ( queued_check_result->return_code==126 ) { 01157 dummy=asprintf(&temp_service->plugin_output,"The command defined for service %s is not an executable\n", queued_check_result->service_description); 01158 } else if ( queued_check_result->return_code==127 ) { 01159 dummy=asprintf(&temp_service->plugin_output,"The command defined for service %s does not exist\n", queued_check_result->service_description); 01160 } else { 01161 dummy=asprintf(&temp_service->plugin_output, "Return code of %d is out of bounds", queued_check_result->return_code); 01162 } 01163 logit(NSLOG_RUNTIME_WARNING,TRUE,"%s",temp_service->plugin_output); 01164 01165 temp_service->current_state=STATE_CRITICAL; 01166 } 01167 01168 /* else the return code is okay... 
*/ 01169 else{ 01170 01171 /* parse check output to get: (1) short output, (2) long output, (3) perf data */ 01172 parse_check_output(queued_check_result->output,&temp_service->plugin_output,&temp_service->long_plugin_output,&temp_service->perf_data,TRUE,TRUE); 01173 01174 /* make sure the plugin output isn't null */ 01175 if(temp_service->plugin_output==NULL) 01176 temp_service->plugin_output=(char *)strdup("(No output returned from plugin)"); 01177 01178 /* replace semicolons in plugin output (but not performance data) with colons */ 01179 else if((temp_ptr=temp_service->plugin_output)){ 01180 while((temp_ptr=strchr(temp_ptr,';'))) 01181 *temp_ptr=':'; 01182 } 01183 01184 log_debug_info(DEBUGL_CHECKS,2,"Parsing check output...\n"); 01185 log_debug_info(DEBUGL_CHECKS,2,"Short Output: %s\n",(temp_service->plugin_output==NULL)?"NULL":temp_service->plugin_output); 01186 log_debug_info(DEBUGL_CHECKS,2,"Long Output: %s\n",(temp_service->long_plugin_output==NULL)?"NULL":temp_service->long_plugin_output); 01187 log_debug_info(DEBUGL_CHECKS,2,"Perf Data: %s\n",(temp_service->perf_data==NULL)?"NULL":temp_service->perf_data); 01188 01189 /* grab the return code */ 01190 temp_service->current_state=queued_check_result->return_code; 01191 } 01192 01193 01194 /* record the last state time */ 01195 switch(temp_service->current_state){ 01196 case STATE_OK: 01197 temp_service->last_time_ok=temp_service->last_check; 01198 break; 01199 case STATE_WARNING: 01200 temp_service->last_time_warning=temp_service->last_check; 01201 break; 01202 case STATE_UNKNOWN: 01203 temp_service->last_time_unknown=temp_service->last_check; 01204 break; 01205 case STATE_CRITICAL: 01206 temp_service->last_time_critical=temp_service->last_check; 01207 break; 01208 default: 01209 break; 01210 } 01211 01212 /* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */ 01213 if(temp_service->check_type==SERVICE_CHECK_PASSIVE){ 01214 
if(log_passive_checks==TRUE) 01215 logit(NSLOG_PASSIVE_CHECK,FALSE,"PASSIVE SERVICE CHECK: %s;%s;%d;%s\n",temp_service->host_name,temp_service->description,temp_service->current_state,temp_service->plugin_output); 01216 } 01217 01218 /* get the host that this service runs on */ 01219 temp_host=(host *)temp_service->host_ptr; 01220 01221 /* if the service check was okay... */ 01222 if(temp_service->current_state==STATE_OK){ 01223 01224 /* if the host has never been checked before, verify its status */ 01225 /* only do this if 1) the initial state was set to non-UP or 2) the host is not scheduled to be checked soon (next 5 minutes) */ 01226 if(temp_host->has_been_checked==FALSE && (temp_host->initial_state!=HOST_UP || (unsigned long)temp_host->next_check==0L || (unsigned long)(temp_host->next_check-current_time)>300)){ 01227 01228 /* set a flag to remember that we launched a check */ 01229 first_host_check_initiated=TRUE; 01230 01231 /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */ 01232 /* previous logic was to simply run a sync (serial) host check */ 01233 /* do NOT allow cached check results to happen here - we need the host to be checked for real... 
*/ 01234 if(use_aggressive_host_checking==TRUE) 01235 perform_on_demand_host_check(temp_host,NULL,CHECK_OPTION_NONE,FALSE,0L); 01236 else 01237 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 01238 } 01239 } 01240 01241 01242 /**** NOTE - THIS WAS MOVED UP FROM LINE 1049 BELOW TO FIX PROBLEMS WHERE CURRENT ATTEMPT VALUE WAS ACTUALLY "LEADING" REAL VALUE ****/ 01243 /* increment the current attempt number if this is a soft state (service was rechecked) */ 01244 if(temp_service->state_type==SOFT_STATE && (temp_service->current_attempt < temp_service->max_attempts)) 01245 temp_service->current_attempt=temp_service->current_attempt+1; 01246 01247 01248 log_debug_info(DEBUGL_CHECKS,2,"ST: %s CA: %d MA: %d CS: %d LS: %d LHS: %d\n",(temp_service->state_type==SOFT_STATE)?"SOFT":"HARD",temp_service->current_attempt,temp_service->max_attempts,temp_service->current_state,temp_service->last_state,temp_service->last_hard_state); 01249 01250 /* check for a state change (either soft or hard) */ 01251 if(temp_service->current_state!=temp_service->last_state){ 01252 log_debug_info(DEBUGL_CHECKS,2,"Service has changed state since last check!\n"); 01253 state_change=TRUE; 01254 } 01255 01256 /* checks for a hard state change where host was down at last service check */ 01257 /* this occurs in the case where host goes down and service current attempt gets reset to 1 */ 01258 /* if this check is not made, the service recovery looks like a soft recovery instead of a hard one */ 01259 if(temp_service->host_problem_at_last_check==TRUE && temp_service->current_state==STATE_OK){ 01260 log_debug_info(DEBUGL_CHECKS,2,"Service had a HARD STATE CHANGE!!\n"); 01261 hard_state_change=TRUE; 01262 } 01263 01264 /* check for a "normal" hard state change where max check attempts is reached */ 01265 if(temp_service->current_attempt>=temp_service->max_attempts && temp_service->current_state!=temp_service->last_hard_state){ 01266 log_debug_info(DEBUGL_CHECKS,2,"Service 
had a HARD STATE CHANGE!!\n"); 01267 hard_state_change=TRUE; 01268 } 01269 01270 /* a state change occurred... */ 01271 /* reset last and next notification times and acknowledgement flag if necessary, misc other stuff */ 01272 if(state_change==TRUE || hard_state_change==TRUE){ 01273 01274 /* reschedule the service check */ 01275 reschedule_check=TRUE; 01276 01277 /* reset notification times */ 01278 temp_service->last_notification=(time_t)0; 01279 temp_service->next_notification=(time_t)0; 01280 01281 /* reset notification suppression option */ 01282 temp_service->no_more_notifications=FALSE; 01283 01284 if(temp_service->acknowledgement_type==ACKNOWLEDGEMENT_NORMAL){ 01285 01286 temp_service->problem_has_been_acknowledged=FALSE; 01287 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE; 01288 01289 /* remove any non-persistant comments associated with the ack */ 01290 delete_service_acknowledgement_comments(temp_service); 01291 } 01292 else if(temp_service->acknowledgement_type==ACKNOWLEDGEMENT_STICKY && temp_service->current_state==STATE_OK){ 01293 01294 temp_service->problem_has_been_acknowledged=FALSE; 01295 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE; 01296 01297 /* remove any non-persistant comments associated with the ack */ 01298 delete_service_acknowledgement_comments(temp_service); 01299 } 01300 01301 /* do NOT reset current notification number!!! 
*/ 01302 /* hard changes between non-OK states should continue to be escalated, so don't reset current notification number */ 01303 /*temp_service->current_notification_number=0;*/ 01304 } 01305 01306 /* initialize the last host and service state change times if necessary */ 01307 if(temp_service->last_state_change==(time_t)0) 01308 temp_service->last_state_change=temp_service->last_check; 01309 if(temp_service->last_hard_state_change==(time_t)0) 01310 temp_service->last_hard_state_change=temp_service->last_check; 01311 if(temp_host->last_state_change==(time_t)0) 01312 temp_host->last_state_change=temp_service->last_check; 01313 if(temp_host->last_hard_state_change==(time_t)0) 01314 temp_host->last_hard_state_change=temp_service->last_check; 01315 01316 /* update last service state change times */ 01317 if(state_change==TRUE) 01318 temp_service->last_state_change=temp_service->last_check; 01319 if(hard_state_change==TRUE) 01320 temp_service->last_hard_state_change=temp_service->last_check; 01321 01322 /* update the event and problem ids */ 01323 if(state_change==TRUE){ 01324 01325 /* always update the event id on a state change */ 01326 temp_service->last_event_id=temp_service->current_event_id; 01327 temp_service->current_event_id=next_event_id; 01328 next_event_id++; 01329 01330 /* update the problem id when transitioning to a problem state */ 01331 if(temp_service->last_state==STATE_OK){ 01332 /* don't reset last problem id, or it will be zero the next time a problem is encountered */ 01333 /* temp_service->last_problem_id=temp_service->current_problem_id;*/ 01334 temp_service->current_problem_id=next_problem_id; 01335 next_problem_id++; 01336 } 01337 01338 /* clear the problem id when transitioning from a problem state to an OK state */ 01339 if(temp_service->current_state==STATE_OK){ 01340 temp_service->last_problem_id=temp_service->current_problem_id; 01341 temp_service->current_problem_id=0L; 01342 } 01343 } 01344 01345 01346 
/**************************************/ 01347 /******* SERVICE CHECK OK LOGIC *******/ 01348 /**************************************/ 01349 01350 /* if the service is up and running OK... */ 01351 if(temp_service->current_state==STATE_OK){ 01352 01353 log_debug_info(DEBUGL_CHECKS,1,"Service is OK.\n"); 01354 01355 /* reset the acknowledgement flag (this should already have been done, but just in case...) */ 01356 temp_service->problem_has_been_acknowledged=FALSE; 01357 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE; 01358 01359 /* verify the route to the host and send out host recovery notifications */ 01360 if(temp_host->current_state!=HOST_UP){ 01361 01362 log_debug_info(DEBUGL_CHECKS,1,"Host is NOT UP, so we'll check it to see if it recovered...\n"); 01363 01364 /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */ 01365 /* previous logic was to simply run a sync (serial) host check */ 01366 if(use_aggressive_host_checking==TRUE) 01367 perform_on_demand_host_check(temp_host,NULL,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon); 01368 /* 09/23/07 EG don't launch a new host check if we already did so earlier */ 01369 else if(first_host_check_initiated==TRUE) 01370 log_debug_info(DEBUGL_CHECKS,1,"First host check was already initiated, so we'll skip a new host check.\n"); 01371 else{ 01372 /* can we use the last cached host state? 
*/ 01373 /* usually only use cached host state if no service state change has occurred */ 01374 if((state_change==FALSE || state_changes_use_cached_state==TRUE) && temp_host->has_been_checked==TRUE && ((current_time-temp_host->last_check) <= cached_host_check_horizon)){ 01375 log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",temp_host->current_state); 01376 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time); 01377 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time); 01378 } 01379 01380 /* else launch an async (parallel) check of the host */ 01381 else 01382 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 01383 } 01384 } 01385 01386 /* if a hard service recovery has occurred... */ 01387 if(hard_state_change==TRUE){ 01388 01389 log_debug_info(DEBUGL_CHECKS,1,"Service experienced a HARD RECOVERY.\n"); 01390 01391 /* set the state type macro */ 01392 temp_service->state_type=HARD_STATE; 01393 01394 /* log the service recovery */ 01395 log_service_event(temp_service); 01396 state_was_logged=TRUE; 01397 01398 /* 10/04/07 check to see if the service and/or associate host is flapping */ 01399 /* this should be done before a notification is sent out to ensure the host didn't just start flapping */ 01400 check_for_service_flapping(temp_service,TRUE,TRUE); 01401 check_for_host_flapping(temp_host,TRUE,FALSE,TRUE); 01402 flapping_check_done=TRUE; 01403 01404 /* notify contacts about the service recovery */ 01405 service_notification(temp_service,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE); 01406 01407 /* run the service event handler to handle the hard state change */ 01408 handle_service_event(temp_service); 01409 } 01410 01411 /* else if a soft service recovery has occurred... 
*/ 01412 else if(state_change==TRUE){ 01413 01414 log_debug_info(DEBUGL_CHECKS,1,"Service experienced a SOFT RECOVERY.\n"); 01415 01416 /* this is a soft recovery */ 01417 temp_service->state_type=SOFT_STATE; 01418 01419 /* log the soft recovery */ 01420 log_service_event(temp_service); 01421 state_was_logged=TRUE; 01422 01423 /* run the service event handler to handle the soft state change */ 01424 handle_service_event(temp_service); 01425 } 01426 01427 /* else no service state change has occurred... */ 01428 else{ 01429 log_debug_info(DEBUGL_CHECKS,1,"Service did not change state.\n"); 01430 } 01431 01432 /* should we obsessive over service checks? */ 01433 if(obsess_over_services==TRUE) 01434 obsessive_compulsive_service_check_processor(temp_service); 01435 01436 /* reset all service variables because its okay now... */ 01437 temp_service->host_problem_at_last_check=FALSE; 01438 temp_service->current_attempt=1; 01439 temp_service->state_type=HARD_STATE; 01440 temp_service->last_hard_state=STATE_OK; 01441 temp_service->last_notification=(time_t)0; 01442 temp_service->next_notification=(time_t)0; 01443 temp_service->current_notification_number=0; 01444 #ifdef USE_ST_BASED_ESCAL_RANGES 01445 temp_service->current_warning_notification_number=0; 01446 temp_service->current_critical_notification_number=0; 01447 temp_service->current_unknown_notification_number=0; 01448 #endif 01449 temp_service->problem_has_been_acknowledged=FALSE; 01450 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE; 01451 temp_service->notified_on_unknown=FALSE; 01452 temp_service->notified_on_warning=FALSE; 01453 temp_service->notified_on_critical=FALSE; 01454 temp_service->no_more_notifications=FALSE; 01455 01456 if(reschedule_check==TRUE) 01457 next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length)); 01458 } 01459 01460 01461 /*******************************************/ 01462 /******* SERVICE CHECK PROBLEM LOGIC *******/ 01463 
/*******************************************/ 01464 01465 /* hey, something's not working quite like it should... */ 01466 else{ 01467 01468 log_debug_info(DEBUGL_CHECKS,1,"Service is in a non-OK state!\n"); 01469 01470 /* check the route to the host if its up right now... */ 01471 if(temp_host->current_state==HOST_UP){ 01472 01473 log_debug_info(DEBUGL_CHECKS,1,"Host is currently UP, so we'll recheck its state to make sure...\n"); 01474 01475 /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */ 01476 /* previous logic was to simply run a sync (serial) host check */ 01477 if(use_aggressive_host_checking==TRUE) 01478 perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon); 01479 else{ 01480 /* can we use the last cached host state? */ 01481 /* only use cached host state if no service state change has occurred */ 01482 if((state_change==FALSE || state_changes_use_cached_state==TRUE) && temp_host->has_been_checked==TRUE && ((current_time-temp_host->last_check) <= cached_host_check_horizon)){ 01483 /* use current host state as route result */ 01484 route_result=temp_host->current_state; 01485 log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",temp_host->current_state); 01486 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time); 01487 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time); 01488 } 01489 01490 /* else launch an async (parallel) check of the host */ 01491 /* CHANGED 02/15/08 only if service changed state since service was last checked */ 01492 else if(state_change==TRUE){ 01493 /* use current host state as route result */ 01494 route_result=temp_host->current_state; 01495 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 01496 } 01497 01498 /* ADDED 02/15/08 */ 01499 /* else assume same host state */ 01500 else{ 01501 route_result=temp_host->current_state; 01502 
log_debug_info(DEBUGL_CHECKS,1,"* Using last known host state: %d\n",temp_host->current_state); 01503 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time); 01504 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time); 01505 } 01506 } 01507 } 01508 01509 /* else the host is either down or unreachable, so recheck it if necessary */ 01510 else{ 01511 01512 log_debug_info(DEBUGL_CHECKS,1,"Host is currently DOWN/UNREACHABLE.\n"); 01513 01514 /* we're using aggressive host checking, so really do recheck the host... */ 01515 if(use_aggressive_host_checking==TRUE){ 01516 log_debug_info(DEBUGL_CHECKS,1,"Agressive host checking is enabled, so we'll recheck the host state...\n"); 01517 perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon); 01518 } 01519 01520 /* the service wobbled between non-OK states, so check the host... */ 01521 else if((state_change==TRUE && state_changes_use_cached_state==FALSE) && temp_service->last_hard_state!=STATE_OK){ 01522 log_debug_info(DEBUGL_CHECKS,1,"Service wobbled between non-OK states, so we'll recheck the host state...\n"); 01523 /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */ 01524 /* previous logic was to simply run a sync (serial) host check */ 01525 /* use current host state as route result */ 01526 route_result=temp_host->current_state; 01527 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 01528 /*perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);*/ 01529 } 01530 01531 /* else fake the host check, but (possibly) resend host notifications to contacts... 
*/ 01532 else{ 01533 01534 log_debug_info(DEBUGL_CHECKS,1,"Assuming host is in same state as before...\n"); 01535 01536 /* if the host has never been checked before, set the checked flag and last check time */ 01537 /* 03/11/06 EG Note: This probably never evaluates to FALSE, present for historical reasons only, can probably be removed in the future */ 01538 if(temp_host->has_been_checked==FALSE){ 01539 temp_host->has_been_checked=TRUE; 01540 temp_host->last_check=temp_service->last_check; 01541 } 01542 01543 /* fake the route check result */ 01544 route_result=temp_host->current_state; 01545 01546 /* possibly re-send host notifications... */ 01547 host_notification(temp_host,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE); 01548 } 01549 } 01550 01551 /* if the host is down or unreachable ... */ 01552 /* 05/29/2007 NOTE: The host might be in a SOFT problem state due to host check retries/caching. Not sure if we should take that into account and do something different or not... */ 01553 if(route_result!=HOST_UP){ 01554 01555 log_debug_info(DEBUGL_CHECKS,2,"Host is not UP, so we mark state changes if appropriate\n"); 01556 01557 /* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... 
*/ 01558 if(temp_service->last_hard_state!=temp_service->current_state) 01559 hard_state_change=TRUE; 01560 01561 /* update last state change times */ 01562 if(state_change==TRUE || hard_state_change==TRUE) 01563 temp_service->last_state_change=temp_service->last_check; 01564 if(hard_state_change==TRUE) { 01565 temp_service->last_hard_state_change=temp_service->last_check; 01566 temp_service->state_type=HARD_STATE; 01567 temp_service->last_hard_state=temp_service->current_state; 01568 } 01569 01570 /* put service into a hard state without attempting check retries and don't send out notifications about it */ 01571 temp_service->host_problem_at_last_check=TRUE; 01572 temp_service->state_type=HARD_STATE; 01573 temp_service->last_hard_state=temp_service->current_state; 01574 temp_service->current_attempt=1; 01575 } 01576 01577 /* the host is up - it recovered since the last time the service was checked... */ 01578 else if(temp_service->host_problem_at_last_check==TRUE){ 01579 01580 /* next time the service is checked we shouldn't get into this same case... */ 01581 temp_service->host_problem_at_last_check=FALSE; 01582 01583 /* reset the current check counter, so we give the service a chance */ 01584 /* this helps prevent the case where service has N max check attempts, N-1 of which have already occurred. */ 01585 /* if we didn't do this, the next check might fail and result in a hard problem - we should really give it more time */ 01586 /* ADDED IF STATEMENT 01-17-05 EG */ 01587 /* 01-17-05: Services in hard problem states before hosts went down would sometimes come back as soft problem states after */ 01588 /* the hosts recovered. 
This caused problems, so hopefully this will fix it */ 01589 if(temp_service->state_type==SOFT_STATE) 01590 temp_service->current_attempt=1; 01591 } 01592 01593 log_debug_info(DEBUGL_CHECKS,1,"Current/Max Attempt(s): %d/%d\n",temp_service->current_attempt,temp_service->max_attempts); 01594 01595 /* if we should retry the service check, do so (except it the host is down or unreachable!) */ 01596 if(temp_service->current_attempt < temp_service->max_attempts){ 01597 01598 /* the host is down or unreachable, so don't attempt to retry the service check */ 01599 if(route_result!=HOST_UP){ 01600 01601 log_debug_info(DEBUGL_CHECKS,1,"Host isn't UP, so we won't retry the service check...\n"); 01602 01603 /* the host is not up, so reschedule the next service check at regular interval */ 01604 if(reschedule_check==TRUE) 01605 next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length)); 01606 01607 /* log the problem as a hard state if the host just went down */ 01608 if(hard_state_change==TRUE){ 01609 log_service_event(temp_service); 01610 state_was_logged=TRUE; 01611 01612 /* run the service event handler to handle the hard state */ 01613 handle_service_event(temp_service); 01614 } 01615 } 01616 01617 /* the host is up, so continue to retry the service check */ 01618 else{ 01619 01620 log_debug_info(DEBUGL_CHECKS,1,"Host is UP, so we'll retry the service check...\n"); 01621 01622 /* this is a soft state */ 01623 if (temp_service->current_attempt < temp_service->max_attempts) { 01624 temp_service->state_type=SOFT_STATE; 01625 } 01626 01627 /* log the service check retry */ 01628 log_service_event(temp_service); 01629 state_was_logged=TRUE; 01630 01631 /* run the service event handler to handle the soft state */ 01632 handle_service_event(temp_service); 01633 01634 if(reschedule_check==TRUE) 01635 next_service_check=(time_t)(temp_service->last_check+(temp_service->retry_interval*interval_length)); 01636 } 01637 01638 /* perform 
dependency checks on the second to last check of the service */ 01639 if(enable_predictive_service_dependency_checks==TRUE && temp_service->current_attempt==(temp_service->max_attempts-1)){ 01640 01641 log_debug_info(DEBUGL_CHECKS,1,"Looking for services to check for predictive dependency checks...\n"); 01642 01643 /* check services that THIS ONE depends on for notification AND execution */ 01644 /* we do this because we might be sending out a notification soon and we want the dependency logic to be accurate */ 01645 for(temp_dependency=get_first_servicedependency_by_dependent_service(temp_service->host_name,temp_service->description,&ptr);temp_dependency!=NULL;temp_dependency=get_next_servicedependency_by_dependent_service(temp_service->host_name,temp_service->description,&ptr)){ 01646 if(temp_dependency->dependent_service_ptr==temp_service && temp_dependency->master_service_ptr!=NULL){ 01647 master_service=(service *)temp_dependency->master_service_ptr; 01648 log_debug_info(DEBUGL_CHECKS,2,"Predictive check of service '%s' on host '%s' queued.\n",master_service->description,master_service->host_name); 01649 add_object_to_objectlist(&check_servicelist,(void *)master_service); 01650 } 01651 } 01652 } 01653 } 01654 01655 01656 /* we've reached the maximum number of service rechecks, so handle the error */ 01657 else{ 01658 01659 log_debug_info(DEBUGL_CHECKS,1,"Service has reached max number of rechecks, so we'll handle the error...\n"); 01660 01661 /* this is a hard state */ 01662 temp_service->state_type=HARD_STATE; 01663 01664 /* if we've hard a hard state change... 
*/ 01665 if(hard_state_change==TRUE){ 01666 01667 /* log the service problem (even if host is not up, which is new in 0.0.5) */ 01668 log_service_event(temp_service); 01669 state_was_logged=TRUE; 01670 } 01671 01672 /* else log the problem (again) if this service is flagged as being volatile */ 01673 else if(temp_service->is_volatile!=FALSE){ 01674 log_service_event(temp_service); 01675 state_was_logged=TRUE; 01676 } 01677 01678 /* check for start of flexible (non-fixed) scheduled downtime if we just had a hard/soft error */ 01679 /* 2011-02-21 MF: we need to check for both, state_change (SOFT) and hard_state_change (HARD) values */ 01680 if((hard_state_change==TRUE || state_change==TRUE) && temp_service->pending_flex_downtime>0) 01681 check_pending_flex_service_downtime(temp_service); 01682 01683 /* 10/04/07 check to see if the service and/or associate host is flapping */ 01684 /* this should be done before a notification is sent out to ensure the host didn't just start flapping */ 01685 check_for_service_flapping(temp_service,TRUE,TRUE); 01686 check_for_host_flapping(temp_host,TRUE,FALSE,TRUE); 01687 flapping_check_done=TRUE; 01688 01689 #ifdef USE_ST_BASED_ESCAL_RANGES 01690 if (hard_state_change==TRUE){ 01691 temp_service->current_warning_notification_number=0; 01692 temp_service->current_critical_notification_number=0; 01693 temp_service->current_unknown_notification_number=0; 01694 } 01695 #endif 01696 /* (re)send notifications out about this service problem if the host is up (and was at last check also) and the dependencies were okay... 
*/ 01697 service_notification(temp_service,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE); 01698 01699 /* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */ 01700 if(hard_state_change==TRUE || temp_service->is_volatile!=FALSE) 01701 handle_service_event(temp_service); 01702 01703 /* save the last hard state */ 01704 temp_service->last_hard_state=temp_service->current_state; 01705 01706 /* reschedule the next check at the regular interval */ 01707 if(reschedule_check==TRUE) 01708 next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length)); 01709 } 01710 01711 01712 /* should we obsessive over service checks? */ 01713 if(obsess_over_services==TRUE) 01714 obsessive_compulsive_service_check_processor(temp_service); 01715 } 01716 01717 /* reschedule the next service check ONLY for active, scheduled checks */ 01718 if(reschedule_check==TRUE){ 01719 01720 log_debug_info(DEBUGL_CHECKS,1,"Rescheduling next check of service at %s",ctime(&next_service_check)); 01721 01722 /* default is to reschedule service check unless a test below fails... */ 01723 temp_service->should_be_scheduled=TRUE; 01724 01725 /* next check time was calculated above */ 01726 temp_service->next_check=next_service_check; 01727 01728 /* make sure we don't get ourselves into too much trouble... 
*/ 01729 if(current_time>temp_service->next_check) 01730 temp_service->next_check=current_time; 01731 01732 /* make sure we rescheduled the next service check at a valid time */ 01733 preferred_time=temp_service->next_check; 01734 get_next_valid_time(preferred_time,&next_valid_time,temp_service->check_period_ptr); 01735 temp_service->next_check=next_valid_time; 01736 01737 /* services with non-recurring intervals do not get rescheduled */ 01738 if(temp_service->check_interval==0) 01739 temp_service->should_be_scheduled=FALSE; 01740 01741 /* services with active checks disabled do not get rescheduled */ 01742 if(temp_service->checks_enabled==FALSE) 01743 temp_service->should_be_scheduled=FALSE; 01744 01745 /* schedule a non-forced check if we can */ 01746 if(temp_service->should_be_scheduled==TRUE) 01747 schedule_service_check(temp_service,temp_service->next_check,CHECK_OPTION_NONE); 01748 } 01749 01750 /* if we're stalking this state type and state was not already logged AND the plugin output changed since last check, log it now.. */ 01751 if(temp_service->state_type==HARD_STATE && state_change==FALSE && state_was_logged==FALSE && compare_strings(old_plugin_output,temp_service->plugin_output)){ 01752 01753 if((temp_service->current_state==STATE_OK && temp_service->stalk_on_ok==TRUE)) { 01754 01755 log_service_event(temp_service); 01756 01757 /* should we run event handlers ? */ 01758 if (stalking_event_handlers_for_services==TRUE) 01759 handle_service_event(temp_service); 01760 01761 } else if((temp_service->current_state==STATE_WARNING && temp_service->stalk_on_warning==TRUE)) { 01762 01763 log_service_event(temp_service); 01764 01765 /* should we run event handlers ? */ 01766 if (stalking_event_handlers_for_services==TRUE) 01767 handle_service_event(temp_service); 01768 01769 } else if((temp_service->current_state==STATE_UNKNOWN && temp_service->stalk_on_unknown==TRUE)) { 01770 01771 log_service_event(temp_service); 01772 01773 /* should we run event handlers ? 
*/ 01774 if (stalking_event_handlers_for_services==TRUE) 01775 handle_service_event(temp_service); 01776 01777 } else if((temp_service->current_state==STATE_CRITICAL && temp_service->stalk_on_critical==TRUE)) { 01778 01779 log_service_event(temp_service); 01780 01781 /* should we run event handlers ? */ 01782 if (stalking_event_handlers_for_services==TRUE) 01783 handle_service_event(temp_service); 01784 01785 } 01786 } 01787 01788 #ifdef USE_EVENT_BROKER 01789 /* send data to event broker */ 01790 broker_service_check(NEBTYPE_SERVICECHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,temp_service,temp_service->check_type,queued_check_result->start_time,queued_check_result->finish_time,temp_service->service_check_command,temp_service->latency,temp_service->execution_time,service_check_timeout,queued_check_result->early_timeout,queued_check_result->return_code,temp_service->processed_command,NULL); 01791 #endif 01792 01793 /* set the checked flag */ 01794 temp_service->has_been_checked=TRUE; 01795 01796 /* update the current service status log */ 01797 update_service_status(temp_service,FALSE); 01798 01799 /* check to see if the service and/or associate host is flapping */ 01800 if(flapping_check_done==FALSE){ 01801 check_for_service_flapping(temp_service,TRUE,TRUE); 01802 check_for_host_flapping(temp_host,TRUE,FALSE,TRUE); 01803 } 01804 01805 /* update service performance info */ 01806 update_service_performance_data(temp_service); 01807 01808 /* free allocated memory */ 01809 my_free(temp_plugin_output); 01810 my_free(old_plugin_output); 01811 01812 01813 /* run async checks of all services we added above */ 01814 /* don't run a check if one is already executing or we can get by with a cached state */ 01815 for(servicelist_item=check_servicelist;servicelist_item!=NULL;servicelist_item=servicelist_item->next){ 01816 run_async_check=TRUE; 01817 temp_service=(service *)servicelist_item->object_ptr; 01818 01819 /* we can get by with a cached state, so don't check the service */ 
01820 if((current_time-temp_service->last_check)<=cached_service_check_horizon){ 01821 run_async_check=FALSE; 01822 01823 /* update check statistics */ 01824 update_check_stats(ACTIVE_CACHED_SERVICE_CHECK_STATS,current_time); 01825 } 01826 01827 if(temp_service->is_executing==TRUE) 01828 run_async_check=FALSE; 01829 01830 if(run_async_check==TRUE) 01831 run_async_service_check(temp_service,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 01832 } 01833 free_objectlist(&check_servicelist); 01834 01835 return OK; 01836 } 01837 01838 01839 01840 /* schedules an immediate or delayed service check */ 01841 void schedule_service_check(service *svc, time_t check_time, int options){ 01842 timed_event *temp_event=NULL; 01843 timed_event *new_event=NULL; 01844 int found=FALSE; 01845 int use_original_event=TRUE; 01846 01847 log_debug_info(DEBUGL_FUNCTIONS,0,"schedule_service_check()\n"); 01848 01849 if(svc==NULL) 01850 return; 01851 01852 log_debug_info(DEBUGL_CHECKS,0,"Scheduling a %s, active check of service '%s' on host '%s' @ %s",(options & CHECK_OPTION_FORCE_EXECUTION)?"forced":"non-forced",svc->description,svc->host_name,ctime(&check_time)); 01853 01854 /* don't schedule a check if active checks of this service are disabled */ 01855 if(svc->checks_enabled==FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)){ 01856 log_debug_info(DEBUGL_CHECKS,0,"Active checks of this service are disabled.\n"); 01857 return; 01858 } 01859 01860 /* allocate memory for a new event item */ 01861 new_event=(timed_event *)malloc(sizeof(timed_event)); 01862 if(new_event==NULL){ 01863 01864 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Could not reschedule check of service '%s' on host '%s'!\n",svc->description,svc->host_name); 01865 01866 return; 01867 } 01868 01869 /* default is to use the new event */ 01870 use_original_event=FALSE; 01871 found=FALSE; 01872 01873 #ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED 01874 /* WARNING! 
1/19/07 on-demand async service checks will end up causing mutliple scheduled checks of a service to appear in the queue if the code below is skipped */ 01875 /* if(use_large_installation_tweaks==FALSE)... skip code below */ 01876 #endif 01877 01878 /* see if there are any other scheduled checks of this service in the queue */ 01879 for(temp_event=event_list_low;temp_event!=NULL;temp_event=temp_event->next){ 01880 01881 if(temp_event->event_type==EVENT_SERVICE_CHECK && svc==(service *)temp_event->event_data){ 01882 found=TRUE; 01883 break; 01884 } 01885 } 01886 01887 /* we found another service check event for this service in the queue - what should we do? */ 01888 if(found==TRUE && temp_event!=NULL){ 01889 01890 log_debug_info(DEBUGL_CHECKS,2,"Found another service check event for this service @ %s",ctime(&temp_event->run_time)); 01891 01892 /* use the originally scheduled check unless we decide otherwise */ 01893 use_original_event=TRUE; 01894 01895 /* the original event is a forced check... */ 01896 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)){ 01897 01898 /* the new event is also forced and its execution time is earlier than the original, so use it instead */ 01899 if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)){ 01900 use_original_event=FALSE; 01901 log_debug_info(DEBUGL_CHECKS,2,"New service check event is forced and occurs before the existing event, so the new event will be used instead.\n"); 01902 } 01903 } 01904 01905 /* the original event is not a forced check... 
*/ 01906 else{ 01907 01908 /* the new event is a forced check, so use it instead */ 01909 if((options & CHECK_OPTION_FORCE_EXECUTION)){ 01910 use_original_event=FALSE; 01911 log_debug_info(DEBUGL_CHECKS,2,"New service check event is forced, so it will be used instead of the existing event.\n"); 01912 } 01913 01914 /* the new event is not forced either and its execution time is earlier than the original, so use it instead */ 01915 else if(check_time < temp_event->run_time){ 01916 use_original_event=FALSE; 01917 log_debug_info(DEBUGL_CHECKS,2,"New service check event occurs before the existing (older) event, so it will be used instead.\n"); 01918 } 01919 01920 /* the new event is older, so override the existing one */ 01921 else{ 01922 log_debug_info(DEBUGL_CHECKS,2,"New service check event occurs after the existing event, so we'll ignore it.\n"); 01923 } 01924 } 01925 01926 /* the originally queued event won the battle, so keep it */ 01927 if(use_original_event==TRUE){ 01928 my_free(new_event); 01929 } 01930 01931 /* else we're using the new event, so remove the old one */ 01932 else{ 01933 remove_event(temp_event,&event_list_low,&event_list_low_tail); 01934 my_free(temp_event); 01935 } 01936 } 01937 01938 /* save check options for retention purposes */ 01939 svc->check_options=options; 01940 01941 /* schedule a new event */ 01942 if(use_original_event==FALSE){ 01943 01944 log_debug_info(DEBUGL_CHECKS,2,"Scheduling new service check event.\n"); 01945 01946 /* set the next service check time */ 01947 svc->next_check=check_time; 01948 01949 /* place the new event in the event queue */ 01950 new_event->event_type=EVENT_SERVICE_CHECK; 01951 new_event->event_data=(void *)svc; 01952 new_event->event_args=(void *)NULL; 01953 new_event->event_options=options; 01954 new_event->run_time=svc->next_check; 01955 new_event->recurring=FALSE; 01956 new_event->event_interval=0L; 01957 new_event->timing_func=NULL; 01958 new_event->compensate_for_time_change=TRUE; 01959 
reschedule_event(new_event,&event_list_low,&event_list_low_tail); 01960 } 01961 01962 else{ 01963 /* reset the next check time (it may be out of sync) */ 01964 if(temp_event!=NULL) 01965 svc->next_check=temp_event->run_time; 01966 01967 log_debug_info(DEBUGL_CHECKS,2,"Keeping original service check event (ignoring the new one).\n"); 01968 } 01969 01970 return; 01971 } 01972 01973 01974 01975 /* checks viability of performing a service check */ 01976 int check_service_check_viability(service *svc, int check_options, int *time_is_valid, time_t *new_time){ 01977 int result=OK; 01978 int perform_check=TRUE; 01979 time_t current_time=0L; 01980 time_t preferred_time=0L; 01981 int check_interval=0; 01982 01983 log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_check_viability()\n"); 01984 01985 /* make sure we have a service */ 01986 if(svc==NULL) 01987 return ERROR; 01988 01989 /* get the check interval to use if we need to reschedule the check */ 01990 if(svc->state_type==SOFT_STATE && svc->current_state!=STATE_OK) 01991 check_interval=(svc->retry_interval*interval_length); 01992 else 01993 check_interval=(svc->check_interval*interval_length); 01994 01995 /* get the current time */ 01996 time(&current_time); 01997 01998 /* initialize the next preferred check time */ 01999 preferred_time=current_time; 02000 02001 /* can we check the host right now? */ 02002 if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)){ 02003 02004 /* if checks of the service are currently disabled... 
*/ 02005 if(svc->checks_enabled==FALSE){ 02006 preferred_time=current_time+check_interval; 02007 perform_check=FALSE; 02008 02009 log_debug_info(DEBUGL_CHECKS,2,"Active checks of the service are currently disabled.\n"); 02010 } 02011 02012 /* make sure this is a valid time to check the service */ 02013 if(check_time_against_period((unsigned long)current_time,svc->check_period_ptr)==ERROR){ 02014 preferred_time=current_time; 02015 if(time_is_valid) 02016 *time_is_valid=FALSE; 02017 perform_check=FALSE; 02018 02019 log_debug_info(DEBUGL_CHECKS,2,"This is not a valid time for this service to be actively checked.\n"); 02020 } 02021 02022 /* check service dependencies for execution */ 02023 if(check_service_dependencies(svc,EXECUTION_DEPENDENCY)==DEPENDENCIES_FAILED){ 02024 preferred_time=current_time+check_interval; 02025 perform_check=FALSE; 02026 02027 log_debug_info(DEBUGL_CHECKS,2,"Execution dependencies for this service failed, so it will not be actively checked.\n"); 02028 } 02029 } 02030 02031 /* pass back the next viable check time */ 02032 if(new_time) 02033 *new_time=preferred_time; 02034 02035 result=(perform_check==TRUE)?OK:ERROR; 02036 02037 return result; 02038 } 02039 02040 02041 02042 /* checks service dependencies */ 02043 int check_service_dependencies(service *svc,int dependency_type){ 02044 servicedependency *temp_dependency=NULL; 02045 service *temp_service=NULL; 02046 int state=STATE_OK; 02047 time_t current_time=0L; 02048 void *ptr=NULL; 02049 02050 02051 log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_dependencies()\n"); 02052 02053 /* check all dependencies... 
*/ 02054 for(temp_dependency=get_first_servicedependency_by_dependent_service(svc->host_name,svc->description,&ptr);temp_dependency!=NULL;temp_dependency=get_next_servicedependency_by_dependent_service(svc->host_name,svc->description,&ptr)){ 02055 02056 /* only check dependencies of the desired type (notification or execution) */ 02057 if(temp_dependency->dependency_type!=dependency_type) 02058 continue; 02059 02060 /* find the service we depend on... */ 02061 if((temp_service=temp_dependency->master_service_ptr)==NULL) 02062 continue; 02063 02064 /* skip this dependency if it has a timeperiod and the current time isn't valid */ 02065 time(&current_time); 02066 if(temp_dependency->dependency_period!=NULL && check_time_against_period(current_time,temp_dependency->dependency_period_ptr)==ERROR) 02067 return FALSE; 02068 02069 /* get the status to use (use last hard state if its currently in a soft state) */ 02070 if(temp_service->state_type==SOFT_STATE && soft_state_dependencies==FALSE) 02071 state=temp_service->last_hard_state; 02072 else 02073 state=temp_service->current_state; 02074 02075 /* is the service we depend on in state that fails the dependency tests? 
*/ 02076 if(state==STATE_OK && temp_dependency->fail_on_ok==TRUE) 02077 return DEPENDENCIES_FAILED; 02078 if(state==STATE_WARNING && temp_dependency->fail_on_warning==TRUE) 02079 return DEPENDENCIES_FAILED; 02080 if(state==STATE_UNKNOWN && temp_dependency->fail_on_unknown==TRUE) 02081 return DEPENDENCIES_FAILED; 02082 if(state==STATE_CRITICAL && temp_dependency->fail_on_critical==TRUE) 02083 return DEPENDENCIES_FAILED; 02084 if((state==STATE_OK && temp_service->has_been_checked==FALSE) && temp_dependency->fail_on_pending==TRUE) 02085 return DEPENDENCIES_FAILED; 02086 02087 /* immediate dependencies ok at this point - check parent dependencies if necessary */ 02088 if(temp_dependency->inherits_parent==TRUE){ 02089 if(check_service_dependencies(temp_service,dependency_type)!=DEPENDENCIES_OK) 02090 return DEPENDENCIES_FAILED; 02091 } 02092 } 02093 02094 return DEPENDENCIES_OK; 02095 } 02096 02097 02098 02099 /* check for services that never returned from a check... */ 02100 void check_for_orphaned_services(void){ 02101 service *temp_service=NULL; 02102 time_t current_time=0L; 02103 time_t expected_time=0L; 02104 02105 02106 log_debug_info(DEBUGL_FUNCTIONS,0,"check_for_orphaned_services()\n"); 02107 02108 /* get the current time */ 02109 time(&current_time); 02110 02111 /* check all services... */ 02112 for(temp_service=service_list;temp_service!=NULL;temp_service=temp_service->next){ 02113 02114 /* skip services that are not currently executing */ 02115 if(temp_service->is_executing==FALSE) 02116 continue; 02117 02118 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */ 02119 expected_time=(time_t)(temp_service->next_check+temp_service->latency+service_check_timeout+check_reaper_interval+600); 02120 02121 /* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... 
*/ 02122 if(expected_time<current_time){ 02123 02124 /* log a warning */ 02125 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back). I'm scheduling an immediate check of the service...\n",temp_service->description,temp_service->host_name); 02126 02127 log_debug_info(DEBUGL_CHECKS,1,"Service '%s' on host '%s' was orphaned, so we're scheduling an immediate check...\n",temp_service->description,temp_service->host_name); 02128 02129 /* decrement the number of running service checks */ 02130 if(currently_running_service_checks>0) 02131 currently_running_service_checks--; 02132 02133 /* disable the executing flag */ 02134 temp_service->is_executing=FALSE; 02135 02136 /* schedule an immediate check of the service */ 02137 schedule_service_check(temp_service,current_time,CHECK_OPTION_ORPHAN_CHECK); 02138 } 02139 02140 } 02141 02142 return; 02143 } 02144 02145 02146 02147 /* check freshness of service results */ 02148 void check_service_result_freshness(void){ 02149 service *temp_service=NULL; 02150 time_t current_time=0L; 02151 02152 02153 log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_result_freshness()\n"); 02154 log_debug_info(DEBUGL_CHECKS,1,"Checking the freshness of service check results...\n"); 02155 02156 /* bail out if we're not supposed to be checking freshness */ 02157 if(check_service_freshness==FALSE){ 02158 log_debug_info(DEBUGL_CHECKS,1,"Service freshness checking is disabled.\n"); 02159 return; 02160 } 02161 02162 /* get the current time */ 02163 time(&current_time); 02164 02165 /* check all services... 
*/ 02166 for(temp_service=service_list;temp_service!=NULL;temp_service=temp_service->next){ 02167 02168 /* skip services we shouldn't be checking for freshness */ 02169 if(temp_service->check_freshness==FALSE) 02170 continue; 02171 02172 /* skip services that are currently executing (problems here will be caught by orphaned service check) */ 02173 if(temp_service->is_executing==TRUE) 02174 continue; 02175 02176 /* skip services that have both active and passive checks disabled */ 02177 if(temp_service->checks_enabled==FALSE && temp_service->accept_passive_service_checks==FALSE) 02178 continue; 02179 02180 /* skip services that are already being freshened */ 02181 if(temp_service->is_being_freshened==TRUE) 02182 continue; 02183 02184 /* see if the time is right... */ 02185 if(check_time_against_period(current_time,temp_service->check_period_ptr)==ERROR) 02186 continue; 02187 02188 /* EXCEPTION */ 02189 /* don't check freshness of services without regular check intervals if we're using auto-freshness threshold */ 02190 if(temp_service->check_interval==0 && temp_service->freshness_threshold==0) 02191 continue; 02192 02193 /* the results for the last check of this service are stale! 
*/ 02194 if(is_service_result_fresh(temp_service,current_time,TRUE)==FALSE){ 02195 02196 /* set the freshen flag */ 02197 temp_service->is_being_freshened=TRUE; 02198 02199 /* schedule an immediate forced check of the service */ 02200 schedule_service_check(temp_service,current_time,CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK); 02201 } 02202 02203 } 02204 02205 return; 02206 } 02207 02208 02209 02210 /* tests whether or not a service's check results are fresh */ 02211 int is_service_result_fresh(service *temp_service, time_t current_time, int log_this){ 02212 int freshness_threshold=0; 02213 time_t expiration_time=0L; 02214 int days=0; 02215 int hours=0; 02216 int minutes=0; 02217 int seconds=0; 02218 int tdays=0; 02219 int thours=0; 02220 int tminutes=0; 02221 int tseconds=0; 02222 02223 log_debug_info(DEBUGL_CHECKS,2,"Checking freshness of service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name); 02224 02225 /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? 
*/ 02226 if(temp_service->freshness_threshold==0){ 02227 if(temp_service->state_type==HARD_STATE || temp_service->current_state==STATE_OK) 02228 freshness_threshold=(temp_service->check_interval*interval_length)+temp_service->latency+additional_freshness_latency; 02229 else 02230 freshness_threshold=(temp_service->retry_interval*interval_length)+temp_service->latency+additional_freshness_latency; 02231 } 02232 else 02233 freshness_threshold=temp_service->freshness_threshold; 02234 02235 log_debug_info(DEBUGL_CHECKS,2,"Freshness thresholds: service=%d, use=%d\n",temp_service->freshness_threshold,freshness_threshold); 02236 02237 /* calculate expiration time */ 02238 /* CHANGED 11/10/05 EG - program start is only used in expiration time calculation if > last check AND active checks are enabled, so active checks can become stale immediately upon program startup */ 02239 /* CHANGED 02/25/06 SG - passive checks also become stale, so remove dependence on active check logic */ 02240 if(temp_service->has_been_checked==FALSE) 02241 expiration_time=(time_t)(event_start+freshness_threshold); 02242 /* CHANGED 06/19/07 EG - Per Ton's suggestion (and user requests), only use program start time over last check if no specific threshold has been set by user. Otheriwse use it. Problems can occur if Icinga is restarted more frequently that freshness threshold intervals (services never go stale). */ 02243 /* CHANGED 10/07/07 EG - Only match next condition for services that have active checks enabled... 
*/ 02244 /* CHANGED 10/07/07 EG - Added max_service_check_spread to expiration time as suggested by Altinity */ 02245 else if(temp_service->checks_enabled==TRUE && event_start>temp_service->last_check && temp_service->freshness_threshold==0) 02246 expiration_time=(time_t)(event_start+freshness_threshold+(max_service_check_spread*interval_length)); 02247 else 02248 expiration_time=(time_t)(temp_service->last_check+freshness_threshold); 02249 02250 log_debug_info(DEBUGL_CHECKS,2,"HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n",temp_service->has_been_checked,(unsigned long)program_start,(unsigned long)event_start,(unsigned long)temp_service->last_check,(unsigned long)current_time,(unsigned long)expiration_time); 02251 02252 /* the results for the last check of this service are stale */ 02253 if(expiration_time<current_time){ 02254 02255 get_time_breakdown((current_time-expiration_time),&days,&hours,&minutes,&seconds); 02256 get_time_breakdown(freshness_threshold,&tdays,&thours,&tminutes,&tseconds); 02257 02258 /* log a warning */ 02259 if(log_this==TRUE) 02260 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The results of service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). I'm forcing an immediate check of the service.\n",temp_service->description,temp_service->host_name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds); 02261 02262 log_debug_info(DEBUGL_CHECKS,1,"Check results for service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). 
Forcing an immediate check of the service...\n",temp_service->description,temp_service->host_name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds); 02263 02264 return FALSE; 02265 } 02266 02267 log_debug_info(DEBUGL_CHECKS,1,"Check results for service '%s' on host '%s' are fresh.\n",temp_service->description,temp_service->host_name); 02268 02269 return TRUE; 02270 } 02271 02272 02273 02274 02275 /******************************************************************/ 02276 /*************** COMMON ROUTE/HOST CHECK FUNCTIONS ****************/ 02277 /******************************************************************/ 02278 02279 /* execute an on-demand check */ 02280 int perform_on_demand_host_check(host *hst, int *check_return_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){ 02281 02282 log_debug_info(DEBUGL_FUNCTIONS,0,"perform_on_demand_host_check()\n"); 02283 02284 perform_on_demand_host_check_3x(hst,check_return_code,check_options,use_cached_result,check_timestamp_horizon); 02285 02286 return OK; 02287 } 02288 02289 02290 02291 /* execute a scheduled host check using either the 2.x or 3.x logic */ 02292 int perform_scheduled_host_check(host *hst, int check_options, double latency){ 02293 02294 log_debug_info(DEBUGL_FUNCTIONS,0,"perform_scheduled_host_check()\n"); 02295 02296 run_scheduled_host_check_3x(hst,check_options,latency); 02297 02298 return OK; 02299 } 02300 02301 02302 02303 /* schedules an immediate or delayed host check */ 02304 void schedule_host_check(host *hst, time_t check_time, int options){ 02305 timed_event *temp_event=NULL; 02306 timed_event *new_event=NULL; 02307 int found=FALSE; 02308 int use_original_event=TRUE; 02309 02310 02311 log_debug_info(DEBUGL_FUNCTIONS,0,"schedule_host_check()\n"); 02312 02313 if(hst==NULL) 02314 return; 02315 02316 log_debug_info(DEBUGL_CHECKS,0,"Scheduling a %s, active check of host '%s' @ %s",(options & 
CHECK_OPTION_FORCE_EXECUTION)?"forced":"non-forced",hst->name,ctime(&check_time)); 02317 02318 /* don't schedule a check if active checks of this host are disabled */ 02319 if(hst->checks_enabled==FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)){ 02320 log_debug_info(DEBUGL_CHECKS,0,"Active checks are disabled for this host.\n"); 02321 return; 02322 } 02323 02324 /* allocate memory for a new event item */ 02325 if((new_event=(timed_event *)malloc(sizeof(timed_event)))==NULL){ 02326 02327 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Could not reschedule check of host '%s'!\n",hst->name); 02328 02329 return; 02330 } 02331 02332 /* default is to use the new event */ 02333 use_original_event=FALSE; 02334 found=FALSE; 02335 02336 #ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED 02337 /* WARNING! 1/19/07 on-demand async host checks will end up causing mutliple scheduled checks of a host to appear in the queue if the code below is skipped */ 02338 /* if(use_large_installation_tweaks==FALSE)... skip code below */ 02339 #endif 02340 02341 /* see if there are any other scheduled checks of this host in the queue */ 02342 for(temp_event=event_list_low;temp_event!=NULL;temp_event=temp_event->next){ 02343 if(temp_event->event_type==EVENT_HOST_CHECK && hst==(host *)temp_event->event_data){ 02344 found=TRUE; 02345 break; 02346 } 02347 } 02348 02349 /* we found another host check event for this host in the queue - what should we do? */ 02350 if(found==TRUE && temp_event!=NULL){ 02351 02352 log_debug_info(DEBUGL_CHECKS,2,"Found another host check event for this host @ %s",ctime(&temp_event->run_time)); 02353 02354 /* use the originally scheduled check unless we decide otherwise */ 02355 use_original_event=TRUE; 02356 02357 /* the original event is a forced check... 
*/ 02358 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)){ 02359 02360 /* the new event is also forced and its execution time is earlier than the original, so use it instead */ 02361 if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)){ 02362 log_debug_info(DEBUGL_CHECKS,2,"New host check event is forced and occurs before the existing event, so the new event be used instead.\n"); 02363 use_original_event=FALSE; 02364 } 02365 } 02366 02367 /* the original event is not a forced check... */ 02368 else{ 02369 02370 /* the new event is a forced check, so use it instead */ 02371 if((options & CHECK_OPTION_FORCE_EXECUTION)){ 02372 use_original_event=FALSE; 02373 log_debug_info(DEBUGL_CHECKS,2,"New host check event is forced, so it will be used instead of the existing event.\n"); 02374 } 02375 02376 /* the new event is not forced either and its execution time is earlier than the original, so use it instead */ 02377 else if(check_time < temp_event->run_time){ 02378 use_original_event=FALSE; 02379 log_debug_info(DEBUGL_CHECKS,2,"New host check event occurs before the existing (older) event, so it will be used instead.\n"); 02380 } 02381 02382 /* the new event is older, so override the existing one */ 02383 else{ 02384 log_debug_info(DEBUGL_CHECKS,2,"New host check event occurs after the existing event, so we'll ignore it.\n"); 02385 } 02386 } 02387 02388 /* the originally queued event won the battle, so keep it */ 02389 if(use_original_event==TRUE){ 02390 my_free(new_event); 02391 } 02392 02393 /* else use the new event, so remove the old */ 02394 else{ 02395 remove_event(temp_event,&event_list_low,&event_list_low_tail); 02396 my_free(temp_event); 02397 } 02398 } 02399 02400 /* save check options for retention purposes */ 02401 hst->check_options=options; 02402 02403 /* use the new event */ 02404 if(use_original_event==FALSE){ 02405 02406 log_debug_info(DEBUGL_CHECKS,2,"Scheduling new host check event.\n"); 02407 02408 /* set the 
next host check time */ 02409 hst->next_check=check_time; 02410 02411 /* place the new event in the event queue */ 02412 new_event->event_type=EVENT_HOST_CHECK; 02413 new_event->event_data=(void *)hst; 02414 new_event->event_args=(void *)NULL; 02415 new_event->event_options=options; 02416 new_event->run_time=hst->next_check; 02417 new_event->recurring=FALSE; 02418 new_event->event_interval=0L; 02419 new_event->timing_func=NULL; 02420 new_event->compensate_for_time_change=TRUE; 02421 reschedule_event(new_event,&event_list_low,&event_list_low_tail); 02422 } 02423 02424 else{ 02425 /* reset the next check time (it may be out of sync) */ 02426 if(temp_event!=NULL) 02427 hst->next_check=temp_event->run_time; 02428 02429 log_debug_info(DEBUGL_CHECKS,2,"Keeping original host check event (ignoring the new one).\n"); 02430 } 02431 02432 return; 02433 } 02434 02435 02436 02437 /* checks host dependencies */ 02438 int check_host_dependencies(host *hst,int dependency_type){ 02439 hostdependency *temp_dependency=NULL; 02440 host *temp_host=NULL; 02441 int state=HOST_UP; 02442 time_t current_time=0L; 02443 void *ptr=NULL; 02444 02445 02446 log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_dependencies()\n"); 02447 02448 /* check all dependencies... */ 02449 for(temp_dependency=get_first_hostdependency_by_dependent_host(hst->name,&ptr);temp_dependency!=NULL;temp_dependency=get_next_hostdependency_by_dependent_host(hst->name,&ptr)){ 02450 02451 /* only check dependencies of the desired type (notification or execution) */ 02452 if(temp_dependency->dependency_type!=dependency_type) 02453 continue; 02454 02455 /* find the host we depend on... 
*/ 02456 if((temp_host=temp_dependency->master_host_ptr)==NULL) 02457 continue; 02458 02459 /* skip this dependency if it has a timeperiod and the current time isn't valid */ 02460 time(¤t_time); 02461 if(temp_dependency->dependency_period!=NULL && check_time_against_period(current_time,temp_dependency->dependency_period_ptr)==ERROR) 02462 return FALSE; 02463 02464 /* get the status to use (use last hard state if its currently in a soft state) */ 02465 if(temp_host->state_type==SOFT_STATE && soft_state_dependencies==FALSE) 02466 state=temp_host->last_hard_state; 02467 else 02468 state=temp_host->current_state; 02469 02470 /* is the host we depend on in state that fails the dependency tests? */ 02471 if(state==HOST_UP && temp_dependency->fail_on_up==TRUE) 02472 return DEPENDENCIES_FAILED; 02473 if(state==HOST_DOWN && temp_dependency->fail_on_down==TRUE) 02474 return DEPENDENCIES_FAILED; 02475 if(state==HOST_UNREACHABLE && temp_dependency->fail_on_unreachable==TRUE) 02476 return DEPENDENCIES_FAILED; 02477 if((state==HOST_UP && temp_host->has_been_checked==FALSE) && temp_dependency->fail_on_pending==TRUE) 02478 return DEPENDENCIES_FAILED; 02479 02480 /* immediate dependencies ok at this point - check parent dependencies if necessary */ 02481 if(temp_dependency->inherits_parent==TRUE){ 02482 if(check_host_dependencies(temp_host,dependency_type)!=DEPENDENCIES_OK) 02483 return DEPENDENCIES_FAILED; 02484 } 02485 } 02486 02487 return DEPENDENCIES_OK; 02488 } 02489 02490 02491 02492 /* check for hosts that never returned from a check... */ 02493 void check_for_orphaned_hosts(void){ 02494 host *temp_host=NULL; 02495 time_t current_time=0L; 02496 time_t expected_time=0L; 02497 02498 02499 log_debug_info(DEBUGL_FUNCTIONS,0,"check_for_orphaned_hosts()\n"); 02500 02501 /* get the current time */ 02502 time(¤t_time); 02503 02504 /* check all hosts... 
*/ 02505 for(temp_host=host_list;temp_host!=NULL;temp_host=temp_host->next){ 02506 02507 /* skip hosts that don't have a set check interval (on-demand checks are missed by the orphan logic) */ 02508 if(temp_host->next_check==(time_t)0L) 02509 continue; 02510 02511 /* skip hosts that are not currently executing */ 02512 if(temp_host->is_executing==FALSE) 02513 continue; 02514 02515 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */ 02516 expected_time=(time_t)(temp_host->next_check+temp_host->latency+host_check_timeout+check_reaper_interval+600); 02517 02518 /* this host was supposed to have executed a while ago, but for some reason the results haven't come back in... */ 02519 if(expected_time<current_time){ 02520 02521 /* log a warning */ 02522 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of host '%s' looks like it was orphaned (results never came back). I'm scheduling an immediate check of the host...\n",temp_host->name); 02523 02524 log_debug_info(DEBUGL_CHECKS,1,"Host '%s' was orphaned, so we're scheduling an immediate check...\n",temp_host->name); 02525 02526 /* decrement the number of running host checks */ 02527 if(currently_running_host_checks>0) 02528 currently_running_host_checks--; 02529 02530 /* disable the executing flag */ 02531 temp_host->is_executing=FALSE; 02532 02533 /* schedule an immediate check of the host */ 02534 schedule_host_check(temp_host,current_time,CHECK_OPTION_ORPHAN_CHECK); 02535 } 02536 02537 } 02538 02539 return; 02540 } 02541 02542 02543 02544 /* check freshness of host results */ 02545 void check_host_result_freshness(void){ 02546 host *temp_host=NULL; 02547 time_t current_time=0L; 02548 02549 02550 log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_result_freshness()\n"); 02551 log_debug_info(DEBUGL_CHECKS,2,"Attempting to check the freshness of host check results...\n"); 02552 02553 /* bail out if we're not supposed to be checking freshness */ 02554 
if(check_host_freshness==FALSE){ 02555 log_debug_info(DEBUGL_CHECKS,2,"Host freshness checking is disabled.\n"); 02556 return; 02557 } 02558 02559 /* get the current time */ 02560 time(¤t_time); 02561 02562 /* check all hosts... */ 02563 for(temp_host=host_list;temp_host!=NULL;temp_host=temp_host->next){ 02564 02565 /* skip hosts we shouldn't be checking for freshness */ 02566 if(temp_host->check_freshness==FALSE) 02567 continue; 02568 02569 /* skip hosts that have both active and passive checks disabled */ 02570 if(temp_host->checks_enabled==FALSE && temp_host->accept_passive_host_checks==FALSE) 02571 continue; 02572 02573 /* skip hosts that are currently executing (problems here will be caught by orphaned host check) */ 02574 if(temp_host->is_executing==TRUE) 02575 continue; 02576 02577 /* skip hosts that are already being freshened */ 02578 if(temp_host->is_being_freshened==TRUE) 02579 continue; 02580 02581 /* see if the time is right... */ 02582 if(check_time_against_period(current_time,temp_host->check_period_ptr)==ERROR) 02583 continue; 02584 02585 /* the results for the last check of this host are stale */ 02586 if(is_host_result_fresh(temp_host,current_time,TRUE)==FALSE){ 02587 02588 /* set the freshen flag */ 02589 temp_host->is_being_freshened=TRUE; 02590 02591 /* schedule an immediate forced check of the host */ 02592 schedule_host_check(temp_host,current_time,CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK); 02593 } 02594 } 02595 02596 return; 02597 } 02598 02599 02600 02601 /* checks to see if a hosts's check results are fresh */ 02602 int is_host_result_fresh(host *temp_host, time_t current_time, int log_this){ 02603 time_t expiration_time=0L; 02604 int freshness_threshold=0; 02605 int days=0; 02606 int hours=0; 02607 int minutes=0; 02608 int seconds=0; 02609 int tdays=0; 02610 int thours=0; 02611 int tminutes=0; 02612 int tseconds=0; 02613 02614 log_debug_info(DEBUGL_CHECKS,2,"Checking freshness of host '%s'...\n",temp_host->name); 02615 
02616 /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */ 02617 if(temp_host->freshness_threshold==0) 02618 freshness_threshold=(temp_host->check_interval*interval_length)+temp_host->latency+additional_freshness_latency; 02619 else 02620 freshness_threshold=temp_host->freshness_threshold; 02621 02622 log_debug_info(DEBUGL_CHECKS,2,"Freshness thresholds: host=%d, use=%d\n",temp_host->freshness_threshold,freshness_threshold); 02623 02624 /* calculate expiration time */ 02625 /* CHANGED 11/10/05 EG - program start is only used in expiration time calculation if > last check AND active checks are enabled, so active checks can become stale immediately upon program startup */ 02626 if(temp_host->has_been_checked==FALSE) 02627 expiration_time=(time_t)(event_start+freshness_threshold); 02628 /* CHANGED 06/19/07 EG - Per Ton's suggestion (and user requests), only use program start time over last check if no specific threshold has been set by user. Otheriwse use it. Problems can occur if Icinga is restarted more frequently that freshness threshold intervals (hosts never go stale). 
*/ 02629 /* CHANGED 10/07/07 EG - Added max_host_check_spread to expiration time as suggested by Altinity */ 02630 else if(temp_host->checks_enabled==TRUE && event_start>temp_host->last_check && temp_host->freshness_threshold==0) 02631 expiration_time=(time_t)(event_start+freshness_threshold+(max_host_check_spread*interval_length)); 02632 else 02633 expiration_time=(time_t)(temp_host->last_check+freshness_threshold); 02634 02635 log_debug_info(DEBUGL_CHECKS,2,"HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n",temp_host->has_been_checked,(unsigned long)program_start,(unsigned long)event_start,(unsigned long)temp_host->last_check,(unsigned long)current_time,(unsigned long)expiration_time); 02636 02637 /* the results for the last check of this host are stale */ 02638 if(expiration_time<current_time){ 02639 02640 get_time_breakdown((current_time-expiration_time),&days,&hours,&minutes,&seconds); 02641 get_time_breakdown(freshness_threshold,&tdays,&thours,&tminutes,&tseconds); 02642 02643 /* log a warning */ 02644 if(log_this==TRUE) 02645 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The results of host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). I'm forcing an immediate check of the host.\n",temp_host->name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds); 02646 02647 log_debug_info(DEBUGL_CHECKS,1,"Check results for host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). 
Forcing an immediate check of the host...\n",temp_host->name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds); 02648 02649 return FALSE; 02650 } 02651 else 02652 log_debug_info(DEBUGL_CHECKS,1,"Check results for host '%s' are fresh.\n",temp_host->name); 02653 02654 return TRUE; 02655 } 02656 02657 02658 02659 /******************************************************************/ 02660 /************* Icinga 3.X ROUTE/HOST CHECK FUNCTIONS **************/ 02661 /******************************************************************/ 02662 02663 02664 /*** ON-DEMAND HOST CHECKS USE THIS FUNCTION ***/ 02665 /* check to see if we can reach the host */ 02666 int perform_on_demand_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){ 02667 int result=OK; 02668 02669 log_debug_info(DEBUGL_FUNCTIONS,0,"perform_on_demand_host_check_3x()\n"); 02670 02671 /* make sure we have a host */ 02672 if(hst==NULL) 02673 return ERROR; 02674 02675 log_debug_info(DEBUGL_CHECKS,0,"** On-demand check for host '%s'...\n",hst->name); 02676 02677 /* check the status of the host */ 02678 result=run_sync_host_check_3x(hst,check_result_code,check_options,use_cached_result,check_timestamp_horizon); 02679 02680 return result; 02681 } 02682 02683 02684 02685 /* perform a synchronous check of a host */ 02686 /* on-demand host checks will use this... 
*/ 02687 int run_sync_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){ 02688 int result=OK; 02689 time_t current_time=0L; 02690 int host_result=HOST_UP; 02691 char *old_plugin_output=NULL; 02692 struct timeval start_time; 02693 struct timeval end_time; 02694 02695 02696 log_debug_info(DEBUGL_FUNCTIONS,0,"run_sync_host_check_3x()\n"); 02697 02698 /* make sure we have a host */ 02699 if(hst==NULL) 02700 return ERROR; 02701 02702 log_debug_info(DEBUGL_CHECKS,0,"** Run sync check of host '%s'...\n",hst->name); 02703 02704 /* is the host check viable at this time? */ 02705 /* if not, return current state and bail out */ 02706 if(check_host_check_viability_3x(hst,check_options,NULL,NULL)==ERROR){ 02707 if(check_result_code) 02708 *check_result_code=hst->current_state; 02709 log_debug_info(DEBUGL_CHECKS,0,"Host check is not viable at this time.\n"); 02710 return OK; 02711 } 02712 02713 /* get the current time */ 02714 time(¤t_time); 02715 02716 /* high resolution start time for event broker */ 02717 gettimeofday(&start_time,NULL); 02718 02719 /* can we use the last cached host state? */ 02720 if(use_cached_result==TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)){ 02721 02722 /* we can used the cached result, so return it and get out of here... 
*/ 02723 if(hst->has_been_checked==TRUE && ((current_time-hst->last_check) <= check_timestamp_horizon)){ 02724 if(check_result_code) 02725 *check_result_code=hst->current_state; 02726 02727 log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",hst->current_state); 02728 02729 /* update check statistics */ 02730 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time); 02731 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time); 02732 02733 return OK; 02734 } 02735 } 02736 02737 02738 log_debug_info(DEBUGL_CHECKS,1,"* Running actual host check: old state=%d\n",hst->current_state); 02739 02740 02741 /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/ 02742 02743 /* update check statistics */ 02744 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time); 02745 update_check_stats(SERIAL_HOST_CHECK_STATS,start_time.tv_sec); 02746 02747 /* reset host check latency, since on-demand checks have none */ 02748 hst->latency=0.0; 02749 02750 /* adjust host check attempt */ 02751 adjust_host_check_attempt_3x(hst,TRUE); 02752 02753 /* save old host state */ 02754 hst->last_state=hst->current_state; 02755 if(hst->state_type==HARD_STATE) 02756 hst->last_hard_state=hst->current_state; 02757 02758 /* save old plugin output for state stalking */ 02759 if(hst->plugin_output) 02760 old_plugin_output=(char *)strdup(hst->plugin_output); 02761 02762 /* set the checked flag */ 02763 hst->has_been_checked=TRUE; 02764 02765 /* clear the freshness flag */ 02766 hst->is_being_freshened=FALSE; 02767 02768 /* clear check options - we don't want old check options retained */ 02769 hst->check_options=CHECK_OPTION_NONE; 02770 02771 /* set the check type */ 02772 hst->check_type=HOST_CHECK_ACTIVE; 02773 02774 02775 /*********** EXECUTE THE CHECK AND PROCESS THE RESULTS **********/ 02776 02777 #ifdef USE_EVENT_BROKER 02778 /* send data to event broker */ 02779 end_time.tv_sec=0L; 02780 end_time.tv_usec=0L; 02781 
broker_host_check(NEBTYPE_HOSTCHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL); 02782 #endif 02783 02784 /* execute the host check */ 02785 host_result=execute_sync_host_check_3x(hst); 02786 02787 /* process the host check result */ 02788 process_host_check_result_3x(hst,host_result,old_plugin_output,check_options,FALSE,use_cached_result,check_timestamp_horizon); 02789 02790 /* free memory */ 02791 my_free(old_plugin_output); 02792 02793 log_debug_info(DEBUGL_CHECKS,1,"* Sync host check done: new state=%d\n",hst->current_state); 02794 02795 /* high resolution end time for event broker */ 02796 gettimeofday(&end_time,NULL); 02797 02798 #ifdef USE_EVENT_BROKER 02799 /* send data to event broker */ 02800 broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,hst->execution_time,host_check_timeout,FALSE,hst->current_state,hst->processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL); 02801 #endif 02802 02803 return result; 02804 } 02805 02806 02807 02808 /* run an "alive" check on a host */ 02809 /* on-demand host checks will use this... 
*/ 02810 int execute_sync_host_check_3x(host *hst){ 02811 icinga_macros mac; 02812 int result=STATE_OK; 02813 int return_result=HOST_UP; 02814 char *processed_command=NULL; 02815 char *raw_command=NULL; 02816 struct timeval start_time; 02817 struct timeval end_time; 02818 char *temp_ptr; 02819 int early_timeout=FALSE; 02820 double exectime; 02821 char *temp_plugin_output=NULL; 02822 #ifdef USE_EVENT_BROKER 02823 int neb_result=OK; 02824 #endif 02825 02826 02827 log_debug_info(DEBUGL_FUNCTIONS,0,"execute_sync_host_check_3x()\n"); 02828 02829 if(hst==NULL) 02830 return HOST_DOWN; 02831 02832 log_debug_info(DEBUGL_CHECKS,0,"** Executing sync check of host '%s'...\n",hst->name); 02833 02834 #ifdef USE_EVENT_BROKER 02835 /* initialize start/end times */ 02836 start_time.tv_sec=0L; 02837 start_time.tv_usec=0L; 02838 end_time.tv_sec=0L; 02839 end_time.tv_usec=0L; 02840 02841 /* send data to event broker */ 02842 neb_result=broker_host_check(NEBTYPE_HOSTCHECK_SYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL); 02843 02844 /* neb module wants to cancel the host check - return the current state of the host */ 02845 if(neb_result==NEBERROR_CALLBACKCANCEL) 02846 return hst->current_state; 02847 02848 /* neb module wants to override the host check - perhaps it will check the host itself */ 02849 /* NOTE: if a module does this, it must check the status of the host and populate the data structures BEFORE it returns from the callback! 
*/ 02850 if(neb_result==NEBERROR_CALLBACKOVERRIDE) 02851 return hst->current_state; 02852 #endif 02853 02854 /* grab the host macros */ 02855 memset(&mac, 0, sizeof(mac)); 02856 grab_host_macros_r(&mac, hst); 02857 02858 /* high resolution start time for event broker */ 02859 gettimeofday(&start_time,NULL); 02860 02861 /* get the last host check time */ 02862 time(&hst->last_check); 02863 02864 /* get the raw command line */ 02865 get_raw_command_line_r(&mac, hst->check_command_ptr,hst->host_check_command,&raw_command,0); 02866 if(raw_command==NULL) { 02867 clear_volatile_macros_r(&mac); 02868 return ERROR; 02869 } 02870 02871 /* process any macros contained in the argument */ 02872 process_macros_r(&mac, raw_command,&processed_command,0); 02873 if(processed_command==NULL) { 02874 clear_volatile_macros_r(&mac); 02875 return ERROR; 02876 } 02877 02878 my_free(hst->processed_command); 02879 hst->processed_command=strdup(processed_command); 02880 02881 #ifdef USE_EVENT_BROKER 02882 /* send data to event broker */ 02883 end_time.tv_sec=0L; 02884 end_time.tv_usec=0L; 02885 broker_host_check(NEBTYPE_HOSTCHECK_RAW_START,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,return_result,hst->state_type,start_time,end_time,hst->host_check_command,0.0,0.0,host_check_timeout,early_timeout,result,processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL); 02886 #endif 02887 02888 log_debug_info(DEBUGL_COMMANDS,1,"Raw host check command: %s\n",raw_command); 02889 log_debug_info(DEBUGL_COMMANDS,0,"Processed host check ommand: %s\n",processed_command); 02890 02891 /* clear plugin output and performance data buffers */ 02892 my_free(hst->plugin_output); 02893 my_free(hst->long_plugin_output); 02894 my_free(hst->perf_data); 02895 02896 /* run the host check command */ 02897 result=my_system_r(&mac, processed_command,host_check_timeout,&early_timeout,&exectime,&temp_plugin_output,MAX_PLUGIN_OUTPUT_LENGTH); 02898 clear_volatile_macros_r(&mac); 02899 02900 /* if the 
check timed out, report an error */ 02901 if(early_timeout==TRUE){ 02902 02903 my_free(temp_plugin_output); 02904 dummy=asprintf(&temp_plugin_output,"Host check timed out after %d seconds\n",host_check_timeout); 02905 02906 /* log the timeout */ 02907 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Host check command '%s' for host '%s' timed out after %d seconds\n",processed_command,hst->name,host_check_timeout); 02908 } 02909 02910 /* calculate total execution time */ 02911 hst->execution_time=exectime; 02912 02913 /* record check type */ 02914 hst->check_type=HOST_CHECK_ACTIVE; 02915 02916 /* parse the output: short and long output, and perf data */ 02917 parse_check_output(temp_plugin_output,&hst->plugin_output,&hst->long_plugin_output,&hst->perf_data,TRUE,TRUE); 02918 02919 /* free memory */ 02920 my_free(temp_plugin_output); 02921 my_free(raw_command); 02922 my_free(processed_command); 02923 02924 /* a NULL host check command means we should assume the host is UP */ 02925 if(hst->host_check_command==NULL){ 02926 my_free(hst->plugin_output); 02927 hst->plugin_output=(char *)strdup("(Host assumed to be UP)"); 02928 result=STATE_OK; 02929 } 02930 02931 /* make sure we have some data */ 02932 if(hst->plugin_output==NULL || !strcmp(hst->plugin_output,"")){ 02933 my_free(hst->plugin_output); 02934 hst->plugin_output=(char *)strdup("(No output returned from host check)"); 02935 } 02936 02937 /* replace semicolons in plugin output (but not performance data) with colons */ 02938 if((temp_ptr=hst->plugin_output)){ 02939 while((temp_ptr=strchr(temp_ptr,';'))) 02940 *temp_ptr=':'; 02941 } 02942 02943 /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */ 02944 if(use_aggressive_host_checking==FALSE && result==STATE_WARNING) 02945 result=STATE_OK; 02946 02947 02948 if(result==STATE_OK) 02949 return_result=HOST_UP; 02950 else 02951 return_result=HOST_DOWN; 02952 02953 /* high resolution end time for event 
broker */ 02954 gettimeofday(&end_time,NULL); 02955 02956 #ifdef USE_EVENT_BROKER 02957 /* send data to event broker */ 02958 broker_host_check(NEBTYPE_HOSTCHECK_RAW_END,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,return_result,hst->state_type,start_time,end_time,hst->host_check_command,0.0,exectime,host_check_timeout,early_timeout,result,processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL); 02959 #endif 02960 02961 log_debug_info(DEBUGL_CHECKS,0,"** Sync host check done: state=%d\n",return_result); 02962 02963 return return_result; 02964 } 02965 02966 02967 02968 /* run a scheduled host check asynchronously */ 02969 int run_scheduled_host_check_3x(host *hst, int check_options, double latency){ 02970 int result=OK; 02971 time_t current_time=0L; 02972 time_t preferred_time=0L; 02973 time_t next_valid_time=0L; 02974 int time_is_valid=TRUE; 02975 02976 02977 log_debug_info(DEBUGL_FUNCTIONS,0,"run_scheduled_host_check_3x()\n"); 02978 02979 if(hst==NULL) 02980 return ERROR; 02981 02982 log_debug_info(DEBUGL_CHECKS,0,"Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n",hst->name,check_options,latency); 02983 02984 /* attempt to run the check */ 02985 result=run_async_host_check_3x(hst,check_options,latency,TRUE,TRUE,&time_is_valid,&preferred_time); 02986 02987 /* an error occurred, so reschedule the check */ 02988 if(result==ERROR){ 02989 02990 log_debug_info(DEBUGL_CHECKS,1,"Unable to run scheduled host check at this time\n"); 02991 02992 /* only attempt to (re)schedule checks that should get checked... 
*/ 02993 if(hst->should_be_scheduled==TRUE){ 02994 02995 /* get current time */ 02996 time(¤t_time); 02997 02998 /* determine next time we should check the host if needed */ 02999 /* if host has no check interval, schedule it again for 5 minutes from now */ 03000 if(current_time>=preferred_time) 03001 preferred_time=current_time+((hst->check_interval<=0)?300:(hst->check_interval*interval_length)); 03002 03003 /* make sure we rescheduled the next host check at a valid time */ 03004 get_next_valid_time(preferred_time,&next_valid_time,hst->check_period_ptr); 03005 03006 /* the host could not be rescheduled properly - set the next check time for next week */ 03007 if(time_is_valid==FALSE && next_valid_time==preferred_time){ 03008 03009 /* 03010 hst->next_check=(time_t)(next_valid_time+(60*60*24*365)); 03011 hst->should_be_scheduled=FALSE; 03012 */ 03013 03014 hst->next_check=(time_t)(next_valid_time+(60*60*24*7)); 03015 03016 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of host '%s' could not be rescheduled properly. Scheduling check for next week...\n",hst->name); 03017 03018 log_debug_info(DEBUGL_CHECKS,1,"Unable to find any valid times to reschedule the next host check!\n"); 03019 } 03020 03021 /* this service could be rescheduled... 
*/ 03022 else{ 03023 hst->next_check=next_valid_time; 03024 hst->should_be_scheduled=TRUE; 03025 03026 log_debug_info(DEBUGL_CHECKS,1,"Rescheduled next host check for %s",ctime(&next_valid_time)); 03027 } 03028 } 03029 03030 /* update the status log */ 03031 update_host_status(hst,FALSE); 03032 03033 /* reschedule the next host check - unless we couldn't find a valid next check time */ 03034 /* 10/19/07 EG - keep original check options */ 03035 if(hst->should_be_scheduled==TRUE) 03036 schedule_host_check(hst,hst->next_check,check_options); 03037 03038 return ERROR; 03039 } 03040 03041 return OK; 03042 } 03043 03044 03045 03046 /* perform an asynchronous check of a host */ 03047 /* scheduled host checks will use this, as will some checks that result from on-demand checks... */ 03048 int run_async_host_check_3x(host *hst, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time){ 03049 icinga_macros mac; 03050 char *raw_command=NULL; 03051 char *processed_command=NULL; 03052 struct timeval start_time,end_time; 03053 pid_t pid=0; 03054 int fork_error=FALSE; 03055 int wait_result=0; 03056 int pclose_result=0; 03057 mode_t new_umask=077; 03058 mode_t old_umask; 03059 char *output_file=NULL; 03060 double old_latency=0.0; 03061 dbuf checkresult_dbuf; 03062 int dbuf_chunk=1024; 03063 #ifdef USE_EVENT_BROKER 03064 int neb_result=OK; 03065 #endif 03066 03067 log_debug_info(DEBUGL_FUNCTIONS,0,"run_async_host_check_3x()\n"); 03068 03069 /* make sure we have a host */ 03070 if(hst==NULL) 03071 return ERROR; 03072 03073 log_debug_info(DEBUGL_CHECKS,0,"** Running async check of host '%s'...\n",hst->name); 03074 03075 /* is the host check viable at this time? 
*/ 03076 if(check_host_check_viability_3x(hst,check_options,time_is_valid,preferred_time)==ERROR) 03077 return ERROR; 03078 03079 /* 08/04/07 EG don't execute a new host check if one is already running */ 03080 if(hst->is_executing==TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)){ 03081 log_debug_info(DEBUGL_CHECKS,1,"A check of this host is already being executed, so we'll pass for the moment...\n"); 03082 return ERROR; 03083 } 03084 03085 /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/ 03086 03087 #ifdef USE_EVENT_BROKER 03088 /* initialize start/end times */ 03089 start_time.tv_sec=0L; 03090 start_time.tv_usec=0L; 03091 end_time.tv_sec=0L; 03092 end_time.tv_usec=0L; 03093 03094 /* send data to event broker */ 03095 neb_result=broker_host_check(NEBTYPE_HOSTCHECK_ASYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL); 03096 03097 /* neb module wants to cancel the host check - the check will be rescheduled for a later time by the scheduling logic */ 03098 if(neb_result==NEBERROR_CALLBACKCANCEL) 03099 return ERROR; 03100 03101 /* neb module wants to override the host check - perhaps it will check the host itself */ 03102 /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! 
*/ 03103 if(neb_result==NEBERROR_CALLBACKOVERRIDE) 03104 return OK; 03105 #endif 03106 03107 log_debug_info(DEBUGL_CHECKS,0,"Checking host '%s'...\n",hst->name); 03108 03109 /* clear check options - we don't want old check options retained */ 03110 /* only clear options if this was a scheduled check - on demand check options shouldn't affect retained info */ 03111 if(scheduled_check==TRUE) 03112 hst->check_options=CHECK_OPTION_NONE; 03113 03114 /* adjust host check attempt */ 03115 adjust_host_check_attempt_3x(hst,TRUE); 03116 03117 /* set latency (temporarily) for macros and event broker */ 03118 old_latency=hst->latency; 03119 hst->latency=latency; 03120 03121 /* grab the host macro variables */ 03122 memset(&mac, 0, sizeof(mac)); 03123 grab_host_macros_r(&mac, hst); 03124 03125 /* get the raw command line */ 03126 get_raw_command_line_r(&mac, hst->check_command_ptr,hst->host_check_command,&raw_command,0); 03127 if(raw_command==NULL){ 03128 clear_volatile_macros_r(&mac); 03129 log_debug_info(DEBUGL_CHECKS,0,"Raw check command for host '%s' was NULL - aborting.\n",hst->name); 03130 return ERROR; 03131 } 03132 03133 /* process any macros contained in the argument */ 03134 process_macros_r(&mac, raw_command,&processed_command,0); 03135 if(processed_command==NULL){ 03136 clear_volatile_macros_r(&mac); 03137 log_debug_info(DEBUGL_CHECKS,0,"Processed check command for host '%s' was NULL - aborting.\n",hst->name); 03138 return ERROR; 03139 } 03140 03141 my_free(hst->processed_command); 03142 hst->processed_command=strdup(processed_command); 03143 03144 /* get the command start time */ 03145 gettimeofday(&start_time,NULL); 03146 03147 /* set check time for on-demand checks, so they're not incorrectly detected as being orphaned - Luke Ross 5/16/08 */ 03148 /* NOTE: 06/23/08 EG not sure if there will be side effects to this or not.... 
*/ 03149 if(scheduled_check==FALSE) 03150 hst->next_check=start_time.tv_sec; 03151 03152 /* increment number of host checks that are currently running... */ 03153 currently_running_host_checks++; 03154 03155 /* set the execution flag */ 03156 hst->is_executing=TRUE; 03157 03158 /* open a temp file for storing check output */ 03159 old_umask=umask(new_umask); 03160 dummy=asprintf(&output_file,"%s/checkXXXXXX",temp_path); 03161 check_result_info.output_file_fd=mkstemp(output_file); 03162 if(check_result_info.output_file_fd>=0) 03163 check_result_info.output_file_fp=fdopen(check_result_info.output_file_fd,"w"); 03164 else{ 03165 check_result_info.output_file_fp=NULL; 03166 check_result_info.output_file_fd=-1; 03167 } 03168 umask(old_umask); 03169 03170 log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Check result output will be written to '%s' (fd=%d)\n",output_file,check_result_info.output_file_fd); 03171 03172 /* save check info */ 03173 check_result_info.object_check_type=HOST_CHECK; 03174 check_result_info.host_name=(char *)strdup(hst->name); 03175 check_result_info.service_description=NULL; 03176 check_result_info.check_type=HOST_CHECK_ACTIVE; 03177 check_result_info.check_options=check_options; 03178 check_result_info.scheduled_check=scheduled_check; 03179 check_result_info.reschedule_check=reschedule_check; 03180 check_result_info.output_file=(check_result_info.output_file_fd<0 || output_file==NULL)?NULL:strdup(output_file); 03181 check_result_info.latency=latency; 03182 check_result_info.start_time=start_time; 03183 check_result_info.finish_time=start_time; 03184 check_result_info.early_timeout=FALSE; 03185 check_result_info.exited_ok=TRUE; 03186 check_result_info.return_code=STATE_OK; 03187 check_result_info.output=NULL; 03188 03189 /* free memory */ 03190 my_free(output_file); 03191 03192 /* write initial check info to file */ 03193 /* if things go bad later on, the user will at least have something to go on when debugging... 
*/ 03194 if(check_result_info.output_file_fp){ 03195 03196 fprintf(check_result_info.output_file_fp,"### Active Check Result File ###\n"); 03197 fprintf(check_result_info.output_file_fp,"file_time=%lu\n",(unsigned long)check_result_info.start_time.tv_sec); 03198 fprintf(check_result_info.output_file_fp,"\n"); 03199 03200 fprintf(check_result_info.output_file_fp,"### Icinga Host Check Result ###\n"); 03201 fprintf(check_result_info.output_file_fp,"# Time: %s",ctime(&check_result_info.start_time.tv_sec)); 03202 fprintf(check_result_info.output_file_fp,"host_name=%s\n",check_result_info.host_name); 03203 fprintf(check_result_info.output_file_fp,"check_type=%d\n",check_result_info.check_type); 03204 fprintf(check_result_info.output_file_fp,"check_options=%d\n",check_result_info.check_options); 03205 fprintf(check_result_info.output_file_fp,"scheduled_check=%d\n",check_result_info.scheduled_check); 03206 fprintf(check_result_info.output_file_fp,"reschedule_check=%d\n",check_result_info.reschedule_check); 03207 fprintf(check_result_info.output_file_fp,"latency=%f\n",hst->latency); 03208 fprintf(check_result_info.output_file_fp,"start_time=%lu.%lu\n",check_result_info.start_time.tv_sec,check_result_info.start_time.tv_usec); 03209 03210 /* flush buffer or we'll end up writing twice when we fork() */ 03211 fflush(check_result_info.output_file_fp); 03212 } 03213 03214 /* initialize dynamic buffer for storing plugin output */ 03215 dbuf_init(&checkresult_dbuf,dbuf_chunk); 03216 03217 #ifdef USE_EVENT_BROKER 03218 /* send data to event broker */ 03219 broker_host_check(NEBTYPE_HOSTCHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,processed_command,NULL,NULL,NULL,NULL); 03220 #endif 03221 03222 /* reset latency (permanent value for this check will get set later) */ 03223 hst->latency=old_latency; 03224 03225 /* update check statistics */ 03226 
update_check_stats((scheduled_check==TRUE)?ACTIVE_SCHEDULED_HOST_CHECK_STATS:ACTIVE_ONDEMAND_HOST_CHECK_STATS,start_time.tv_sec); 03227 update_check_stats(PARALLEL_HOST_CHECK_STATS,start_time.tv_sec); 03228 03229 /* fork a child process */ 03230 pid=fork(); 03231 03232 /* an error occurred while trying to fork */ 03233 if(pid==-1){ 03234 03235 fork_error=TRUE; 03236 03237 /* log an error */ 03238 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of host '%s' could not be performed due to a fork() error: '%s'.\n",hst->name,strerror(errno)); 03239 03240 log_debug_info(DEBUGL_CHECKS,0,"Check of host '%s' could not be performed due to a fork() error: '%s'!\n",hst->name,strerror(errno)); 03241 } 03242 03243 /* if we are in the child process... */ 03244 else if(pid==0){ 03245 03246 /* set environment variables */ 03247 set_all_macro_environment_vars_r(&mac, TRUE); 03248 03249 /* ADDED 11/12/07 EG */ 03250 /* close external command file and shut down worker thread */ 03251 close_command_file(); 03252 03253 /* fork again if we're not in a large installation */ 03254 if(child_processes_fork_twice==TRUE){ 03255 03256 /* fork again... */ 03257 pid=fork(); 03258 03259 /* an error occurred while trying to fork again */ 03260 if(pid==-1) 03261 exit(STATE_UNKNOWN); 03262 } 03263 03264 /* the grandchild (or child if large install tweaks are enabled) process should run the host check... 
*/ 03265 if(pid==0 || child_processes_fork_twice==FALSE){ 03266 03267 /* reset signal handling */ 03268 reset_sighandler(); 03269 03270 /* become the process group leader */ 03271 setpgid(0,0); 03272 03273 /* catch term signals at this process level */ 03274 signal(SIGTERM,host_check_sighandler); 03275 03276 /* catch plugins that don't finish in a timely manner */ 03277 signal(SIGALRM,host_check_sighandler); 03278 alarm(host_check_timeout); 03279 03280 /* disable rotation of the debug file */ 03281 max_debug_file_size=0L; 03282 03283 /* run the plugin check command */ 03284 pclose_result=run_check(processed_command,&checkresult_dbuf); 03285 03286 /* reset the alarm */ 03287 alarm(0); 03288 03289 /* get the check finish time */ 03290 gettimeofday(&end_time,NULL); 03291 03292 /* record check result info */ 03293 check_result_info.finish_time=end_time; 03294 check_result_info.early_timeout=FALSE; 03295 03296 /* test for execution error */ 03297 if(pclose_result==-1){ 03298 pclose_result=STATE_UNKNOWN; 03299 check_result_info.return_code=STATE_CRITICAL; 03300 check_result_info.exited_ok=FALSE; 03301 } 03302 else{ 03303 if(WEXITSTATUS(pclose_result)==0 && WIFSIGNALED(pclose_result)) 03304 check_result_info.return_code=128+WTERMSIG(pclose_result); 03305 else 03306 check_result_info.return_code=WEXITSTATUS(pclose_result); 03307 } 03308 03309 /* write check result to file */ 03310 if(check_result_info.output_file_fp){ 03311 03312 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec); 03313 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout); 03314 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok); 03315 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code); 03316 
fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf); 03317 03318 /* close the temp file */ 03319 fclose(check_result_info.output_file_fp); 03320 03321 /* move check result to queue directory */ 03322 move_check_result_to_queue(check_result_info.output_file); 03323 } 03324 03325 /* free memory */ 03326 dbuf_free(&checkresult_dbuf); 03327 my_free(raw_command); 03328 my_free(processed_command); 03329 03330 /* free check result memory */ 03331 free_check_result(&check_result_info); 03332 03333 /* return with plugin exit status - not really necessary... */ 03334 _exit(pclose_result); 03335 } 03336 03337 /* NOTE: this code is never reached if large install tweaks are enabled... */ 03338 03339 /* unset environment variables */ 03340 set_all_macro_environment_vars_r(&mac, FALSE); 03341 03342 /* free allocated memory */ 03343 /* this needs to be done last, so we don't free memory for variables before they're used above */ 03344 if(free_child_process_memory==TRUE) 03345 free_memory(&mac); 03346 03347 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */ 03348 _exit(STATE_OK); 03349 } 03350 03351 /* else the parent should wait for the first child to return... */ 03352 else if(pid>0){ 03353 clear_volatile_macros_r(&mac); 03354 03355 log_debug_info(DEBUGL_CHECKS,2,"Host check is executing in child process (pid=%lu)\n",(unsigned long)pid); 03356 03357 /* parent should close output file */ 03358 if(check_result_info.output_file_fp) 03359 fclose(check_result_info.output_file_fp); 03360 03361 /* should this be done in first child process (after spawning grandchild) as well? 
*/ 03362 /* free memory allocated for IPC functionality */ 03363 free_check_result(&check_result_info); 03364 03365 /* free memory */ 03366 my_free(raw_command); 03367 my_free(processed_command); 03368 03369 /* wait for the first child to return */ 03370 /* if large install tweaks are enabled, we'll clean up the zombie process later */ 03371 if(child_processes_fork_twice==TRUE) 03372 wait_result=waitpid(pid,NULL,0); 03373 } 03374 03375 /* see if we were able to run the check... */ 03376 if(fork_error==TRUE) 03377 return ERROR; 03378 03379 return OK; 03380 } 03381 03382 03383 03384 /* process results of an asynchronous host check */ 03385 int handle_async_host_check_result_3x(host *temp_host, check_result *queued_check_result){ 03386 time_t current_time; 03387 int result=STATE_OK; 03388 int reschedule_check=FALSE; 03389 char *old_plugin_output=NULL; 03390 char *temp_ptr=NULL; 03391 struct timeval start_time_hires; 03392 struct timeval end_time_hires; 03393 03394 log_debug_info(DEBUGL_FUNCTIONS,0,"handle_async_host_check_result_3x()\n"); 03395 03396 /* make sure we have what we need */ 03397 if(temp_host==NULL || queued_check_result==NULL) 03398 return ERROR; 03399 03400 time(¤t_time); 03401 03402 log_debug_info(DEBUGL_CHECKS,1,"** Handling async check result for host '%s'...\n",temp_host->name); 03403 03404 log_debug_info(DEBUGL_CHECKS,2,"\tCheck Type: %s\n",(queued_check_result->check_type==HOST_CHECK_ACTIVE)?"Active":"Passive"); 03405 log_debug_info(DEBUGL_CHECKS,2,"\tCheck Options: %d\n",queued_check_result->check_options); 03406 log_debug_info(DEBUGL_CHECKS,2,"\tScheduled Check?: %s\n",(queued_check_result->scheduled_check==TRUE)?"Yes":"No"); 03407 log_debug_info(DEBUGL_CHECKS,2,"\tReschedule Check?: %s\n",(queued_check_result->reschedule_check==TRUE)?"Yes":"No"); 03408 log_debug_info(DEBUGL_CHECKS,2,"\tExited OK?: %s\n",(queued_check_result->exited_ok==TRUE)?"Yes":"No"); 03409 log_debug_info(DEBUGL_CHECKS,2,"\tExec Time: %.3f\n",temp_host->execution_time); 
03410 log_debug_info(DEBUGL_CHECKS,2,"\tLatency: %.3f\n",temp_host->latency); 03411 log_debug_info(DEBUGL_CHECKS,2,"\tReturn Status: %d\n",queued_check_result->return_code); 03412 log_debug_info(DEBUGL_CHECKS,2,"\tOutput: %s\n",(queued_check_result==NULL)?"NULL":queued_check_result->output); 03413 03414 /* decrement the number of host checks still out there... */ 03415 if(queued_check_result->check_type==HOST_CHECK_ACTIVE && currently_running_host_checks>0) 03416 currently_running_host_checks--; 03417 03418 /* skip this host check results if its passive and we aren't accepting passive check results */ 03419 if(queued_check_result->check_type==HOST_CHECK_PASSIVE){ 03420 if(accept_passive_host_checks==FALSE){ 03421 log_debug_info(DEBUGL_CHECKS,0,"Discarding passive host check result because passive host checks are disabled globally.\n"); 03422 return ERROR; 03423 } 03424 if(temp_host->accept_passive_host_checks==FALSE){ 03425 log_debug_info(DEBUGL_CHECKS,0,"Discarding passive host check result because passive checks are disabled for this host.\n"); 03426 return ERROR; 03427 } 03428 } 03429 03430 /* clear the freshening flag (it would have been set if this host was determined to be stale) */ 03431 if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) 03432 temp_host->is_being_freshened=FALSE; 03433 03434 /* DISCARD INVALID FRESHNESS CHECK RESULTS */ 03435 /* If a host goes stale, Icinga will initiate a forced check in order to freshen it. There is a race condition whereby a passive check 03436 could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. This would 03437 make the host fresh again, so we do a quick check to make sure the host is still stale before we accept the check result. 
*/ 03438 if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_host_result_fresh(temp_host,current_time,FALSE)==TRUE){ 03439 log_debug_info(DEBUGL_CHECKS,0,"Discarding host freshness check result because the host is currently fresh (race condition avoided).\n"); 03440 return OK; 03441 } 03442 03443 /* was this check passive or active? */ 03444 temp_host->check_type=(queued_check_result->check_type==HOST_CHECK_ACTIVE)?HOST_CHECK_ACTIVE:HOST_CHECK_PASSIVE; 03445 03446 /* update check statistics for passive results */ 03447 if(queued_check_result->check_type==HOST_CHECK_PASSIVE) 03448 update_check_stats(PASSIVE_HOST_CHECK_STATS,queued_check_result->start_time.tv_sec); 03449 03450 /* should we reschedule the next check of the host? NOTE: this might be overridden later... */ 03451 reschedule_check=queued_check_result->reschedule_check; 03452 03453 /* check latency is passed to us for both active and passive checks */ 03454 temp_host->latency=queued_check_result->latency; 03455 03456 /* update the execution time for this check (millisecond resolution) */ 03457 temp_host->execution_time=(double)((double)(queued_check_result->finish_time.tv_sec-queued_check_result->start_time.tv_sec)+(double)((queued_check_result->finish_time.tv_usec-queued_check_result->start_time.tv_usec)/1000.0)/1000.0); 03458 if(temp_host->execution_time<0.0) 03459 temp_host->execution_time=0.0; 03460 03461 /* set the checked flag */ 03462 temp_host->has_been_checked=TRUE; 03463 03464 /* clear the execution flag if this was an active check */ 03465 if(queued_check_result->check_type==HOST_CHECK_ACTIVE) 03466 temp_host->is_executing=FALSE; 03467 03468 /* get the last check time */ 03469 temp_host->last_check=queued_check_result->start_time.tv_sec; 03470 03471 /* was this check passive or active? 
*/ 03472 temp_host->check_type=(queued_check_result->check_type==HOST_CHECK_ACTIVE)?HOST_CHECK_ACTIVE:HOST_CHECK_PASSIVE; 03473 03474 /* save the old host state */ 03475 temp_host->last_state=temp_host->current_state; 03476 if(temp_host->state_type==HARD_STATE) 03477 temp_host->last_hard_state=temp_host->current_state; 03478 03479 /* save old plugin output */ 03480 if(temp_host->plugin_output) 03481 old_plugin_output=(char *)strdup(temp_host->plugin_output); 03482 03483 /* clear the old plugin output and perf data buffers */ 03484 my_free(temp_host->plugin_output); 03485 my_free(temp_host->long_plugin_output); 03486 my_free(temp_host->perf_data); 03487 03488 /* parse check output to get: (1) short output, (2) long output, (3) perf data */ 03489 parse_check_output(queued_check_result->output,&temp_host->plugin_output,&temp_host->long_plugin_output,&temp_host->perf_data,TRUE,TRUE); 03490 03491 /* make sure we have some data */ 03492 if(temp_host->plugin_output==NULL || !strcmp(temp_host->plugin_output,"")){ 03493 my_free(temp_host->plugin_output); 03494 temp_host->plugin_output=(char *)strdup("(No output returned from host check)"); 03495 } 03496 03497 /* replace semicolons in plugin output (but not performance data) with colons */ 03498 if((temp_ptr=temp_host->plugin_output)){ 03499 while((temp_ptr=strchr(temp_ptr,';'))) 03500 *temp_ptr=':'; 03501 } 03502 03503 log_debug_info(DEBUGL_CHECKS,2,"Parsing check output...\n"); 03504 log_debug_info(DEBUGL_CHECKS,2,"Short Output: %s\n",(temp_host->plugin_output==NULL)?"NULL":temp_host->plugin_output); 03505 log_debug_info(DEBUGL_CHECKS,2,"Long Output: %s\n",(temp_host->long_plugin_output==NULL)?"NULL":temp_host->long_plugin_output); 03506 log_debug_info(DEBUGL_CHECKS,2,"Perf Data: %s\n",(temp_host->perf_data==NULL)?"NULL":temp_host->perf_data); 03507 03508 /* get the unprocessed return code */ 03509 /* NOTE: for passive checks, this is the final/processed state */ 03510 result=queued_check_result->return_code; 03511 03512 
/* adjust return code (active checks only) */ 03513 if(queued_check_result->check_type==HOST_CHECK_ACTIVE){ 03514 03515 /* if there was some error running the command, just skip it (this shouldn't be happening) */ 03516 if(queued_check_result->exited_ok==FALSE){ 03517 03518 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of host '%s' did not exit properly!\n",temp_host->name); 03519 03520 my_free(temp_host->plugin_output); 03521 my_free(temp_host->long_plugin_output); 03522 my_free(temp_host->perf_data); 03523 03524 temp_host->plugin_output=(char *)strdup("(Host check did not exit properly)"); 03525 03526 result=STATE_CRITICAL; 03527 } 03528 03529 /* make sure the return code is within bounds */ 03530 else if(queued_check_result->return_code<0 || queued_check_result->return_code>3){ 03531 03532 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Return code of %d for check of host '%s' was out of bounds.%s\n",queued_check_result->return_code,temp_host->name,(queued_check_result->return_code==126 || queued_check_result->return_code==127)?" Make sure the plugin you're trying to run actually exists.":""); 03533 03534 my_free(temp_host->plugin_output); 03535 my_free(temp_host->long_plugin_output); 03536 my_free(temp_host->perf_data); 03537 03538 dummy=asprintf(&temp_host->plugin_output,"(Return code of %d is out of bounds%s)",queued_check_result->return_code,(queued_check_result->return_code==126 || queued_check_result->return_code==127)?" 
- plugin may be missing":""); 03539 03540 result=STATE_CRITICAL; 03541 } 03542 03543 /* a NULL host check command means we should assume the host is UP */ 03544 if(temp_host->host_check_command==NULL){ 03545 my_free(temp_host->plugin_output); 03546 temp_host->plugin_output=(char *)strdup("(Host assumed to be UP)"); 03547 result=STATE_OK; 03548 } 03549 } 03550 03551 /* translate return code to basic UP/DOWN state - the DOWN/UNREACHABLE state determination is made later */ 03552 /* NOTE: only do this for active checks - passive check results already have the final state */ 03553 if(queued_check_result->check_type==HOST_CHECK_ACTIVE){ 03554 03555 /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */ 03556 if(use_aggressive_host_checking==FALSE && result==STATE_WARNING) 03557 result=STATE_OK; 03558 03559 /* OK states means the host is UP */ 03560 if(result==STATE_OK) 03561 result=HOST_UP; 03562 03563 /* any problem state indicates the host is not UP */ 03564 else 03565 result=HOST_DOWN; 03566 } 03567 03568 03569 /******************* PROCESS THE CHECK RESULTS ******************/ 03570 03571 /* process the host check result */ 03572 process_host_check_result_3x(temp_host,result,old_plugin_output,CHECK_OPTION_NONE,reschedule_check,TRUE,cached_host_check_horizon); 03573 03574 /* free memory */ 03575 my_free(old_plugin_output); 03576 03577 log_debug_info(DEBUGL_CHECKS,1,"** Async check result for host '%s' handled: new state=%d\n",temp_host->name,temp_host->current_state); 03578 03579 /* high resolution start time for event broker */ 03580 start_time_hires=queued_check_result->start_time; 03581 03582 /* high resolution end time for event broker */ 03583 gettimeofday(&end_time_hires,NULL); 03584 03585 #ifdef USE_EVENT_BROKER 03586 /* send data to event broker */ 03587 
broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,temp_host,temp_host->check_type,temp_host->current_state,temp_host->state_type,start_time_hires,end_time_hires,temp_host->host_check_command,temp_host->latency,temp_host->execution_time,host_check_timeout,queued_check_result->early_timeout,queued_check_result->return_code,temp_host->processed_command,temp_host->plugin_output,temp_host->long_plugin_output,temp_host->perf_data,NULL); 03588 #endif 03589 03590 return OK; 03591 } 03592 03593 03594 03595 /* processes the result of a synchronous or asynchronous host check */ 03596 int process_host_check_result_3x(host *hst, int new_state, char *old_plugin_output, int check_options, int reschedule_check, int use_cached_result, unsigned long check_timestamp_horizon){ 03597 hostsmember *temp_hostsmember=NULL; 03598 host *child_host=NULL; 03599 host *parent_host=NULL; 03600 host *master_host=NULL; 03601 host *temp_host=NULL; 03602 hostdependency *temp_dependency=NULL; 03603 objectlist *check_hostlist=NULL; 03604 objectlist *hostlist_item=NULL; 03605 int parent_state=HOST_UP; 03606 time_t current_time=0L; 03607 time_t next_check=0L; 03608 time_t preferred_time=0L; 03609 time_t next_valid_time=0L; 03610 int run_async_check=TRUE; 03611 void *ptr=NULL; 03612 03613 03614 log_debug_info(DEBUGL_FUNCTIONS,0,"process_host_check_result_3x()\n"); 03615 03616 log_debug_info(DEBUGL_CHECKS,1,"HOST: %s, ATTEMPT=%d/%d, CHECK TYPE=%s, STATE TYPE=%s, OLD STATE=%d, NEW STATE=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->check_type==HOST_CHECK_ACTIVE)?"ACTIVE":"PASSIVE",(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state,new_state); 03617 03618 /* get the current time */ 03619 time(¤t_time); 03620 03621 /* default next check time */ 03622 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03623 03624 /* we have to adjust current attempt # for passive checks, as it isn't done elsewhere */ 03625 
if(hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==TRUE) 03626 adjust_host_check_attempt_3x(hst,FALSE); 03627 03628 /* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */ 03629 if(hst->check_type==HOST_CHECK_PASSIVE){ 03630 if(log_passive_checks==TRUE) 03631 logit(NSLOG_PASSIVE_CHECK,FALSE,"PASSIVE HOST CHECK: %s;%d;%s\n",hst->name,new_state,hst->plugin_output); 03632 } 03633 03634 03635 /******* HOST WAS DOWN/UNREACHABLE INITIALLY *******/ 03636 if(hst->current_state!=HOST_UP){ 03637 03638 log_debug_info(DEBUGL_CHECKS,1,"Host was DOWN/UNREACHABLE.\n"); 03639 03640 /***** HOST IS NOW UP *****/ 03641 /* the host just recovered! */ 03642 if(new_state==HOST_UP){ 03643 03644 /* set the current state */ 03645 hst->current_state=HOST_UP; 03646 03647 /* set the state type */ 03648 /* set state type to HARD for passive checks and active checks that were previously in a HARD STATE */ 03649 if(hst->state_type==HARD_STATE || (hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==FALSE)) 03650 hst->state_type=HARD_STATE; 03651 else 03652 hst->state_type=SOFT_STATE; 03653 03654 log_debug_info(DEBUGL_CHECKS,1,"Host experienced a %s recovery (it's now UP).\n",(hst->state_type==HARD_STATE)?"HARD":"SOFT"); 03655 03656 /* reschedule the next check of the host at the normal interval */ 03657 reschedule_check=TRUE; 03658 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03659 03660 /* propagate checks to immediate parents if they are not already UP */ 03661 /* we do this because a parent host (or grandparent) may have recovered somewhere and we should catch the recovery as soon as possible */ 03662 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to parent host(s)...\n"); 03663 03664 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03665 if((parent_host=temp_hostsmember->host_ptr)==NULL) 03666 
continue; 03667 if(parent_host->current_state!=HOST_UP){ 03668 log_debug_info(DEBUGL_CHECKS,1,"Check of parent host '%s' queued.\n",parent_host->name); 03669 add_object_to_objectlist(&check_hostlist,(void *)parent_host); 03670 } 03671 } 03672 03673 /* propagate checks to immediate children if they are not already UP */ 03674 /* we do this because children may currently be UNREACHABLE, but may (as a result of this recovery) switch to UP or DOWN states */ 03675 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to child host(s)...\n"); 03676 03677 for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03678 if((child_host=temp_hostsmember->host_ptr)==NULL) 03679 continue; 03680 if(child_host->current_state!=HOST_UP){ 03681 log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name); 03682 add_object_to_objectlist(&check_hostlist,(void *)child_host); 03683 } 03684 } 03685 } 03686 03687 /***** HOST IS STILL DOWN/UNREACHABLE *****/ 03688 /* we're still in a problem state... */ 03689 else{ 03690 03691 log_debug_info(DEBUGL_CHECKS,1,"Host is still DOWN/UNREACHABLE.\n"); 03692 03693 /* passive checks are treated as HARD states by default... 
*/ 03694 if(hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==FALSE){ 03695 03696 /* set the state type */ 03697 hst->state_type=HARD_STATE; 03698 03699 /* reset the current attempt */ 03700 hst->current_attempt=1; 03701 } 03702 03703 /* active checks and passive checks (treated as SOFT states) */ 03704 else{ 03705 03706 /* set the state type */ 03707 /* we've maxed out on the retries */ 03708 if(hst->current_attempt==hst->max_attempts) 03709 hst->state_type=HARD_STATE; 03710 /* the host was in a hard problem state before, so it still is now */ 03711 else if(hst->current_attempt==1) 03712 hst->state_type=HARD_STATE; 03713 /* the host is in a soft state and the check will be retried */ 03714 else 03715 hst->state_type=SOFT_STATE; 03716 } 03717 03718 /* make a determination of the host's state */ 03719 /* translate host state between DOWN/UNREACHABLE (only for passive checks if enabled) */ 03720 hst->current_state=new_state; 03721 if(hst->check_type==HOST_CHECK_ACTIVE || translate_passive_host_checks==TRUE) 03722 hst->current_state=determine_host_reachability(hst); 03723 03724 /* reschedule the next check if the host state changed */ 03725 if(hst->last_state!=hst->current_state || hst->last_hard_state!=hst->current_state){ 03726 03727 reschedule_check=TRUE; 03728 03729 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... 
*/ 03730 if(hst->state_type==SOFT_STATE) 03731 next_check=(unsigned long)(current_time+(hst->retry_interval*interval_length)); 03732 03733 /* host has maxed out on retries (or was previously in a hard problem state), so reschedule the next check at the normal interval */ 03734 else 03735 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03736 } 03737 03738 } 03739 03740 } 03741 03742 /******* HOST WAS UP INITIALLY *******/ 03743 else{ 03744 03745 log_debug_info(DEBUGL_CHECKS,1,"Host was UP.\n"); 03746 03747 /***** HOST IS STILL UP *****/ 03748 /* either the host never went down since last check */ 03749 if(new_state==HOST_UP){ 03750 03751 log_debug_info(DEBUGL_CHECKS,1,"Host is still UP.\n"); 03752 03753 /* set the current state */ 03754 hst->current_state=HOST_UP; 03755 03756 /* set the state type */ 03757 hst->state_type=HARD_STATE; 03758 03759 /* reschedule the next check at the normal interval */ 03760 if(reschedule_check==TRUE) 03761 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03762 } 03763 03764 /***** HOST IS NOW DOWN/UNREACHABLE *****/ 03765 else{ 03766 03767 log_debug_info(DEBUGL_CHECKS,1,"Host is now DOWN/UNREACHABLE.\n"); 03768 03769 /***** SPECIAL CASE FOR HOSTS WITH MAX_ATTEMPTS==1 *****/ 03770 if(hst->max_attempts==1){ 03771 03772 log_debug_info(DEBUGL_CHECKS,1,"Max attempts = 1!.\n"); 03773 03774 /* set the state type */ 03775 hst->state_type=HARD_STATE; 03776 03777 /* host has maxed out on retries, so reschedule the next check at the normal interval */ 03778 reschedule_check=TRUE; 03779 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03780 03781 /* we need to run SYNCHRONOUS checks of all parent hosts to accurately determine the state of this host */ 03782 /* this is extremely inefficient (reminiscent of Icinga 2.x logic), but there's no other good way around it */ 03783 /* check all parent hosts to see if we're DOWN or UNREACHABLE */ 03784 /* only do 
this for ACTIVE checks, as PASSIVE checks contain a pre-determined state */ 03785 if(hst->check_type==HOST_CHECK_ACTIVE){ 03786 03787 log_debug_info(DEBUGL_CHECKS,1,"** WARNING: Max attempts = 1, so we have to run serial checks of all parent hosts!\n"); 03788 03789 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03790 03791 if((parent_host=temp_hostsmember->host_ptr)==NULL) 03792 continue; 03793 03794 log_debug_info(DEBUGL_CHECKS,1,"Running serial check parent host '%s'...\n",parent_host->name); 03795 03796 /* run an immediate check of the parent host */ 03797 run_sync_host_check_3x(parent_host,&parent_state,check_options,use_cached_result,check_timestamp_horizon); 03798 03799 /* bail out as soon as we find one parent host that is UP */ 03800 if(parent_state==HOST_UP){ 03801 03802 log_debug_info(DEBUGL_CHECKS,1,"Parent host is UP, so this one is DOWN.\n"); 03803 03804 /* set the current state */ 03805 hst->current_state=HOST_DOWN; 03806 break; 03807 } 03808 } 03809 03810 if(temp_hostsmember==NULL){ 03811 /* host has no parents, so its up */ 03812 if(hst->parent_hosts==NULL){ 03813 log_debug_info(DEBUGL_CHECKS,1,"Host has no parents, so it's DOWN.\n"); 03814 hst->current_state=HOST_DOWN; 03815 } 03816 else{ 03817 /* no parents were up, so this host is UNREACHABLE */ 03818 log_debug_info(DEBUGL_CHECKS,1,"No parents were UP, so this host is UNREACHABLE.\n"); 03819 hst->current_state=HOST_UNREACHABLE; 03820 } 03821 } 03822 } 03823 03824 /* set the host state for passive checks */ 03825 else{ 03826 /* set the state */ 03827 hst->current_state=new_state; 03828 03829 /* translate host state between DOWN/UNREACHABLE for passive checks (if enabled) */ 03830 /* make a determination of the host's state */ 03831 if(translate_passive_host_checks==TRUE) 03832 hst->current_state=determine_host_reachability(hst); 03833 03834 } 03835 03836 /* propagate checks to immediate children if they are not UNREACHABLE */ 03837 /* we do 
this because we may now be blocking the route to child hosts */ 03838 log_debug_info(DEBUGL_CHECKS,1,"Propagating check to immediate non-UNREACHABLE child hosts...\n"); 03839 03840 for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03841 if((child_host=temp_hostsmember->host_ptr)==NULL) 03842 continue; 03843 if(child_host->current_state!=HOST_UNREACHABLE){ 03844 log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name); 03845 add_object_to_objectlist(&check_hostlist,(void *)child_host); 03846 } 03847 } 03848 } 03849 03850 /***** MAX ATTEMPTS > 1 *****/ 03851 else{ 03852 03853 /* active and (in some cases) passive check results are treated as SOFT states */ 03854 if(hst->check_type==HOST_CHECK_ACTIVE || passive_host_checks_are_soft==TRUE){ 03855 03856 /* set the state type */ 03857 hst->state_type=SOFT_STATE; 03858 } 03859 03860 /* by default, passive check results are treated as HARD states */ 03861 else{ 03862 03863 /* set the state type */ 03864 hst->state_type=HARD_STATE; 03865 03866 /* reset the current attempt */ 03867 hst->current_attempt=1; 03868 } 03869 03870 /* make a (in some cases) preliminary determination of the host's state */ 03871 /* translate host state between DOWN/UNREACHABLE (for passive checks only if enabled) */ 03872 hst->current_state=new_state; 03873 if(hst->check_type==HOST_CHECK_ACTIVE || translate_passive_host_checks==TRUE) 03874 hst->current_state=determine_host_reachability(hst); 03875 03876 /* reschedule a check of the host */ 03877 reschedule_check=TRUE; 03878 03879 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... 
*/ 03880 if(hst->check_type==HOST_CHECK_ACTIVE || passive_host_checks_are_soft==TRUE) 03881 next_check=(unsigned long)(current_time+(hst->retry_interval*interval_length)); 03882 03883 /* schedule a re-check of the host at the normal interval */ 03884 else 03885 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length)); 03886 03887 /* propagate checks to immediate parents if they are UP */ 03888 /* we do this because a parent host (or grandparent) may have gone down and blocked our route */ 03889 /* checking the parents ASAP will allow us to better determine the final state (DOWN/UNREACHABLE) of this host later */ 03890 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to immediate parent hosts that are UP...\n"); 03891 03892 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03893 if((parent_host=temp_hostsmember->host_ptr)==NULL) 03894 continue; 03895 if(parent_host->current_state==HOST_UP){ 03896 add_object_to_objectlist(&check_hostlist,(void *)parent_host); 03897 log_debug_info(DEBUGL_CHECKS,1,"Check of host '%s' queued.\n",parent_host->name); 03898 } 03899 } 03900 03901 /* propagate checks to immediate children if they are not UNREACHABLE */ 03902 /* we do this because we may now be blocking the route to child hosts */ 03903 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to immediate non-UNREACHABLE child hosts...\n"); 03904 03905 for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 03906 if((child_host=temp_hostsmember->host_ptr)==NULL) 03907 continue; 03908 if(child_host->current_state!=HOST_UNREACHABLE){ 03909 log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name); 03910 add_object_to_objectlist(&check_hostlist,(void *)child_host); 03911 } 03912 } 03913 03914 /* check dependencies on second to last host check */ 03915 if(enable_predictive_host_dependency_checks==TRUE && 
hst->current_attempt==(hst->max_attempts-1)){ 03916 03917 /* propagate checks to hosts that THIS ONE depends on for notifications AND execution */ 03918 /* we do to help ensure that the dependency checks are accurate before it comes time to notify */ 03919 log_debug_info(DEBUGL_CHECKS,1,"Propagating predictive dependency checks to hosts this one depends on...\n"); 03920 03921 for(temp_dependency=get_first_hostdependency_by_dependent_host(hst->name,&ptr);temp_dependency!=NULL;temp_dependency=get_next_hostdependency_by_dependent_host(hst->name,&ptr)){ 03922 if(temp_dependency->dependent_host_ptr==hst && temp_dependency->master_host_ptr!=NULL){ 03923 master_host=(host *)temp_dependency->master_host_ptr; 03924 log_debug_info(DEBUGL_CHECKS,1,"Check of host '%s' queued.\n",master_host->name); 03925 add_object_to_objectlist(&check_hostlist,(void *)master_host); 03926 } 03927 } 03928 } 03929 } 03930 } 03931 } 03932 03933 log_debug_info(DEBUGL_CHECKS,1,"Pre-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state); 03934 03935 /* handle the host state */ 03936 handle_host_state(hst); 03937 03938 log_debug_info(DEBUGL_CHECKS,1,"Post-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state); 03939 03940 03941 /******************** POST-PROCESSING STUFF *********************/ 03942 03943 /* if the plugin output differs from previous check and no state change, log the current state/output if state stalking is enabled */ 03944 if(hst->last_state==hst->current_state && compare_strings(old_plugin_output,hst->plugin_output)){ 03945 03946 if(hst->current_state==HOST_UP && hst->stalk_on_up==TRUE) { 03947 03948 log_host_event(hst); 03949 03950 /* should we run event handlers ? 
*/ 03951 if (stalking_event_handlers_for_hosts==TRUE) 03952 handle_host_event(hst); 03953 03954 } else if(hst->current_state==HOST_DOWN && hst->stalk_on_down==TRUE) { 03955 03956 log_host_event(hst); 03957 03958 /* should we run event handlers ? */ 03959 if (stalking_event_handlers_for_hosts==TRUE) 03960 handle_host_event(hst); 03961 03962 } else if(hst->current_state==HOST_UNREACHABLE && hst->stalk_on_unreachable==TRUE) { 03963 03964 log_host_event(hst); 03965 03966 /* should we run event handlers ? */ 03967 if (stalking_event_handlers_for_hosts==TRUE) 03968 handle_host_event(hst); 03969 } 03970 } 03971 03972 /* check to see if the associated host is flapping */ 03973 check_for_host_flapping(hst,TRUE,TRUE,TRUE); 03974 03975 /* reschedule the next check of the host (usually ONLY for scheduled, active checks, unless overridden above) */ 03976 if(reschedule_check==TRUE){ 03977 03978 log_debug_info(DEBUGL_CHECKS,1,"Rescheduling next check of host at %s",ctime(&next_check)); 03979 03980 /* default is to reschedule host check unless a test below fails... */ 03981 hst->should_be_scheduled=TRUE; 03982 03983 /* get the new current time */ 03984 time(¤t_time); 03985 03986 /* make sure we don't get ourselves into too much trouble... 
*/ 03987 if(current_time>next_check) 03988 hst->next_check=current_time; 03989 else 03990 hst->next_check=next_check; 03991 03992 /* make sure we rescheduled the next service check at a valid time */ 03993 preferred_time=hst->next_check; 03994 get_next_valid_time(preferred_time,&next_valid_time,hst->check_period_ptr); 03995 hst->next_check=next_valid_time; 03996 03997 /* hosts with non-recurring intervals do not get rescheduled if we're in a HARD or UP state */ 03998 if(hst->check_interval==0 && (hst->state_type==HARD_STATE || hst->current_state==HOST_UP)) 03999 hst->should_be_scheduled=FALSE; 04000 04001 /* host with active checks disabled do not get rescheduled */ 04002 if(hst->checks_enabled==FALSE) 04003 hst->should_be_scheduled=FALSE; 04004 04005 /* schedule a non-forced check if we can */ 04006 if(hst->should_be_scheduled==TRUE){ 04007 schedule_host_check(hst,hst->next_check,CHECK_OPTION_NONE); 04008 } 04009 04010 } 04011 04012 /* update host status - for both active (scheduled) and passive (non-scheduled) hosts */ 04013 update_host_status(hst,FALSE); 04014 04015 /* run async checks of all hosts we added above */ 04016 /* don't run a check if one is already executing or we can get by with a cached state */ 04017 for(hostlist_item=check_hostlist;hostlist_item!=NULL;hostlist_item=hostlist_item->next){ 04018 run_async_check=TRUE; 04019 temp_host=(host *)hostlist_item->object_ptr; 04020 04021 log_debug_info(DEBUGL_CHECKS,2,"ASYNC CHECK OF HOST: %s, CURRENTTIME: %lu, LASTHOSTCHECK: %lu, CACHEDTIMEHORIZON: %lu, USECACHEDRESULT: %d, ISEXECUTING: %d\n",temp_host->name,current_time,temp_host->last_check,check_timestamp_horizon,use_cached_result,temp_host->is_executing); 04022 04023 if(use_cached_result==TRUE && ((current_time-temp_host->last_check)<=check_timestamp_horizon)) 04024 run_async_check=FALSE; 04025 if(temp_host->is_executing==TRUE) 04026 run_async_check=FALSE; 04027 if(run_async_check==TRUE) 04028 
run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL); 04029 } 04030 free_objectlist(&check_hostlist); 04031 04032 return OK; 04033 } 04034 04035 04036 04037 /* checks viability of performing a host check: returns OK when the check may be performed now and ERROR when it may not (or when hst is NULL, in which case neither output argument is written). Unless CHECK_OPTION_FORCE_EXECUTION is set in check_options, the check is refused if checks are disabled for the host, if the current time fails check_time_against_period() for the host's check period, or if check_host_dependencies() reports DEPENDENCIES_FAILED for execution dependencies. If new_time is non-NULL it receives the preferred time for the next attempt; if time_is_valid is non-NULL it is set to FALSE only when the check period test fails and is never set to TRUE by this function */ 04038 int check_host_check_viability_3x(host *hst, int check_options, int *time_is_valid, time_t *new_time){ 04039 int result=OK; 04040 int perform_check=TRUE; 04041 time_t current_time=0L; 04042 time_t preferred_time=0L; 04043 int check_interval=0; 04044 04045 log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_check_viability_3x()\n"); 04046 04047 /* make sure we have a host */ 04048 if(hst==NULL) 04049 return ERROR; 04050 04051 /* get the check interval to use if we need to reschedule the check (retry interval while the host is in a SOFT non-UP state, normal check interval otherwise) */ 04052 if(hst->state_type==SOFT_STATE && hst->current_state!=HOST_UP) 04053 check_interval=(hst->retry_interval*interval_length); 04054 else 04055 check_interval=(hst->check_interval*interval_length); 04056 04057 /* make sure check interval is positive - otherwise use 5 minutes out for next check */ 04058 if(check_interval<=0) 04059 check_interval=300; 04060 04061 /* get the current time - NOTE(review): the argument below looks like a text-extraction artifact; it presumably was the address of current_time, confirm against the original file */ 04062 time(¤t_time); 04063 04064 /* initialize the next preferred check time */ 04065 preferred_time=current_time; 04066 04067 /* can we check the host right now? (all tests below are skipped when execution is forced) */ 04068 if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)){ 04069 04070 /* if checks of the host are currently disabled... 
*/ 04071 if(hst->checks_enabled==FALSE){ 04072 preferred_time=current_time+check_interval; 04073 perform_check=FALSE; 04074 } 04075 04076 /* make sure this is a valid time to check the host (on failure the preferred time is reset to the current time and the caller is told the time is invalid, if it asked) */ 04077 if(check_time_against_period((unsigned long)current_time,hst->check_period_ptr)==ERROR){ 04078 preferred_time=current_time; 04079 if(time_is_valid) 04080 *time_is_valid=FALSE; 04081 perform_check=FALSE; 04082 } 04083 04084 /* check host dependencies for execution (on failure, try again one check interval from now) */ 04085 if(check_host_dependencies(hst,EXECUTION_DEPENDENCY)==DEPENDENCIES_FAILED){ 04086 preferred_time=current_time+check_interval; 04087 perform_check=FALSE; 04088 } 04089 } 04090 04091 /* pass back the next viable check time (only if the caller supplied new_time) */ 04092 if(new_time) 04093 *new_time=preferred_time; 04094 04095 result=(perform_check==TRUE)?OK:ERROR; 04096 04097 return result; 04098 } 04099 04100 04101 04102 /* adjusts current host check attempt before a new check is performed: returns ERROR if hst is NULL and OK otherwise. current_attempt is reset to 1 when the host is in a HARD state, or when is_active is TRUE and the host is in a SOFT UP state; in every other case it is incremented, but only while it is still below max_attempts */ 04103 int adjust_host_check_attempt_3x(host *hst, int is_active){ 04104 04105 log_debug_info(DEBUGL_FUNCTIONS,0,"adjust_host_check_attempt_3x()\n"); 04106 04107 if(hst==NULL) 04108 return ERROR; 04109 04110 log_debug_info(DEBUGL_CHECKS,2,"Adjusting check attempt number for host '%s': current attempt=%d/%d, state=%d, state type=%d\n",hst->name,hst->current_attempt,hst->max_attempts,hst->current_state,hst->state_type); 04111 04112 /* if host is in a hard state, reset current attempt number */ 04113 if(hst->state_type==HARD_STATE) 04114 hst->current_attempt=1; 04115 04116 /* if host is in a soft UP state, reset current attempt number (active checks only) */ 04117 else if(is_active==TRUE && hst->state_type==SOFT_STATE && hst->current_state==HOST_UP) 04118 hst->current_attempt=1; 04119 04120 /* increment current attempt number (never past max_attempts) */ 04121 else if(hst->current_attempt < hst->max_attempts) 04122 hst->current_attempt++; 04123 04124 log_debug_info(DEBUGL_CHECKS,2,"New check attempt number = %d\n",hst->current_attempt); 04125 04126 return OK; 04127 } 04128 04129 
04130 04131 /* determination of the host's state based on route availability*/ 04132 /* used only to determine difference between DOWN and UNREACHABLE states */ 04133 int determine_host_reachability(host *hst){ 04134 int state=HOST_DOWN; 04135 host *parent_host=NULL; 04136 hostsmember *temp_hostsmember=NULL; 04137 04138 log_debug_info(DEBUGL_FUNCTIONS,0,"determine_host_reachability()\n"); 04139 04140 if(hst==NULL) 04141 return HOST_DOWN; 04142 04143 log_debug_info(DEBUGL_CHECKS,2,"Determining state of host '%s': current state=%d\n",hst->name,hst->current_state); 04144 04145 /* host is UP - no translation needed */ 04146 if(hst->current_state==HOST_UP){ 04147 state=HOST_UP; 04148 log_debug_info(DEBUGL_CHECKS,2,"Host is UP, no state translation needed.\n"); 04149 } 04150 04151 /* host has no parents, so it is DOWN */ 04152 else if(hst->parent_hosts==NULL){ 04153 state=HOST_DOWN; 04154 log_debug_info(DEBUGL_CHECKS,2,"Host has no parents, so it is DOWN.\n"); 04155 } 04156 04157 /* check all parent hosts to see if we're DOWN or UNREACHABLE */ 04158 else{ 04159 04160 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){ 04161 04162 if((parent_host=temp_hostsmember->host_ptr)==NULL) 04163 continue; 04164 04165 /* bail out as soon as we find one parent host that is UP */ 04166 if(parent_host->current_state==HOST_UP){ 04167 /* set the current state */ 04168 state=HOST_DOWN; 04169 log_debug_info(DEBUGL_CHECKS,2,"At least one parent (%s) is up, so host is DOWN.\n",parent_host->name); 04170 break; 04171 } 04172 } 04173 /* no parents were up, so this host is UNREACHABLE */ 04174 if(temp_hostsmember==NULL){ 04175 state=HOST_UNREACHABLE; 04176 log_debug_info(DEBUGL_CHECKS,2,"No parents were up, so host is UNREACHABLE.\n"); 04177 } 04178 } 04179 04180 return state; 04181 }