Changeset 8ebbffb5373202a2665c35c8382f9e9192db3581

Show
Ignore:
Timestamp:
04/17/12 15:46:32 (2 years ago)
Author:
Philip Maddox <pmaddox@circonus.com>
git-committer:
Philip Maddox <pmaddox@circonus.com> 1334677592 +0000
git-parent:

[a3518fe2da96fee5c577d97b77bb7714465165d2]

git-author:
Philip Maddox <pmaddox@circonus.com> 1334677592 +0000
Message:

Added retry limit for noit_watchdog to prevent infinite spawning

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • src/noit_main.c

    rd35e575 r8ebbffb  
    125125  char *glider = (char *)_glider; 
    126126  char *watchdog_timeout_str; 
     127  char *retries = NULL; 
     128  char *span = NULL; 
     129  int retry_val = 5; 
     130  int span_val = 60; 
     131   
    127132    
    128133  /* First initialize logging, so we can log errors */ 
     
    160165  noit_conf_get_string(NULL, appscratch, &trace_dir); 
    161166  if(trace_dir) noit_watchdog_glider_trace_dir(trace_dir); 
     167 
     168  snprintf(appscratch, sizeof(appscratch), "/%s/watchdog/@retries", appname); 
     169  noit_conf_get_string(NULL, appscratch, &retries); 
     170  if(retries) { 
     171    retry_val = atoi(retries); 
     172  } 
     173  snprintf(appscratch, sizeof(appscratch), "/%s/watchdog/@span", appname); 
     174  noit_conf_get_string(NULL, appscratch, &span); 
     175  if(span) { 
     176    span_val = atoi(span); 
     177  } 
    162178 
    163179  /* Lastly, run through all other system inits */ 
     
    227243 
    228244  signal(SIGHUP, SIG_IGN); 
    229   return noit_watchdog_start_child("noitd", passed_child_main, watchdog_timeout); 
    230 } 
     245  return noit_watchdog_start_child("noitd", passed_child_main, watchdog_timeout, retry_val, span_val); 
     246} 
  • src/utils/noit_watchdog.c

    r4741bb2 r8ebbffb  
    4141#include <sys/mman.h> 
    4242#include <signal.h> 
     43#include <time.h> 
    4344#ifdef HAVE_SYS_WAIT_H 
    4445#include <sys/wait.h> 
     
    120121 
    121122int noit_watchdog_start_child(const char *app, int (*func)(), 
    122                               int child_watchdog_timeout) { 
     123                              int child_watchdog_timeout,  
     124                              int retries,  
     125                              int span) { 
    123126  int child_pid; 
     127  retry_data* retry_head = NULL; 
    124128  appname = strdup(app); 
    125129  if(child_watchdog_timeout == 0) 
     
    156160        else if (rv == child_pid) { 
    157161          /* We died!... we need to relaunch, unless the status was a requested exit (2) */ 
     162          int quit; 
    158163          sig = WTERMSIG(status); 
    159164          exit_val = WEXITSTATUS(status); 
    160           if(sig == SIGINT || sig == SIGQUIT || 
     165          quit = update_retries(retries, span, &retry_head); 
     166          if (quit) { 
     167            noitL(noit_error, "noit exceeded retry limit of %d retries in %d seconds... exiting...\n", retries, span); 
     168            exit(0); 
     169          } 
     170          else if(sig == SIGINT || sig == SIGQUIT || 
    161171             (sig == 0 && (exit_val == 2 || exit_val < 0))) { 
    162172            noitL(noit_error, "%s shutdown acknowledged.\n", app); 
     
    184194} 
    185195 
     196int update_retries(int retries, int span, retry_data** data) { 
     197  int count = 0; 
     198  retry_data* iter; 
     199  retry_data* prev = NULL; 
     200  retry_data* new_data = NULL; 
     201  retry_data* temp = NULL; 
     202  time_t curr_time = time(NULL); 
     203 
     204  /* Allocate the new entry and set it to the head of the list */ 
     205  new_data = (retry_data*)malloc(sizeof(retry_data)); 
     206  new_data->event_time = curr_time; 
     207  new_data->next = *data; 
     208  *data = new_data; 
     209 
     210  /* We always want to count the first one, so start on the second element */ 
     211  count = 1; 
     212  iter = (retry_data*)new_data->next; 
     213  prev = new_data; 
     214 
     215  while (iter != NULL) { 
     216    int diff = curr_time - iter->event_time; 
     217    if (diff <= span) { /* Count it, since it's not too old */ 
     218      prev = iter; 
     219      iter = (retry_data*)iter->next; 
     220      count++; 
     221    } 
     222    else { /* Remove node */ 
     223      temp = iter; 
     224      prev->next = iter->next; 
     225      iter = iter->next; 
     226      free(temp); 
     227    } 
     228  } 
     229  if (count >= retries) { 
     230    return 1; 
     231  } 
     232  return 0; 
     233} 
     234 
    186235static int watchdog_tick(eventer_t e, int mask, void *unused, struct timeval *now) { 
    187236  it_ticks(); 
  • src/utils/noit_watchdog.h

    r634b4a3 r8ebbffb  
    3737#include "noit_defines.h" 
    3838 
     39typedef struct{ 
     40    time_t event_time; 
     41    void* next; 
     42} __attribute__ ((packed)) retry_data; 
     43 
    3944/*! \fn int noit_watchdog_prefork_init() 
    4045    \brief Prepare the program to split into a child/parent-monitor relationship. 
     
    4853  noit_watchdog_prefork_init(); 
    4954 
    50 /*! \fn int noit_watchdog_start_child(const char *app, int (*func)(), int timeout) 
     55/*! \fn int update_retries(int retries, int span, retry_data** data) 
     56    \brief Updates the list of retries and signals to quit if the limit is exceeded 
     57    \param retries The number of times to attempt to restart the task within a certain span of time 
     58    \param span The amount of time in seconds to measure attempts to restart the task over 
     59    \param data A pointer to the list of event data 
     60    \return Returns 1 to signal a quit, 0 otherwise 
     61 
     62
     63 
     64    update_retries will iterate through a list of times the task has restarted. If it determines that the system has been restarted too many times in too short a period, it will return 1 and reconnoiter will terminate. Otherwise, it will return 0 and reconnoiter will restart. 
     65 */ 
     66 
     67API_EXPORT(int) 
     68  update_retries(int retries, int span, retry_data** data); 
     69 
     70/*! \fn int noit_watchdog_start_child(const char *app, int (*func)(), int timeout, int retries, int span) 
    5171    \brief Starts a function as a separate child under close watch. 
    5272    \param app The name of the application (for error output). 
    5373    \param func The function that will be the child process. 
    5474    \param timeout The number of seconds of lifelessness before the parent reaps and restarts the child. 
     75    \param retries The number of times to attempt to restart the task within a certain span of time 
     76    \param span The amount of time in seconds to measure attempts to restart the task over 
    5577    \return Returns on program termination. 
    5678. 
     
    5981 */ 
    6082API_EXPORT(int) 
    61   noit_watchdog_start_child(const char *app, int (*func)(), int timeout); 
     83  noit_watchdog_start_child(const char *app, int (*func)(), int timeout, int retries, int span); 
    6284 
    6385/*! \fn int noit_watchdog_child_heartbeat()