root/src/utils/noit_watchdog.c

Revision e9dea15c596d028be11696594b9a5653665a4d2d, 5.9 kB (checked in by Theo Schlossnagle <jesus@omniti.com>, 7 years ago)

run the glider when we watchdog too

  • Property mode set to 100644
Line 
1 /*
2  * Copyright (c) 2007-2009, OmniTI Computer Consulting, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are
7  * met:
8  *
9  *     * Redistributions of source code must retain the above copyright
10  *       notice, this list of conditions and the following disclaimer.
11  *     * Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *     * Neither the name OmniTI Computer Consulting, Inc. nor the names
16  *       of its contributors may be used to endorse or promote products
17  *       derived from this software without specific prior written
18  *       permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 #include "noit_defines.h"
33
34 #include <assert.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/ioctl.h>
40 #include <fcntl.h>
41 #include <sys/mman.h>
42 #include <signal.h>
43 #ifdef HAVE_SYS_WAIT_H
44 #include <sys/wait.h>
45 #endif
46
47 #include "eventer/eventer.h"
48 #include "utils/noit_log.h"
49 #include "utils/noit_watchdog.h"
50
/* Watchdog timeout applied when noit_watchdog_start_child() is given 0. */
#define CHILD_WATCHDOG_TIMEOUT 5 /*seconds*/

/* Application name used in trace file names; replaced by
 * noit_watchdog_start_child(). */
const char *appname = "unknown";
/* Directory that receives the glider's trace output. */
const char *trace_dir = "/var/tmp";
/* Command run against a pid to capture a trace; NULL means no glider. */
const char *glider_path = NULL;
55
56 void noit_watchdog_glider(const char *path) {
57   glider_path = path;
58   if(glider_path)
59     noitL(noit_error, "Setting watchdog glider to '%s'\n", glider_path);
60 }
61 void noit_watchdog_glider_trace_dir(const char *path) {
62   trace_dir = path;
63 }
64
/* Watchdog stuff */

/* Heartbeat counter.  NULL until noit_watchdog_prefork_init() maps it. */
static int *lifeline = NULL;
67 static unsigned long last_tick_time() {
68   static struct timeval lastchange = { 0, 0 };
69   static int lastcheck = 0;
70   struct timeval now, diff;
71
72   gettimeofday(&now, NULL);
73   if(lastcheck != *lifeline) {
74     lastcheck = *lifeline;
75     memcpy(&lastchange, &now, sizeof(lastchange));
76   }
77   if(lastchange.tv_sec == 0) return 0;
78
79   sub_timeval(now, lastchange, &diff);
80   return (unsigned long)diff.tv_sec;
81 }
82 static void it_ticks_zero() {
83   (*lifeline) = 0;
84 }
85 static void it_ticks() {
86   (*lifeline)++;
87 }
/*
 * Record a single heartbeat by advancing the heartbeat counter.
 * Always returns 0.
 */
int noit_watchdog_child_heartbeat() {
  it_ticks();

  return 0;
}
92 int noit_watchdog_prefork_init() {
93   lifeline = (int *)mmap(NULL, sizeof(int), PROT_READ|PROT_WRITE,
94                          MAP_SHARED|MAP_ANON, -1, 0);
95   if(lifeline == (void *)-1) {
96     noitL(noit_error, "Failed to mmap anon for watchdog\n");
97     return -1;
98   }
99   (*lifeline) = 0;
100   return 0;
101 }
102
/* Pid that glideme() traces and signals; the forked child sets it to its
 * own pid.  -1 until then. */
int noit_monitored_child_pid = -1;
104
105 void run_glider(int pid) {
106   char cmd[1024], unused;
107   if(glider_path) {
108     snprintf(cmd, sizeof(cmd), "%s %d > %s/%s.%d.trc",
109              glider_path, pid, trace_dir, appname, pid);
110     unused = system(cmd);
111   }
112 }
113
114 void glideme(int sig) {
115   signal(sig, SIG_DFL);
116   run_glider(noit_monitored_child_pid);
117   kill(noit_monitored_child_pid, sig);
118 }
119
120 int noit_watchdog_start_child(const char *app, int (*func)(),
121                               int child_watchdog_timeout) {
122   int child_pid;
123   appname = strdup(app);
124   if(child_watchdog_timeout == 0)
125     child_watchdog_timeout = CHILD_WATCHDOG_TIMEOUT;
126   while(1) {
127     child_pid = fork();
128     if(child_pid == -1) {
129       noitL(noit_error, "fork failed: %s\n", strerror(errno));
130       exit(-1);
131     }
132     if(child_pid == 0) {
133       /* This sets up things so we start alive */
134       it_ticks_zero();
135       /* trace handlers */
136       noit_monitored_child_pid = getpid();
137       if(glider_path) {
138         noitL(noit_error, "catching faults with glider\n");
139         signal(SIGSEGV, glideme);
140       }
141       /* run the program */
142       exit(func());
143     }
144     else {
145       int sig = -1, exit_val = -1;
146       while(1) {
147         unsigned long ltt;
148         int status, rv;
149         sleep(1); /* Just check child status every second */
150         rv = waitpid(child_pid, &status, WNOHANG);
151         if(rv == 0) {
152           /* Nothing */
153         }
154         else if (rv == child_pid) {
155           /* We died!... we need to relaunch, unless the status was a requested exit (2) */
156           sig = WTERMSIG(status);
157           exit_val = WEXITSTATUS(status);
158           if(sig == SIGINT || sig == SIGQUIT ||
159              (sig == 0 && (exit_val == 2 || exit_val < 0))) {
160             noitL(noit_error, "%s shutdown acknowledged.\n", app);
161             exit(0);
162           }
163           break;
164         }
165         else {
166           noitL(noit_error, "Unexpected return from waitpid: %d\n", rv);
167           exit(-1);
168         }
169         /* Now check out timeout */
170         if((ltt = last_tick_time()) > child_watchdog_timeout) {
171           noitL(noit_error,
172                 "Watchdog timeout (%lu s)... terminating child\n",
173                 ltt);
174           run_glider(child_pid);
175           kill(child_pid, SIGKILL);
176         }
177       }
178       noitL(noit_error, "%s child died [%d/%d], restarting.\n",
179             app, exit_val, sig);
180     }
181   }
182 }
183
184 static int watchdog_tick(eventer_t e, int mask, void *unused, struct timeval *now) {
185   it_ticks();
186   return 0;
187 }
188 int noit_watchdog_child_eventer_heartbeat() {
189   eventer_t e;
190
191   assert(__eventer);
192
193  /* Setup our hearbeat */
194   e = eventer_alloc();
195   e->mask = EVENTER_RECURRENT;
196   e->callback = watchdog_tick;
197   eventer_add_recurrent(e);
198
199   return 0;
200 }
201
Note: See TracBrowser for help on using the browser.