root/src/utils/noit_watchdog.c

Revision 23fef54d7918dadb6d28d39c62180971829328bc, 5.9 kB (checked in by Theo Schlossnagle <jesus@omniti.com>, 3 years ago)

catch abort too

  • Property mode set to 100644
Line 
1 /*
2  * Copyright (c) 2007-2009, OmniTI Computer Consulting, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are
7  * met:
8  *
9  *     * Redistributions of source code must retain the above copyright
10  *       notice, this list of conditions and the following disclaimer.
11  *     * Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *     * Neither the name OmniTI Computer Consulting, Inc. nor the names
16  *       of its contributors may be used to endorse or promote products
17  *       derived from this software without specific prior written
18  *       permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 #include "noit_defines.h"
33
34 #include <assert.h>
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <sys/ioctl.h>
40 #include <fcntl.h>
41 #include <sys/mman.h>
42 #include <signal.h>
43 #ifdef HAVE_SYS_WAIT_H
44 #include <sys/wait.h>
45 #endif
46
47 #include "eventer/eventer.h"
48 #include "utils/noit_log.h"
49 #include "utils/noit_watchdog.h"
50
/* Default heartbeat timeout, substituted when a caller passes 0 as the
 * timeout to noit_watchdog_start_child(). */
#define CHILD_WATCHDOG_TIMEOUT 5 /*seconds*/
/* Application name used in trace file names; replaced with a copy of the
 * name handed to noit_watchdog_start_child(). */
const char *appname = "unknown";
/* Command run_glider() executes to trace a process; while NULL, no glider
 * is run and no fault handlers are installed in the child. */
const char *glider_path = NULL;
/* Directory in which run_glider() creates its "<appname>.<pid>.trc" files. */
const char *trace_dir = "/var/tmp";
55
56 void noit_watchdog_glider(const char *path) {
57   glider_path = path;
58   if(glider_path)
59     noitL(noit_error, "Setting watchdog glider to '%s'\n", glider_path);
60 }
61 void noit_watchdog_glider_trace_dir(const char *path) {
62   trace_dir = path;
63 }
64
65 /* Watchdog stuff */
/* Heartbeat counter.  NULL until noit_watchdog_prefork_init() points it at
 * an anonymous MAP_SHARED mapping; the tick helpers below write through it
 * and last_tick_time() reads it. */
static int *lifeline = NULL;
67 static unsigned long last_tick_time() {
68   static struct timeval lastchange = { 0, 0 };
69   static int lastcheck = 0;
70   struct timeval now, diff;
71
72   gettimeofday(&now, NULL);
73   if(lastcheck != *lifeline) {
74     lastcheck = *lifeline;
75     memcpy(&lastchange, &now, sizeof(lastchange));
76   }
77   if(lastchange.tv_sec == 0) return 0;
78
79   sub_timeval(now, lastchange, &diff);
80   return (unsigned long)diff.tv_sec;
81 }
82 static void it_ticks_zero() {
83   (*lifeline) = 0;
84 }
85 static void it_ticks() {
86   (*lifeline)++;
87 }
/*
 * Emit a single heartbeat on behalf of the monitored child.
 *
 * Advances the shared heartbeat counter once.  Always returns 0.
 */
int noit_watchdog_child_heartbeat() {
  it_ticks();
  return 0;
}
92 int noit_watchdog_prefork_init() {
93   lifeline = (int *)mmap(NULL, sizeof(int), PROT_READ|PROT_WRITE,
94                          MAP_SHARED|MAP_ANON, -1, 0);
95   if(lifeline == (void *)-1) {
96     noitL(noit_error, "Failed to mmap anon for watchdog\n");
97     return -1;
98   }
99   (*lifeline) = 0;
100   return 0;
101 }
102
/* PID used by glideme() as the process to trace and re-signal.  Starts at
 * -1; the child forked by noit_watchdog_start_child() stores its own PID
 * here. */
int noit_monitored_child_pid = -1;
104
105 void run_glider(int pid) {
106   char cmd[1024], unused;
107   if(glider_path) {
108     snprintf(cmd, sizeof(cmd), "%s %d > %s/%s.%d.trc",
109              glider_path, pid, trace_dir, appname, pid);
110     unused = system(cmd);
111   }
112 }
113
114 void glideme(int sig) {
115   signal(sig, SIG_DFL);
116   run_glider(noit_monitored_child_pid);
117   kill(noit_monitored_child_pid, sig);
118 }
119
120 int noit_watchdog_start_child(const char *app, int (*func)(),
121                               int child_watchdog_timeout) {
122   int child_pid;
123   appname = strdup(app);
124   if(child_watchdog_timeout == 0)
125     child_watchdog_timeout = CHILD_WATCHDOG_TIMEOUT;
126   while(1) {
127     child_pid = fork();
128     if(child_pid == -1) {
129       noitL(noit_error, "fork failed: %s\n", strerror(errno));
130       exit(-1);
131     }
132     if(child_pid == 0) {
133       /* This sets up things so we start alive */
134       it_ticks_zero();
135       /* trace handlers */
136       noit_monitored_child_pid = getpid();
137       if(glider_path) {
138         noitL(noit_error, "catching faults with glider\n");
139         signal(SIGSEGV, glideme);
140         signal(SIGABRT, glideme);
141       }
142       /* run the program */
143       exit(func());
144     }
145     else {
146       int sig = -1, exit_val = -1;
147       while(1) {
148         unsigned long ltt;
149         int status, rv;
150         sleep(1); /* Just check child status every second */
151         rv = waitpid(child_pid, &status, WNOHANG);
152         if(rv == 0) {
153           /* Nothing */
154         }
155         else if (rv == child_pid) {
156           /* We died!... we need to relaunch, unless the status was a requested exit (2) */
157           sig = WTERMSIG(status);
158           exit_val = WEXITSTATUS(status);
159           if(sig == SIGINT || sig == SIGQUIT ||
160              (sig == 0 && (exit_val == 2 || exit_val < 0))) {
161             noitL(noit_error, "%s shutdown acknowledged.\n", app);
162             exit(0);
163           }
164           break;
165         }
166         else {
167           noitL(noit_error, "Unexpected return from waitpid: %d\n", rv);
168           exit(-1);
169         }
170         /* Now check out timeout */
171         if((ltt = last_tick_time()) > child_watchdog_timeout) {
172           noitL(noit_error,
173                 "Watchdog timeout (%lu s)... terminating child\n",
174                 ltt);
175           run_glider(child_pid);
176           kill(child_pid, SIGKILL);
177         }
178       }
179       noitL(noit_error, "%s child died [%d/%d], restarting.\n",
180             app, exit_val, sig);
181     }
182   }
183 }
184
185 static int watchdog_tick(eventer_t e, int mask, void *unused, struct timeval *now) {
186   it_ticks();
187   return 0;
188 }
189 int noit_watchdog_child_eventer_heartbeat() {
190   eventer_t e;
191
192   assert(__eventer);
193
194  /* Setup our hearbeat */
195   e = eventer_alloc();
196   e->mask = EVENTER_RECURRENT;
197   e->callback = watchdog_tick;
198   eventer_add_recurrent(e);
199
200   return 0;
201 }
202
Note: See TracBrowser for help on using the browser.