root/src/noit_check.c

Revision 7b927621aa1d1fb7e9e915e74ab2a86ce90f73ac, 18.3 kB (checked in by Theo Schlossnagle <jesus@omniti.com>, 7 years ago)

Make the metrics handling more flexible; set up check/status/metrics logging.

  • Property mode set to 100644
Line 
1 /*
2  * Copyright (c) 2007, OmniTI Computer Consulting, Inc.
3  * All rights reserved.
4  */
5
6 #include "noit_defines.h"
7
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <assert.h>
12 #include <netinet/in.h>
13 #include <arpa/inet.h>
14
15 #include "utils/noit_log.h"
16 #include "utils/noit_hash.h"
17 #include "utils/noit_skiplist.h"
18 #include "noit_conf.h"
19 #include "noit_check.h"
20 #include "noit_module.h"
21 #include "noit_console.h"
22 #include "eventer/eventer.h"
23
24 /* 60 seconds of possible stutter */
25 #define MAX_INITIAL_STUTTER (60*1000)
26
27 static noit_hash_table polls = NOIT_HASH_EMPTY;
28 static noit_skiplist polls_by_name = { 0 };
29 static u_int32_t __config_load_generation = 0;
30 struct uuid_dummy {
31   uuid_t foo;
32 };
33
34 static void register_console_check_commands();
35
36 #define UUID_SIZE sizeof(struct uuid_dummy)
37
38 static const char *
39 __noit_check_available_string(int16_t available) {
40   switch(available) {
41     case NP_AVAILABLE:    return "available";
42     case NP_UNAVAILABLE:  return "unavailable";
43     case NP_UNKNOWN:      return "unknown";
44   }
45   return "???";
46 }
47 static const char *
48 __noit_check_state_string(int16_t state) {
49   switch(state) {
50     case NP_GOOD:         return "good";
51     case NP_BAD:          return "bad";
52     case NP_UNKNOWN:      return "unknown";
53   }
54   return "???";
55 }
56 static int __check_name_compare(void *a, void *b) {
57   noit_check_t *ac = a;
58   noit_check_t *bc = b;
59   int rv;
60   if((rv = strcmp(ac->target, bc->target)) != 0) return rv;
61   if((rv = strcmp(ac->name, bc->name)) != 0) return rv;
62   return 0;
63 }
64 int
65 noit_check_max_initial_stutter() {
66   return MAX_INITIAL_STUTTER;
67 }
68 void
69 noit_check_fake_last_check(noit_check_t *check,
70                            struct timeval *lc, struct timeval *_now) {
71   struct timeval now, period;
72   double r;
73   int offset;
74
75   r = drand48();
76   offset = r * (MIN(MAX_INITIAL_STUTTER, check->period));
77   period.tv_sec = (check->period - offset) / 1000;
78   period.tv_usec = ((check->period - offset) % 1000) * 1000;
79   if(!_now) {
80     gettimeofday(&now, NULL);
81     _now = &now;
82   }
83   sub_timeval(*_now, period, lc);
84 }
85 void
86 noit_poller_process_checks(const char *xpath) {
87   int i, flags, cnt = 0;
88   noit_conf_section_t *sec;
89   __config_load_generation++;
90   sec = noit_conf_get_sections(NULL, xpath, &cnt);
91   for(i=0; i<cnt; i++) {
92     noit_check_t *existing_check;
93     char uuid_str[37];
94     char target[256];
95     char module[256];
96     char name[256];
97     char oncheck[1024] = "";
98     int no_period = 0;
99     int no_oncheck = 0;
100     int period = 0, timeout = 0;
101     noit_conf_boolean disabled = noit_false, busted = noit_false;
102     uuid_t uuid, out_uuid;
103     noit_hash_table *options;
104
105 #define NEXT(...) noitL(noit_stderr, __VA_ARGS__); continue
106 #define MYATTR(type,a,...) noit_conf_get_##type(sec[i], "@" #a, __VA_ARGS__)
107 #define INHERIT(type,a,...) \
108   noit_conf_get_##type(sec[i], "ancestor-or-self::node()/@" #a, __VA_ARGS__)
109
110     if(!MYATTR(stringbuf, uuid, uuid_str, sizeof(uuid_str))) {
111       noitL(noit_stderr, "check %d has no uuid\n", i+1);
112       continue;
113     }
114
115     if(uuid_parse(uuid_str, uuid)) {
116       noitL(noit_stderr, "check uuid: '%s' is invalid\n", uuid_str);
117       continue;
118     }
119
120     if(!INHERIT(stringbuf, target, target, sizeof(target))) {
121       noitL(noit_stderr, "check uuid: '%s' has no target\n", uuid_str);
122       busted = noit_true;
123     }
124     if(!INHERIT(stringbuf, module, module, sizeof(module))) {
125       noitL(noit_stderr, "check uuid: '%s' has no module\n", uuid_str);
126       busted = noit_true;
127     }
128
129     if(!MYATTR(stringbuf, name, name, sizeof(name)))
130       strlcpy(name, module, sizeof(name));
131
132     if(!INHERIT(int, period, &period) || period == 0)
133       no_period = 1;
134
135     if(!INHERIT(stringbuf, oncheck, oncheck, sizeof(oncheck)) || !oncheck[0])
136       no_oncheck = 1;
137
138     if(no_period && no_oncheck) {
139       noitL(noit_stderr, "check uuid: '%s' has neither period nor oncheck\n",
140             uuid_str);
141       busted = noit_true;
142     }
143     if(!(no_period || no_oncheck)) {
144       noitL(noit_stderr, "check uuid: '%s' has oncheck and period.\n",
145             uuid_str);
146       busted = noit_true;
147     }
148     if(!INHERIT(int, timeout, &timeout)) {
149       noitL(noit_stderr, "check uuid: '%s' has no timeout\n", uuid_str);
150       busted = noit_true;
151     }
152     if(!no_period && timeout >= period) {
153       noitL(noit_stderr, "check uuid: '%s' timeout > period\n", uuid_str);
154       timeout = period/2;
155     }
156     options = noit_conf_get_hash(sec[i], "ancestor-or-self::node()/config/*");
157
158     INHERIT(boolean, disable, &disabled);
159     flags = 0;
160     if(busted) flags |= NP_UNCONFIG;
161     if(disabled) flags |= NP_DISABLED;
162
163     if(noit_hash_retrieve(&polls, (char *)uuid, UUID_SIZE,
164                           (void **)&existing_check)) {
165       /* Once set, we can never change it. */
166       assert(!existing_check->module || !existing_check->module[0] ||
167              !strcmp(existing_check->module, module));
168       /* Only set it if it is not yet set */
169       if(!existing_check->module || !existing_check->module[0]) {
170         if(existing_check->module) free(existing_check->module);
171         existing_check->module = strdup(module);
172       }
173       noit_check_update(existing_check, target, name, options,
174                            period, timeout, oncheck[0] ? oncheck : NULL,
175                            flags);
176       noitL(noit_debug, "reloaded uuid: %s\n", uuid_str);
177     }
178     else {
179       noit_poller_schedule(target, module, name, options,
180                            period, timeout, oncheck[0] ? oncheck : NULL,
181                            flags, uuid, out_uuid);
182       noitL(noit_debug, "loaded uuid: %s\n", uuid_str);
183     }
184   }
185 }
186
187 void
188 noit_poller_initiate() {
189   noit_hash_iter iter = NOIT_HASH_ITER_ZERO;
190   uuid_t key_id;
191   int klen;
192   noit_check_t *check;
193   while(noit_hash_next(&polls, &iter, (const char **)key_id, &klen,
194                        (void **)&check)) {
195     noit_module_t *mod;
196     mod = noit_module_lookup(check->module);
197     if(mod) {
198       if(NOIT_CHECK_LIVE(check))
199         continue;
200       if((check->flags & NP_DISABLED) == 0)
201         mod->initiate_check(mod, check, 0, NULL);
202       else
203         noitL(noit_debug, "Skipping %s`%s, disabled.\n",
204               check->target, check->name);
205     }
206     else {
207       noitL(noit_stderr, "Cannot find module '%s'\n", check->module);
208       check->flags |= NP_DISABLED;
209     }
210   }
211 }
212
213 void
214 noit_poller_flush_epoch(int oldest_allowed) {
215   noit_hash_iter iter = NOIT_HASH_ITER_ZERO;
216   uuid_t key_id;
217   int klen;
218   noit_check_t *check, *tofree = NULL;
219
220   /* Cleanup any previous causal map */
221   while(noit_hash_next(&polls, &iter, (const char **)key_id, &klen,
222                        (void **)&check)) {
223     /* We don't free the one we're looking at... we free it on the next
224      * pass.  This leaves out iterator in good shape.  We just need to
225      * remember to free it one last time outside the while loop, down...
226      */
227     if(tofree) {
228       noit_poller_deschedule(tofree->checkid);
229       tofree = NULL;
230     }
231     if(check->generation < oldest_allowed) {
232       tofree = check;
233     }
234   }
235   /* ... here */
236   if(tofree) noit_poller_deschedule(tofree->checkid);
237 }
238
239 void
240 noit_poller_make_causal_map() {
241   noit_hash_iter iter = NOIT_HASH_ITER_ZERO;
242   uuid_t key_id;
243   int klen;
244   noit_check_t *check, *parent;
245
246   /* Cleanup any previous causal map */
247   while(noit_hash_next(&polls, &iter, (const char **)key_id, &klen,
248                        (void **)&check)) {
249     dep_list_t *dep;
250     while((dep = check->causal_checks) != NULL) {
251       check->causal_checks = dep->next;
252       free(dep);
253     }
254   }
255
256   memset(&iter, 0, sizeof(iter));
257   /* Walk all checks and add check dependencies to their parents */
258   while(noit_hash_next(&polls, &iter, (const char **)key_id, &klen,
259                        (void **)&check)) {
260     if(check->oncheck) {
261       /* This service is causally triggered by another service */
262       char fullcheck[1024];
263       char *name = check->oncheck;
264       char *target = NULL;
265
266       noitL(noit_debug, "Searching for upstream trigger on %s\n", name);
267       if((target = strchr(check->oncheck, '`')) != NULL) {
268         strlcpy(fullcheck, check->oncheck, target - check->oncheck);
269         name = target + 1;
270         target = fullcheck;
271       }
272       else
273        target = check->target;
274
275       parent = noit_poller_lookup_by_name(target, name);
276       if(!parent) {
277         check->flags |= NP_DISABLED;
278         noitL(noit_stderr, "Disabling check %s`%s, can't find oncheck %s`%s\n",
279               check->target, check->name, target, name);
280       }
281       else {
282         dep_list_t *dep;
283         dep = malloc(sizeof(*dep));
284         dep->check = check;
285         dep->next = parent->causal_checks;
286         parent->causal_checks = dep;
287         noitL(noit_debug, "Causal map %s`%s --> %s`%s\n",
288               parent->target, parent->name, check->target, check->name);
289       }
290     }
291   }
292 }
293 void
294 noit_poller_reload(const char *xpath)
295 {
296   noit_poller_process_checks(xpath ? xpath : "/noit/checks//check");
297   if(!xpath) {
298     /* Full reload, we need to wipe old checks */
299     noit_poller_flush_epoch(__config_load_generation);
300   }
301   noit_poller_make_causal_map();
302   noit_poller_initiate();
303 }
304 void
305 noit_poller_init() {
306   noit_skiplist_init(&polls_by_name);
307   noit_skiplist_set_compare(&polls_by_name, __check_name_compare,
308                             __check_name_compare);
309   register_console_check_commands();
310   noit_poller_reload(NULL);
311 }
312
313 int
314 noit_check_update(noit_check_t *new_check,
315                   const char *target,
316                   const char *name,
317                   noit_hash_table *config,
318                   u_int32_t period,
319                   u_int32_t timeout,
320                   const char *oncheck,
321                   int flags) {
322   int8_t family;
323   int rv;
324   int mask = NP_DISABLED | NP_UNCONFIG;
325   union {
326     struct in_addr addr4;
327     struct in6_addr addr6;
328   } a;
329
330
331   family = AF_INET;
332   rv = inet_pton(family, target, &a);
333   if(rv != 1) {
334     family = AF_INET6;
335     rv = inet_pton(family, target, &a);
336     if(rv != 1) {
337       noitL(noit_stderr, "Cannot translate '%s' to IP\n", target);
338       memset(&a, 0, sizeof(a));
339       flags |= (NP_UNCONFIG & NP_DISABLED);
340     }
341   }
342
343   new_check->generation = __config_load_generation;
344   new_check->target_family = family;
345   memcpy(&new_check->target_addr, &a, sizeof(a));
346   if(new_check->target) free(new_check->target);
347   new_check->target = strdup(target);
348   if(new_check->name) free(new_check->name);
349   new_check->name = name ? strdup(name): NULL;
350
351   if(config != NULL) {
352     noit_hash_iter iter = NOIT_HASH_ITER_ZERO;
353     const char *k;
354     int klen;
355     void *data;
356     if(new_check->config) noit_hash_delete_all(new_check->config, free, free);
357     else new_check->config = calloc(1, sizeof(*new_check->config));
358     while(noit_hash_next(config, &iter, &k, &klen, &data)) {
359       noit_hash_store(new_check->config, strdup(k), klen, strdup((char *)data));
360     }
361   }
362   if(new_check->oncheck) free(new_check->oncheck);
363   new_check->oncheck = oncheck ? strdup(oncheck) : NULL;
364   new_check->period = period;
365   new_check->timeout = timeout;
366
367   /* Unset what could be set.. then set what should be set */
368   new_check->flags = (new_check->flags & ~mask) | flags;
369
370   /* This remove could fail -- no big deal */
371   noit_skiplist_remove(&polls_by_name, new_check, NULL);
372
373   /* This insert could fail.. which means we have a conflict on
374    * target`name.  That should result in the check being disabled. */
375   if(!noit_skiplist_insert(&polls_by_name, new_check)) {
376     noitL(noit_stderr, "Check %s`%s disabled due to naming conflict\n",
377           new_check->target, new_check->name);
378     new_check->flags |= NP_DISABLED;
379   }
380   noit_check_log_check(new_check);
381   return 0;
382 }
383 int
384 noit_poller_schedule(const char *target,
385                      const char *module,
386                      const char *name,
387                      noit_hash_table *config,
388                      u_int32_t period,
389                      u_int32_t timeout,
390                      const char *oncheck,
391                      int flags,
392                      uuid_t in,
393                      uuid_t out) {
394   noit_check_t *new_check;
395   new_check = calloc(1, sizeof(*new_check));
396   if(!new_check) return -1;
397
398   /* The module and the UUID can never be changed */
399   new_check->module = strdup(module);
400   if(uuid_is_null(in))
401     uuid_generate(new_check->checkid);
402   else
403     uuid_copy(new_check->checkid, in);
404
405   noit_check_update(new_check, target, name, config,
406                     period, timeout, oncheck, flags);
407   assert(noit_hash_store(&polls,
408                          (char *)new_check->checkid, UUID_SIZE,
409                          new_check));
410   uuid_copy(out, new_check->checkid);
411
412   return 0;
413 }
414
415 int
416 noit_poller_deschedule(uuid_t in) {
417   noit_check_t *checker;
418   noit_module_t *mod;
419   if(noit_hash_retrieve(&polls,
420                         (char *)in, UUID_SIZE,
421                         (void **)&checker) == 0) {
422     return -1;
423   }
424   if(checker->flags & NP_RUNNING) {
425     checker->flags |= NP_KILLED;
426     return 0;
427   }
428   checker->flags |= NP_KILLED;
429
430   noit_skiplist_remove(&polls_by_name, checker, NULL);
431   noit_hash_delete(&polls, (char *)in, UUID_SIZE, NULL, NULL);
432
433   mod = noit_module_lookup(checker->module);
434   mod->cleanup(mod, checker);
435   if(checker->fire_event) {
436      eventer_remove(checker->fire_event);
437      eventer_free(checker->fire_event);
438      checker->fire_event = NULL;
439   }
440
441   if(checker->target) free(checker->target);
442   if(checker->module) free(checker->module);
443   if(checker->name) free(checker->name);
444   if(checker->config) {
445     noit_hash_destroy(checker->config, free, free);
446     free(checker->config);
447     checker->config = NULL;
448   }
449   free(checker);
450   return 0;
451 }
452
453 noit_check_t *
454 noit_poller_lookup(uuid_t in) {
455   noit_check_t *check;
456   if(noit_hash_retrieve(&polls,
457                         (char *)in, UUID_SIZE,
458                         (void **)&check)) {
459     return check;
460   }
461   return NULL;
462 }
463 noit_check_t *
464 noit_poller_lookup_by_name(char *target, char *name) {
465   noit_check_t *check, *tmp_check;
466   tmp_check = calloc(1, sizeof(*tmp_check));
467   tmp_check->target = target;
468   tmp_check->name = name;
469   check = noit_skiplist_find(&polls_by_name, tmp_check, NULL);
470   free(tmp_check);
471   return check;
472 }
473
474 void
475 noit_check_stats_clear(stats_t *s) {
476   memset(s, 0, sizeof(*s));
477   s->state = NP_UNKNOWN;
478   s->available = NP_UNKNOWN;
479 }
480 static void
481 __free_metric(void *vm) {
482   metric_t *m = vm;
483   free(m->metric_name);
484   if(m->metric_value.i) free(m->metric_value.i);
485 }
486
487 void
488 __stats_add_metric(stats_t *newstate, metric_t *m) {
489   noit_hash_replace(&newstate->metrics, m->metric_name, strlen(m->metric_name),
490                     m, NULL, __free_metric);
491 }
492
493 static size_t
494 noit_metric_sizes(metric_type_t type, void *value) {
495   switch(type) {
496     case METRIC_INT32:
497     case METRIC_UINT32:
498       return sizeof(int32_t);
499     case METRIC_INT64:
500     case METRIC_UINT64:
501       return sizeof(int64_t);
502     case METRIC_DOUBLE:
503       return sizeof(double);
504     case METRIC_STRING:
505       return strlen((char *)value) + 1;
506     case METRIC_GUESS:
507       break;
508   }
509   assert(type != type);
510   return 0;
511 }
512 static metric_type_t
513 noit_metric_guess_type(const char *s) {
514   if(!s) return METRIC_GUESS;
515   return METRIC_STRING;
516 }
517 void
518 noit_stats_set_metric(stats_t *newstate, char *name, metric_type_t type,
519                       void *value) {
520   metric_t *m;
521   if(type == METRIC_GUESS) type = noit_metric_guess_type((char *)value);
522   if(type == METRIC_GUESS) return;
523
524   m = calloc(1, sizeof(*m));
525   m->metric_name = strdup(name);
526   m->metric_type = type;
527   if(value) {
528     size_t len;
529     len = noit_metric_sizes(type, value);
530     m->metric_value.vp = calloc(1, len);
531     memcpy(m->metric_value.vp, value, len);
532   }
533   __stats_add_metric(newstate, m);
534 }
535
536 void
537 noit_check_set_stats(struct _noit_module *module,
538                      noit_check_t *check, stats_t *newstate) {
539   int report_change = 0;
540   dep_list_t *dep;
541   if(check->stats.previous.status)
542     free(check->stats.previous.status);
543   noit_hash_destroy(&check->stats.previous.metrics, NULL, __free_metric);
544   memcpy(&check->stats.previous, &check->stats.current, sizeof(stats_t));
545   memcpy(&check->stats.current, newstate, sizeof(stats_t));
546   if(check->stats.current.status)
547     check->stats.current.status = strdup(check->stats.current.status);
548
549   /* check for state changes */
550   if(check->stats.current.available != NP_UNKNOWN &&
551      check->stats.previous.available != NP_UNKNOWN &&
552      check->stats.current.available != check->stats.previous.available)
553     report_change = 1;
554   if(check->stats.current.state != NP_UNKNOWN &&
555      check->stats.previous.state != NP_UNKNOWN &&
556      check->stats.current.state != check->stats.previous.state)
557     report_change = 1;
558
559   noitL(noit_error, "%s`%s <- [%s]\n", check->target, check->name,
560         check->stats.current.status);
561   if(report_change) {
562     noitL(noit_error, "%s`%s -> [%s:%s]\n",
563           check->target, check->name,
564           __noit_check_available_string(check->stats.current.available),
565           __noit_check_state_string(check->stats.current.state));
566   }
567
568   /* Write out our status */
569   noit_check_log_status(check);
570   /* Write out all metrics */
571   noit_check_log_metrics(check);
572
573   for(dep = check->causal_checks; dep; dep = dep->next) {
574     noit_module_t *mod;
575     mod = noit_module_lookup(dep->check->module);
576     assert(mod);
577     noitL(noit_debug, "Firing %s`%s in response to %s`%s\n",
578           dep->check->target, dep->check->name,
579           check->target, check->name);
580     mod->initiate_check(mod, dep->check, 1, check);
581   }
582 }
583
584 static void
585 nc_printf_check_brief(noit_console_closure_t ncct,
586                       noit_check_t *check) {
587   char out[512];
588   char uuid_str[37];
589   snprintf(out, sizeof(out), "%s`%s", check->target, check->name);
590   uuid_unparse_lower(check->checkid, uuid_str);
591   nc_printf(ncct, "%s %s\n", uuid_str, out);
592   if(check->stats.current.status)
593     nc_printf(ncct, "\t%s\n", check->stats.current.status);
594 }
595
596 static int
597 noit_console_show_checks(noit_console_closure_t ncct,
598                          int argc, char **argv,
599                          noit_console_state_t *dstate,
600                          void *closure) {
601   struct timeval _now;
602   noit_hash_iter iter = NOIT_HASH_ITER_ZERO;
603   uuid_t key_id;
604   int klen;
605   noit_check_t *check;
606
607   gettimeofday(&_now, NULL);
608   while(noit_hash_next(&polls, &iter, (const char **)key_id, &klen,
609                        (void **)&check)) {
610     nc_printf_check_brief(ncct, check);
611   }
612   return 0;
613 }
614
615 static void
616 register_console_check_commands() {
617   noit_console_state_t *tl;
618   cmd_info_t *showcmd;
619
620   tl = noit_console_state_initial();
621   showcmd = noit_console_state_get_cmd(tl, "show");
622   assert(showcmd && showcmd->dstate);
623
624   noit_console_state_add_cmd(showcmd->dstate,
625     NCSCMD("checks", noit_console_show_checks, NULL, NULL));
626 }
627
Note: See TracBrowser for help on using the browser.