From eb8d1b88adcb63b12986855f40a454e2c59d0b9c Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Mon, 21 Sep 2020 18:41:44 +0200 Subject: [PATCH] Add watchdog support. * src/cuirass/watchdog.scm: New file. * Makefile.am (dist_pkgmodule_DATA): Add it. * src/cuirass/utils.scm (with-timeout, get-message-with-timeout): Export them. * bin/cuirass.in (main): Start the watchdog. --- Makefile.am | 5 ++- bin/cuirass.in | 3 +- src/cuirass/utils.scm | 3 ++ src/cuirass/watchdog.scm | 88 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 src/cuirass/watchdog.scm diff --git a/Makefile.am b/Makefile.am index 60b1e24..a575755 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,8 +50,9 @@ dist_pkgmodule_DATA = \ src/cuirass/metrics.scm \ src/cuirass/send-events.scm \ src/cuirass/ui.scm \ - src/cuirass/utils.scm \ - src/cuirass/templates.scm + src/cuirass/utils.scm \ + src/cuirass/templates.scm \ + src/cuirass/watchdog.scm nodist_pkgmodule_DATA = \ src/cuirass/config.scm diff --git a/bin/cuirass.in b/bin/cuirass.in index d6c2695..55e92b6 100644 --- a/bin/cuirass.in +++ b/bin/cuirass.in @@ -32,6 +32,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@" (cuirass logging) (cuirass metrics) (cuirass utils) + (cuirass watchdog) (guix ui) ((guix build utils) #:select (mkdir-p)) (fibers) @@ -153,7 +154,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@" (if one-shot? (process-specs (db-get-specifications)) (let ((exit-channel (make-channel))) - + (start-watchdog) (if (option-ref opts 'web #f) (begin (spawn-fiber diff --git a/src/cuirass/utils.scm b/src/cuirass/utils.scm index 00cfef6..7ce4b83 100644 --- a/src/cuirass/utils.scm +++ b/src/cuirass/utils.scm @@ -37,6 +37,9 @@ define-enumeration unwind-protect + with-timeout + get-message-with-timeout + make-worker-thread-channel call-with-worker-thread with-worker-thread diff --git a/src/cuirass/watchdog.scm b/src/cuirass/watchdog.scm new file mode 100644 index 0000000..5c5c1df --- /dev/null +++ b/src/cuirass/watchdog.scm @@ -0,0 +1,88 @@ +;;; watchdog.scm -- Monitor fibers scheduling. +;;; This file is part of Cuirass. +;;; +;;; Cuirass is free software: you can redistribute it and/or modify +;;; it under the terms of the GNU General Public License as published by +;;; the Free Software Foundation, either version 3 of the License, or +;;; (at your option) any later version. +;;; +;;; Cuirass is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with Cuirass. If not, see . + +(define-module (cuirass watchdog) + #:use-module (cuirass logging) + #:use-module (cuirass utils) + #:use-module (fibers) + #:use-module (fibers channels) + #:use-module (fibers internal) + #:use-module (fibers operations) + #:use-module (ice-9 match) + #:use-module (ice-9 threads) + #:export (start-watchdog)) + +(define* (watchdog-fiber scheduler channel + #:key + (period 1)) + "Spawn a fiber running on SCHEDULER that sends over CHANNEL, every PERIOD +seconds, the scheduler name and the current time." + (spawn-fiber + (lambda () + (while #t + (put-message channel (list (scheduler-name scheduler) + (current-time))) + (sleep period))) + scheduler)) + +(define* (start-watchdog #:key (timeout 5)) + "Start a watchdog checking that each Fibers scheduler is not blocked for +more than TIMEOUT seconds. + +The watchdog mechanism consists in spawning a dedicated fiber per running +Fiber scheduler, using the above watchdog-fiber method. Those fibers send a +ping signal periodically to a separate thread. If no signal is received from +one of the schedulers for more than TIMEOUT seconds, a warning message is +printed." + (define (check-timeouts pings last-check) + (let* ((check-period timeout) + (cur-time (current-time)) + (diff-check (- cur-time last-check))) + (if (> diff-check check-period) + (begin + (for-each + (match-lambda + ((scheduler . time) + (let ((diff-ping (- cur-time time))) + (when (> diff-ping timeout) + (log-message "Scheduler ~a blocked since ~a seconds." + scheduler diff-ping))))) + pings) + cur-time) + last-check))) + + (let ((watchdog-channel (make-channel))) + (parameterize (((@@ (fibers internal) current-fiber) #f)) + (call-with-new-thread + (lambda () + (let loop ((pings '()) + (last-check 0)) + (let ((operation-timeout 10)) + (match (perform-operation + (with-timeout + (get-operation watchdog-channel) + #:seconds operation-timeout + #:wrap (const 'timeout))) + ((scheduler ping) + (loop (assq-set! pings scheduler ping) + (check-timeouts pings last-check))) + ('timeout + (loop pings + (check-timeouts pings last-check))))))))) + (fold-all-schedulers + (lambda (name scheduler seed) + (watchdog-fiber scheduler watchdog-channel)) + '())))