home-lab/packages/lab-tool/core/health.scm
2025-06-16 13:43:21 +02:00

75 lines
2.9 KiB
Scheme

;; lab/core/health.scm - Health check functionality
(define-module (lab core health)
#:use-module (ice-9 rdelim)
#:use-module (srfi srfi-1)
#:use-module (lab core logging)
#:use-module (lab core ssh)
#:export (check-system-health
check-disk-space
check-system-load
check-critical-services
check-network-connectivity))
(define (check-system-health machine-name)
  "Perform comprehensive health check on a machine.

Runs each registered check against MACHINE-NAME and returns an alist with
one entry per check, keyed by the check's name:

  (NAME . ((status . pass|fail) (result . VALUE)))   ; check returned normally
  (NAME . ((status . error) (error . KEY)))          ; check threw KEY

A check that throws is logged and recorded as an error; it does not stop
the remaining checks from running."
  (log-info "Performing health check on ~a..." machine-name)
  ;; Build the table with `list'/`cons' rather than a quoted literal: inside
  ;; a quoted list the procedure names would be plain symbols, and applying
  ;; a symbol throws, which made every check report `error'.
  (let ((health-checks
         (list (cons "connectivity" test-ssh-connection)
               (cons "disk-space" check-disk-space)
               (cons "system-load" check-system-load)
               (cons "critical-services" check-critical-services)
               (cons "network" check-network-connectivity))))
    (map (lambda (check-pair)
           (let ((check-name (car check-pair))
                 (check-proc (cdr check-pair)))
             (log-debug "Running ~a check..." check-name)
             ;; Catch everything so one broken check cannot abort the report.
             (catch #t
               (lambda ()
                 (let ((result (check-proc machine-name)))
                   `(,check-name . ((status . ,(if result 'pass 'fail))
                                    (result . ,result)))))
               (lambda (key . args)
                 (log-warn "Health check ~a failed: ~a" check-name key)
                 `(,check-name . ((status . error)
                                  (error . ,key)))))))
         health-checks)))
(define (check-disk-space machine-name)
  "Check that root filesystem usage on MACHINE-NAME is below 90%.

Runs `df /` remotely and parses the use-percentage column.  Returns #t when
usage is under 90, and #f when usage is 90 or more, when the remote command
reports failure, or when its output cannot be parsed as a number."
  (call-with-values
      (lambda ()
        (run-remote-command machine-name
                            "df / | tail -1 | awk '{print $5}' | sed 's/%//'"))
    (lambda (success output)
      (if success
          (let ((usage (string->number (string-trim-both output))))
            ;; string->number yields #f for empty or non-numeric text;
            ;; treat that as a failed check instead of letting `<' throw.
            (and usage (< usage 90))) ; Pass if usage < 90%
          #f))))
(define (check-system-load machine-name)
  "Check that the 1-minute load average on MACHINE-NAME is below 5.0.

Reads the first field of /proc/loadavg remotely.  Returns #t when the load
is under 5.0, and #f when it is 5.0 or more, when the remote command reports
failure, or when its output cannot be parsed as a number."
  (call-with-values
      (lambda ()
        (run-remote-command machine-name "cat /proc/loadavg | cut -d' ' -f1"))
    (lambda (success output)
      (if success
          (let ((load (string->number (string-trim-both output))))
            ;; string->number yields #f for empty or non-numeric text;
            ;; treat that as a failed check instead of letting `<' throw.
            (and load (< load 5.0))) ; Pass if load < 5.0
          #f))))
(define (check-critical-services machine-name)
  "Verify that every critical service on MACHINE-NAME is reported active."
  ;; Probe one unit: true only when the remote command succeeded and
  ;; `systemctl is-active' printed exactly \"active\" (trailing whitespace
  ;; ignored).
  (define (service-active? unit)
    (call-with-values
        (lambda ()
          (run-remote-command machine-name "systemctl is-active" unit))
      (lambda (ok? reply)
        (and ok?
             (string=? "active" (string-trim-right reply))))))
  ;; `every' stops at the first service that is not active.
  (every service-active? '("sshd")))
(define (check-network-connectivity machine-name)
  "Check outbound connectivity from MACHINE-NAME with a single ping to 8.8.8.8."
  ;; The remote shell echoes ping's exit status, so the check passes only
  ;; when the command itself succeeded and that echoed status is \"0\".
  (let ((probe "ping -c 1 -W 5 8.8.8.8 > /dev/null 2>&1; echo $?"))
    (call-with-values
        (lambda () (run-remote-command machine-name probe))
      (lambda (ok? exit-text)
        (and ok?
             (string=? "0" (string-trim-right exit-text)))))))