75 lines
2.9 KiB
Scheme
75 lines
2.9 KiB
Scheme
;; lab/core/health.scm - Health check functionality
|
|
|
|
(define-module (lab core health)
|
|
#:use-module (ice-9 rdelim)
|
|
#:use-module (srfi srfi-1)
|
|
#:use-module (lab core logging)
|
|
#:use-module (lab core ssh)
|
|
#:export (check-system-health
|
|
check-disk-space
|
|
check-system-load
|
|
check-critical-services
|
|
check-network-connectivity))
|
|
|
|
(define (check-system-health machine-name)
  "Run every health check against MACHINE-NAME and collect the results.

Returns an alist with one entry per check, in the order the checks are
listed below.  An entry has the shape
  (CHECK-NAME . ((status . pass) (result . VALUE)))   check returned true
  (CHECK-NAME . ((status . fail) (result . #f)))      check returned #f
  (CHECK-NAME . ((status . error) (error . KEY)))     check raised KEY
A raising check is logged as a warning and does not abort the others."
  (log-info "Performing health check on ~a..." machine-name)

  ;; The table must hold the procedures themselves.  Writing it as a quoted
  ;; literal, '(("name" . proc) ...), stores the *symbol* proc instead;
  ;; applying a symbol raises, and the catch below would then report every
  ;; single check as an error without ever running it.
  (let ((health-checks
         (list (cons "connectivity" test-ssh-connection)
               (cons "disk-space" check-disk-space)
               (cons "system-load" check-system-load)
               (cons "critical-services" check-critical-services)
               (cons "network" check-network-connectivity))))

    (map (lambda (check-pair)
           (let ((check-name (car check-pair))
                 (check-proc (cdr check-pair)))
             (log-debug "Running ~a check..." check-name)
             ;; Catch everything so one broken check cannot prevent the
             ;; remaining checks from being reported.
             (catch #t
               (lambda ()
                 (let ((result (check-proc machine-name)))
                   `(,check-name . ((status . ,(if result 'pass 'fail))
                                    (result . ,result)))))
               (lambda (key . args)
                 (log-warn "Health check ~a failed: ~a" check-name key)
                 `(,check-name . ((status . error)
                                  (error . ,key)))))))
         health-checks)))
|
|
|
|
(define (check-disk-space machine-name)
  "Return #t when root filesystem usage on MACHINE-NAME is below 90%.

Runs a df pipeline on the machine through run-remote-command; the pipeline
prints the fifth column of the last df line with the percent sign removed.
Returns #f when the command reports failure, when its output cannot be
parsed as a number, or when the usage is 90 or higher."
  (call-with-values
      (lambda ()
        (run-remote-command machine-name "df / | tail -1 | awk '{print $5}' | sed 's/%//'"))
    (lambda (success output)
      (and success
           (string? output)
           ;; string->number yields #f for empty or non-numeric text.  Guard
           ;; it so unparseable output counts as a failed check instead of
           ;; raising a wrong-type error from `<'.
           (let ((usage (string->number (string-trim-both output))))
             (and (real? usage)
                  (< usage 90)))))))    ; pass if usage < 90%
|
|
|
|
(define (check-system-load machine-name)
  "Return #t when the load reported by MACHINE-NAME is below 5.0.

Reads the first space-separated field of /proc/loadavg on the machine
through run-remote-command.  Returns #f when the command reports failure,
when its output cannot be parsed as a number, or when the value is 5.0 or
higher."
  (call-with-values
      (lambda ()
        (run-remote-command machine-name "cat /proc/loadavg | cut -d' ' -f1"))
    (lambda (success output)
      (and success
           (string? output)
           ;; Named load-avg rather than `load' so the core `load' procedure
           ;; is not shadowed.  string->number yields #f on unparseable
           ;; text, so guard before comparing.
           (let ((load-avg (string->number (string-trim-both output))))
             (and (real? load-avg)
                  (< load-avg 5.0)))))))    ; pass if load < 5.0
|
|
|
|
(define (check-critical-services machine-name)
  "Return #t when every critical service on MACHINE-NAME reports active.

Each service is queried with systemctl is-active through
run-remote-command; the check passes only if the command succeeds and its
trimmed output is exactly the string active.  The critical set currently
contains only sshd."
  ;; One remote query per unit; true only for a successful command whose
  ;; output, minus trailing whitespace, equals "active".
  (define (service-active? unit)
    (call-with-values
        (lambda ()
          (run-remote-command machine-name "systemctl is-active" unit))
      (lambda (ok? text)
        (if ok?
            (string=? (string-trim-right text) "active")
            #f))))

  (every service-active? '("sshd")))
|
|
|
|
(define (check-network-connectivity machine-name)
  "Return #t when MACHINE-NAME can ping 8.8.8.8.

The remote command discards ping's own output and echoes its exit status,
so the check passes only when run-remote-command succeeds and the trimmed
output is the string 0."
  (define ping-command
    "ping -c 1 -W 5 8.8.8.8 > /dev/null 2>&1; echo $?")

  ;; Receives the two values produced by run-remote-command.
  (define (exit-status-zero? ok? text)
    (if ok?
        (string=? (string-trim-right text) "0")
        #f))

  (call-with-values
      (lambda () (run-remote-command machine-name ping-command))
    exit-status-zero?))
|