home-lab/packages/lab-tool/research/machines.scm
2025-06-16 13:43:21 +02:00

258 lines
11 KiB
Scheme

;; lab/machines.scm - Machine-specific operations
(define-module (lab machines)
#:use-module (ice-9 format)
#:use-module (ice-9 match)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-19)
#:use-module (utils logging)
#:use-module (utils config)
#:use-module (utils ssh)
#:use-module (lab core)
#:export (show-infrastructure-status
get-machine-details
discover-machines
validate-machine-health
get-machine-metrics
option-ref))
;; Helper function for option handling
(define (option-ref options key default)
"Get option value with default fallback"
(let ((value (assoc-ref options key)))
(if value value default)))
;; Display infrastructure status in a human-readable format
(define (show-infrastructure-status machine-name options)
"Display comprehensive infrastructure status"
(let ((verbose (option-ref options 'verbose #f))
(status-data (get-infrastructure-status machine-name)))
(log-info "Home-lab infrastructure status:")
(newline)
(for-each
(lambda (machine-status)
(display-machine-status machine-status verbose))
status-data)
;; Summary statistics
(let ((total-machines (length status-data))
(online-machines (length (filter
(lambda (status)
(eq? (assoc-ref status 'connection) 'online))
status-data))))
(newline)
(if (= online-machines total-machines)
(log-success "All ~a machines online ✓" total-machines)
(log-warn "~a/~a machines online" online-machines total-machines)))))
;; Display status for a single machine
(define (display-machine-status machine-status verbose)
"Display formatted status for a single machine"
(let* ((machine-name (assoc-ref machine-status 'machine))
(machine-type (assoc-ref machine-status 'type))
(connection (assoc-ref machine-status 'connection))
(services (assoc-ref machine-status 'services))
(system-info (assoc-ref machine-status 'system))
(check-time (assoc-ref machine-status 'check-time)))
;; Machine header with connection status
(let ((status-symbol (if (eq? connection 'online) "✅" "❌"))
(type-label (if (eq? machine-type 'local) "(local)" "(remote)")))
(format #t "━━━ ~a ~a ~a ━━━~%" machine-name type-label status-symbol))
;; Connection details
(if (eq? connection 'online)
(begin
(when system-info
(let ((uptime (assoc-ref system-info 'uptime))
(load (assoc-ref system-info 'load))
(memory (assoc-ref system-info 'memory))
(disk (assoc-ref system-info 'disk)))
(when uptime (format #t "⏱️ Uptime: ~a~%" uptime))
(when load (format #t "📊 Load: ~a~%" load))
(when memory (format #t "🧠 Memory: ~a~%" memory))
(when disk (format #t "💾 Disk: ~a~%" disk))))
;; Services status
(when (not (null? services))
(format #t "🔧 Services: ")
(for-each (lambda (service-status)
(let ((service-name (symbol->string (car service-status)))
(service-state (cdr service-status)))
(let ((status-icon (cond
((string=? service-state "active") "✅")
((string=? service-state "inactive") "❌")
((string=? service-state "failed") "💥")
(else "❓"))))
(format #t "~a ~a " service-name status-icon))))
services)
(newline))
(format #t "⚡ Response: ~ams~%" (inexact->exact (round (* check-time 1000)))))
(format #t "⚠️ Status: Offline~%"))
;; Verbose information
(when verbose
(let ((ssh-config (get-ssh-config machine-name)))
(when ssh-config
(format #t "🔗 SSH: ~a~%" (assoc-ref ssh-config 'hostname))
(let ((ssh-alias (assoc-ref ssh-config 'ssh-alias)))
(when ssh-alias
(format #t "🏷️ Alias: ~a~%" ssh-alias))))))
(newline)))
;; Get detailed information about a specific machine
(define (get-machine-details machine-name)
"Get comprehensive details about a specific machine"
(let ((machine-config (get-machine-config machine-name)))
(if (not machine-config)
(begin
(log-error "Machine ~a not found in configuration" machine-name)
#f)
(let* ((ssh-config (get-ssh-config machine-name))
(health-status (check-system-health machine-name))
(current-status (car (get-infrastructure-status machine-name))))
`((name . ,machine-name)
(config . ,machine-config)
(ssh . ,ssh-config)
(status . ,current-status)
(health . ,health-status)
(last-updated . ,(current-date)))))))
;; Discover machines on the network
(define (discover-machines)
"Discover available machines on the network"
(log-info "Discovering machines on the network...")
(let ((configured-machines (get-all-machines)))
(log-debug "Configured machines: ~a" configured-machines)
;; Test connectivity to each configured machine
(let ((discovery-results
(map (lambda (machine-name)
(log-debug "Testing connectivity to ~a..." machine-name)
(let ((reachable (test-ssh-connection machine-name))
(ssh-config (get-ssh-config machine-name)))
`((machine . ,machine-name)
(configured . #t)
(reachable . ,reachable)
(type . ,(if (and ssh-config (assoc-ref ssh-config 'is-local))
'local 'remote))
(hostname . ,(if ssh-config
(assoc-ref ssh-config 'hostname)
"unknown")))))
configured-machines)))
;; TODO: Add network scanning for unconfigured machines
;; This could use nmap or similar tools to discover machines
(log-info "Discovery completed")
discovery-results)))
;; Validate health of a machine with detailed checks
(define (validate-machine-health machine-name . detailed)
"Perform comprehensive health validation on a machine"
(let ((run-detailed (if (null? detailed) #f (car detailed))))
(log-info "Validating health of ~a..." machine-name)
(let ((basic-health (check-system-health machine-name)))
(if run-detailed
;; Extended health checks for detailed mode
(let ((extended-checks
'(("filesystem" . check-filesystem-health)
("network-services" . check-network-services)
("system-logs" . check-system-logs)
("performance" . check-performance-metrics))))
(let ((extended-results
(map (lambda (check-pair)
(let ((check-name (car check-pair))
(check-proc (cdr check-pair)))
(log-debug "Running extended check: ~a" check-name)
(catch #t
(lambda ()
`(,check-name . ,(check-proc machine-name)))
(lambda (key . args)
(log-warn "Extended check ~a failed: ~a" check-name key)
`(,check-name . (error . ,key))))))
extended-checks)))
`((basic . ,basic-health)
(extended . ,extended-results)
(timestamp . ,(current-date)))))
;; Just basic health checks
`((basic . ,basic-health)
(timestamp . ,(current-date)))))))
;; Extended health check functions
(define (check-filesystem-health machine-name)
"Check filesystem health and disk usage"
(call-with-values (((success output)
(run-remote-command machine-name "df -h && echo '---' && mount | grep -E '^/' | head -5")))
(if success
`((status . pass)
(details . ,(string-trim-right output)))
`((status . fail)
(error . "Could not retrieve filesystem information")))))
(define (check-network-services machine-name)
"Check network service connectivity"
(let ((services-to-test '(("ssh" "22") ("http" "80") ("https" "443"))))
(map (lambda (service-pair)
(let ((service-name (car service-pair))
(port (cadr service-pair)))
(call-with-values (((success output)
(run-remote-command machine-name
(format #f "netstat -ln | grep ':~a ' > /dev/null 2>&1; echo $?" port))))
`(,service-name . ,(if (and success (string=? (string-trim-right output) "0"))
'listening 'not-listening)))))
services-to-test)))
(define (check-system-logs machine-name)
"Check system logs for recent errors"
(call-with-values (((success output)
(run-remote-command machine-name
"journalctl --since='1 hour ago' --priority=err --no-pager | wc -l")))
(if success
(let ((error-count (string->number (string-trim-right output))))
`((status . ,(if (< error-count 10) 'good 'concerning))
(error-count . ,error-count)))
`((status . unknown)
(error . "Could not check system logs")))))
(define (check-performance-metrics machine-name)
"Get basic performance metrics"
(let ((metrics-commands
'(("cpu-usage" "top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | sed 's/%us,//'")
("memory-usage" "free | grep Mem | awk '{printf \"%.1f\", ($3/$2) * 100.0}'")
("io-wait" "iostat 1 2 | tail -1 | awk '{print $4}'"))))
(map (lambda (metric-pair)
(let ((metric-name (car metric-pair))
(command (cadr metric-pair)))
(call-with-values (((success output) (run-remote-command machine-name command)))
`(,(string->symbol metric-name) .
,(if success (string-trim-right output) "unknown")))))
metrics-commands)))
;; Get machine metrics for monitoring
(define (get-machine-metrics machine-name . time-range)
"Get machine metrics for monitoring and analysis"
(let ((range (if (null? time-range) "1h" (car time-range))))
(log-debug "Collecting metrics for ~a (range: ~a)" machine-name range)
(let ((current-time (current-date))
(performance (check-performance-metrics machine-name))
(health (validate-machine-health machine-name)))
`((machine . ,machine-name)
(timestamp . ,current-time)
(performance . ,performance)
(health . ,health)
(range . ,range)))))