# Lab-Wide Auto-Update Service with Staggered Reboots

## Overview

A NixOS service that runs on this machine (orchestrator) to update the entire homelab using existing lab tool commands, then perform staggered reboots to ensure you wake up to a freshly updated lab every morning.

## Service Architecture

### Central Orchestrator Approach

- Runs on this machine (the controller)
- Uses existing `lab update` and `lab deploy-all` commands
- Orchestrates staggered reboots: sleeper-service → grey-area → reverse-proxy → self
- 10-minute delays between each machine reboot

## Implementation

### 1. Nix Service Module

```nix
# /home/geir/Home-lab/nix/modules/lab-orchestrator.nix
#
# Nightly lab-wide update: refresh the flake inputs, deploy to every machine,
# then reboot the machines one at a time, finishing with the orchestrator.
{ config, lib, pkgs, ... }:

with lib;

let
  cfg = config.services.lab-orchestrator;
  labPath = "/home/geir/Home-lab";

  # Remote machines, in the order they are rebooted. The orchestrator itself
  # is not listed: it is rebooted last, by ExecStartPost below.
  remoteRebootOrder = [ "sleeper-service" "grey-area" "reverse-proxy" ];

  # Seconds to wait after each remote reboot before the next step.
  rebootDelay = 600; # 10 minutes

  # writeShellScript supplies the shebang itself, so none is written here.
  orchestratorScript = pkgs.writeShellScript "lab-orchestrator" ''
    set -euo pipefail

    LOG_FILE="/var/log/lab-orchestrator.log"
    LAB_TOOL="${labPath}/result/bin/lab"
    REBOOT_DELAY=${toString rebootDelay}

    log() {
      echo "$(date '+%Y-%m-%d %H:%M:%S'): $1" | tee -a "$LOG_FILE"
    }

    cd "${labPath}"

    # Ensure lab tool is available; build it on first use.
    if [[ ! -x "$LAB_TOOL" ]]; then
      log "Lab tool not found at $LAB_TOOL"
      log "Building lab tool first..."
      if ! nix build .#lab-tool; then
        log "ERROR: Failed to build lab tool"
        exit 1
      fi
    fi

    log "=== Starting Lab-Wide Update Orchestration ==="

    # Step 1: Update flake inputs
    log "Updating flake inputs..."
    if ! "$LAB_TOOL" update; then
      log "ERROR: Failed to update flake inputs"
      exit 1
    fi
    log "Flake inputs updated successfully"

    # Step 2: Deploy to all machines
    log "Deploying to all machines..."
    if ! "$LAB_TOOL" deploy-all; then
      log "ERROR: Failed to deploy to all machines"
      exit 1
    fi
    log "Deployment completed successfully"

    # Step 3: Staggered reboots. A failed reboot is only a warning so that
    # one unreachable machine does not stop the rest of the sequence.
    log "Starting staggered reboot sequence..."
    for machine in ${escapeShellArgs remoteRebootOrder}; do
      log "Rebooting $machine..."
      if "$LAB_TOOL" reboot "$machine"; then
        log "✓ $machine reboot initiated"
      else
        log "WARNING: Failed to reboot $machine"
      fi
      log "Waiting $((REBOOT_DELAY / 60)) minutes before the next reboot..."
      sleep "$REBOOT_DELAY"
    done

    # The reboot of this machine is issued by ExecStartPost, which runs with
    # full privileges and only after this script has exited successfully.
    log "Rebooting this machine (orchestrator)..."
    log "=== Lab Update Orchestration Completed ==="
  '';
in
{
  options.services.lab-orchestrator = {
    enable = mkEnableOption "Lab orchestrator auto-update service";

    schedule = mkOption {
      type = types.str;
      default = "02:00";
      description = "Time to start lab update (HH:MM format)";
    };

    user = mkOption {
      type = types.str;
      default = "geir";
      description = "User to run the lab tool as";
    };
  };

  config = mkIf cfg.enable {
    systemd.services.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator";

      # NixOS units get a minimal PATH; the script calls `nix` directly.
      # NOTE(review): git and openssh are assumed to be what the lab tool
      # shells out to - adjust to its real runtime dependencies.
      path = [ config.nix.package pkgs.git pkgs.openssh ];

      serviceConfig = {
        Type = "oneshot";
        User = cfg.user;
        Group = "users";
        WorkingDirectory = labPath;
        ExecStart = "${orchestratorScript}";
        # The "+" prefix runs this one command with full privileges even
        # though User= is set; an unprivileged, session-less service cannot
        # reboot the host on its own.
        ExecStartPost = "+${config.systemd.package}/bin/systemctl --no-block reboot";
        # Give it plenty of time (2 hours)
        TimeoutStartSec = 7200;
      };

      # Ensure network is ready
      after = [ "network-online.target" ];
      wants = [ "network-online.target" ];
    };

    systemd.timers.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator Timer";
      timerConfig = {
        OnCalendar = "*-*-* ${cfg.schedule}:00";
        Persistent = true;
        # No randomization - we want predictable timing
      };
      wantedBy = [ "timers.target" ];
    };

    # Ensure the log file exists with proper permissions
    systemd.tmpfiles.rules = [
      "f /var/log/lab-orchestrator.log 0644 ${cfg.user} users -"
    ];
  };
}
```

### 2.
Lab Tool Reboot Command Extension

Add reboot capability to the existing Guile lab tool:

```scheme
;; lab/reboot.scm - New module for machine reboots
(define-module (lab reboot)
  #:use-module (ice-9 format)
  #:use-module (utils logging)
  #:use-module (lab machines)
  #:export (reboot-machine))

;; BatchMode stops ssh from waiting on a password prompt when it runs
;; unattended; ConnectTimeout makes an unreachable host fail quickly.
(define %ssh-options
  '("-o" "BatchMode=yes" "-o" "ConnectTimeout=10"))

(define (execute-ssh-command hostname command)
  "Run COMMAND on HOSTNAME as root over SSH.
Return the ssh exit code, or #f if ssh was terminated by a signal."
  ;; system* passes an argument vector, so HOSTNAME is never interpreted
  ;; by a local shell.
  (let ((status (apply system* "ssh"
                       (append %ssh-options
                               (list (string-append "root@" hostname)
                                     command)))))
    (status:exit-val status)))

(define (reboot-remote machine-name hostname)
  "Send the reboot command to HOSTNAME. Return #t if it was delivered."
  (if (not (eqv? 0 (execute-ssh-command hostname "true")))
      ;; Probe first: ssh exits 255 both when the host is unreachable and
      ;; when a reboot tears the connection down, so the two cases can only
      ;; be told apart before the reboot is sent.
      (begin
        (log-error "Failed to reboot ~a: host not reachable over SSH"
                   machine-name)
        #f)
      ;; Already connected as root, so no sudo is needed on the remote side.
      (let ((code (execute-ssh-command hostname "systemctl reboot")))
        (cond
         ((eqv? code 0)
          (log-success "Reboot command sent to ~a" machine-name)
          #t)
         ((eqv? code 255)
          ;; SSH connection drop is expected during reboot
          (log-info "Connection dropped (expected during reboot)")
          #t)
         (else
          (log-error "Failed to reboot ~a: remote command exited with ~a"
                     machine-name code)
          #f)))))

(define (reboot-machine machine-name)
  "Reboot a specific machine, locally or via SSH.
Return #t when the reboot was initiated and #f otherwise."
  (log-info "Attempting to reboot machine: ~a" machine-name)
  (if (validate-machine-name machine-name)
      (let* ((ssh-config (get-ssh-config machine-name))
             ;; Fall back to the machine name when no SSH config exists.
             (hostname (if ssh-config
                           (assoc-ref ssh-config 'hostname)
                           machine-name))
             (is-local (and ssh-config
                            (assoc-ref ssh-config 'is-local))))
        (cond
         (is-local
          (log-info "Rebooting local machine...")
          (let ((ok (eqv? 0 (status:exit-val
                             (system "sudo systemctl reboot")))))
            (unless ok
              (log-error "Failed to reboot ~a: local reboot command failed"
                         machine-name))
            ok))
         (hostname
          (log-info "Rebooting ~a via SSH..." hostname)
          (reboot-remote machine-name hostname))
         (else
          (log-error "No hostname found for machine: ~a" machine-name)
          #f)))
      (begin
        (log-error "Invalid machine name: ~a" machine-name)
        #f)))
```

### 3. CLI Integration

Update the main.scm dispatcher to include reboot command:

```scheme
;; main.scm (additions to command dispatcher)
(use-modules
 ;; ...existing modules...
 (lab reboot))

;; Add to dispatch-command function
(define (dispatch-command command args)
  "Dispatch command with appropriate handler"
  (match command
    ;; ...existing cases...
    ('reboot
     ;; NOTE(review): the orchestrator script branches on the exit status of
     ;; `lab reboot`; confirm that a failed result here ends up as a non-zero
     ;; exit code of the lab process.
     (if (null? args)
         (begin
           (log-error "reboot command requires machine name")
           (format #t "Usage: lab reboot <machine>\n"))
         (let ((result (reboot-machine (car args))))
           (if result
               (log-success "Reboot initiated")
               (log-error "Reboot failed")))))
    ;; ...rest of existing cases...
    ))

;; Update help text to include reboot command
(define (get-help-text)
  "Pure function returning help text"
  "Home Lab Tool - K.I.S.S Refactored Edition

USAGE: lab <command> [args...]

COMMANDS:
  status              Show infrastructure status
  machines            List all machines
  deploy <machine>    Deploy configuration to machine
  deploy-all          Deploy to all machines
  update              Update flake inputs
  health [machine]    Check machine health (all if no machine specified)
  ssh <machine>       SSH to machine
  reboot <machine>    Reboot machine via SSH
  test-modules        Test modular implementation
  help                Show this help

EXAMPLES:
  lab status
  lab machines
  lab deploy congenital-optimist
  lab deploy-all
  lab update
  lab health
  lab health sleeper-service
  lab ssh sleeper-service
  lab reboot sleeper-service
  lab test-modules
")
```

### 4.
Configuration

Enable the service on this machine (the orchestrator):

```nix
# hosts/this-machine/configuration.nix
{
  imports = [
    ../../nix/modules/lab-orchestrator.nix
  ];

  services.lab-orchestrator = {
    enable = true;
    schedule = "02:00"; # 2 AM start
    user = "geir";
  };
}
```

## Timeline Breakdown

### Nightly Execution (Starting 2:00 AM)

```
02:00       - Start orchestration
02:00-02:15 - Update flake inputs (lab update)
02:15-02:45 - Deploy to all machines (lab deploy-all)
02:45       - Reboot sleeper-service
02:55       - Reboot grey-area (10 min later)
03:05       - Reboot reverse-proxy (10 min later)
03:15       - Reboot orchestrator machine (10 min later)
03:20       - All machines back online and updated
```

### Total Duration: ~1 hour 20 minutes

- Flake input update: ~15 minutes
- Deployment: ~30 minutes
- Staggered reboots: ~35 minutes (02:45 to 03:20)
- Everything done by 3:20 AM

## Safety Features

### Logging and Monitoring

```bash
# Check orchestrator logs
sudo journalctl -u lab-orchestrator.service -f

# Check orchestrator log file
tail -f /var/log/lab-orchestrator.log

# Check timer status
systemctl status lab-orchestrator.timer
```

### Manual Controls

```bash
# Start update manually
sudo systemctl start lab-orchestrator.service

# Pause automatic updates (lasts until the next reboot or rebuild).
# NixOS manages the unit declaratively, so `systemctl disable` has no
# lasting effect; to disable permanently, set
# services.lab-orchestrator.enable = false and run nixos-rebuild switch.
sudo systemctl stop lab-orchestrator.timer

# Check when next run is scheduled
systemctl list-timers lab-orchestrator.timer
```

### Recovery Options

```bash
# If orchestration fails, machines can be individually managed
lab deploy sleeper-service
lab deploy grey-area
lab deploy reverse-proxy

# Emergency reboot sequence
lab reboot sleeper-service
sleep 600
lab reboot grey-area
sleep 600
lab reboot reverse-proxy
```

## Machine Configuration Requirements

### SSH Key Setup

Ensure this machine can SSH to all target machines:

```bash
# Test connectivity
ssh root@sleeper-service "echo 'Connection OK'"
ssh root@grey-area "echo 'Connection OK'"
ssh root@reverse-proxy "echo 'Connection OK'"
```

### Lab Tool Configuration

Ensure lab.yaml includes all machines:

```yaml
machines:
  sleeper-service:
    host: sleeper-service.local
    user: root
  grey-area:
    host: grey-area.local
    user: root
  reverse-proxy:
    host: reverse-proxy.local
    user: root
```

## Deployment Steps

### 1. Create the Service Module

Add the Nix module file and import it.

### 2. Extend Lab Tool

Add reboot command functionality.

### 3. Test Components

```bash
# Build the lab tool first
cd /home/geir/Home-lab
nix build .#lab-tool

# Test lab commands work
./result/bin/lab update
./result/bin/lab deploy-all
./result/bin/lab machines
./result/bin/lab reboot sleeper-service  # Test reboot (be careful!)
```

### 4. Enable Service

```bash
# Add to configuration and rebuild
nixos-rebuild switch

# Verify timer is active
systemctl status lab-orchestrator.timer
```

### 5. Monitor First Run

```bash
# Watch the logs during first execution
sudo journalctl -u lab-orchestrator.service -f
```

## Benefits

### Morning Routine

- Wake up to fully updated homelab
- All services running latest versions
- No manual intervention needed
- Predictable update schedule

### Reliability

- Uses existing, tested lab tool commands
- Proper error handling and logging
- Graceful degradation if individual reboots fail
- Easy to disable or modify timing

### Visibility

- Comprehensive logging of entire process
- Clear timestamps for each phase
- Easy troubleshooting if issues occur

This gives you the "wake up to fresh lab" experience with minimal complexity, leveraging your existing infrastructure!