some research and loose thoughts

2025-06-20 15:32:34 +02:00 · 2025-06-20 15:32:34 +02:00 · 12fb56f35b
commit 12fb56f35b
parent 076c38d829
7 changed files with 2160 additions and 5 deletions
--- a/research/lab-orchestrator-service.md
+++ b/research/lab-orchestrator-service.md
@@ -0,0 +1,455 @@
+# Lab-Wide Auto-Update Service with Staggered Reboots
+
+## Overview
+A NixOS service that runs on this machine (orchestrator) to update the entire homelab using existing lab tool commands, then perform staggered reboots to ensure you wake up to a freshly updated lab every morning.
+
+## Service Architecture
+
+### Central Orchestrator Approach
+- Runs on this machine (the controller)
+- Uses existing `lab update` and `lab deploy-all` commands
+- Orchestrates staggered reboots: sleeper-service → grey-area → reverse-proxy → self
+- 10-minute delays between each machine reboot
+
+## Implementation
+
+### 1. Nix Service Module
+```nix
+# /home/geir/Home-lab/nix/modules/lab-orchestrator.nix
+#
+# NixOS module for the orchestrator machine: a nightly oneshot service that
+# updates flake inputs, deploys to every machine with the lab tool, reboots
+# the remote machines one at a time and finally reboots this machine.
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  cfg = config.services.lab-orchestrator;
+
+  labPath = "/home/geir/Home-lab";
+
+  # Remote machines in reboot order. `delay` is the number of seconds to wait
+  # immediately before rebooting that machine, i.e. the gap since the previous
+  # step (not a cumulative offset). The script below is generated from this
+  # list, so this is the single place to change order or timing.
+  rebootSequence = [
+    { machine = "sleeper-service"; delay = 0; }
+    { machine = "grey-area"; delay = 600; }      # 10 minutes after sleeper-service
+    { machine = "reverse-proxy"; delay = 600; }  # 10 minutes after grey-area
+  ];
+
+  # Seconds to wait after the last remote reboot before rebooting this machine.
+  selfRebootDelay = 600;
+
+  # One `reboot_after <delay> <machine>` shell call per rebootSequence entry.
+  rebootCalls = concatMapStringsSep "\n"
+    (step: "reboot_after ${toString step.delay} ${escapeShellArg step.machine}")
+    rebootSequence;
+
+  # writeShellScript already emits the shebang, so none is repeated here.
+  orchestratorScript = pkgs.writeShellScript "lab-orchestrator" ''
+    set -euo pipefail
+
+    LOG_FILE="/var/log/lab-orchestrator.log"
+    LAB_TOOL="${labPath}/result/bin/lab"
+
+    # Timestamped message to stdout (journal) and the log file.
+    log() {
+      echo "$(date '+%Y-%m-%d %H:%M:%S'): $1" | tee -a "$LOG_FILE"
+    }
+
+    # reboot_after <delay-seconds> <machine>
+    # Waits, then asks the lab tool to reboot the machine. A failed reboot is
+    # logged as a warning and does not abort the remaining sequence.
+    reboot_after() {
+      local delay="$1" machine="$2"
+      if (( delay > 0 )); then
+        log "Waiting $(( delay / 60 )) minutes before rebooting $machine..."
+        sleep "$delay"
+      fi
+      log "Rebooting $machine..."
+      if "$LAB_TOOL" reboot "$machine"; then
+        log "✓ $machine reboot initiated"
+      else
+        log "WARNING: Failed to reboot $machine"
+      fi
+    }
+
+    # Ensure lab tool is available
+    if [[ ! -x "$LAB_TOOL" ]]; then
+      log "ERROR: Lab tool not found at $LAB_TOOL"
+      log "Building lab tool first..."
+      cd "${labPath}"
+      if ! nix build .#lab-tool; then
+        log "ERROR: Failed to build lab tool"
+        exit 1
+      fi
+    fi
+
+    log "=== Starting Lab-Wide Update Orchestration ==="
+
+    # Step 1: Update flake inputs
+    log "Updating flake inputs..."
+    cd "${labPath}"
+    if ! "$LAB_TOOL" update; then
+      log "ERROR: Failed to update flake inputs"
+      exit 1
+    fi
+    log "Flake inputs updated successfully"
+
+    # Step 2: Deploy to all machines
+    log "Deploying to all machines..."
+    if ! "$LAB_TOOL" deploy-all; then
+      log "ERROR: Failed to deploy to all machines"
+      exit 1
+    fi
+    log "Deployment completed successfully"
+
+    # Step 3: Staggered reboots of the remote machines
+    log "Starting staggered reboot sequence..."
+    ${rebootCalls}
+
+    # Step 4: this machine goes last. The reboot itself is issued by the
+    # unit's ExecStartPost, because this script runs as an unprivileged user
+    # and is not allowed to reboot the host.
+    log "Waiting ${toString (selfRebootDelay / 60)} minutes before rebooting self..."
+    sleep ${toString selfRebootDelay}
+    log "Rebooting this machine (orchestrator)..."
+    log "=== Lab Update Orchestration Completed ==="
+  '';
+
+in
+{
+  options.services.lab-orchestrator = {
+    enable = mkEnableOption "Lab orchestrator auto-update service";
+
+    schedule = mkOption {
+      type = types.str;
+      default = "02:00";
+      description = "Time to start lab update (HH:MM format)";
+    };
+
+    user = mkOption {
+      type = types.str;
+      default = "geir";
+      description = "User to run the lab tool as";
+    };
+  };
+
+  config = mkIf cfg.enable {
+    systemd.services.lab-orchestrator = {
+      description = "Lab-Wide Update Orchestrator";
+      # NixOS units get a minimal PATH, so the tools the script and the lab
+      # tool shell out to must be listed here.
+      # NOTE(review): extend this if `lab update` / `lab deploy-all` call
+      # further programs (e.g. nixos-rebuild).
+      path = [ config.nix.package pkgs.git pkgs.openssh ];
+      serviceConfig = {
+        Type = "oneshot";
+        User = cfg.user;
+        Group = "users";
+        WorkingDirectory = labPath;
+        ExecStart = "${orchestratorScript}";
+        # Runs only after ExecStart exited successfully. The "+" prefix makes
+        # systemd run it with full privileges, ignoring User=/Group=, which an
+        # unprivileged `systemctl reboot` from the script would lack.
+        ExecStartPost = "+${config.systemd.package}/bin/systemctl --no-block reboot";
+        # Give it plenty of time (2 hours)
+        TimeoutStartSec = 7200;
+      };
+      # Ensure network is ready
+      after = [ "network-online.target" ];
+      wants = [ "network-online.target" ];
+    };
+
+    systemd.timers.lab-orchestrator = {
+      description = "Lab-Wide Update Orchestrator Timer";
+      timerConfig = {
+        OnCalendar = "*-*-* ${cfg.schedule}:00";
+        # NOTE(review): with Persistent a run missed while this machine was
+        # off is started at the next boot, so the update and its reboots can
+        # happen outside the scheduled window. Set to false if that is not
+        # wanted.
+        Persistent = true;
+        # No randomization - we want predictable timing
+      };
+      wantedBy = [ "timers.target" ];
+    };
+
+    # Ensure log directory and file exist with proper permissions
+    systemd.tmpfiles.rules = [
+      "f /var/log/lab-orchestrator.log 0644 ${cfg.user} users -"
+    ];
+  };
+}
+```
+
+### 2. Lab Tool Reboot Command Extension
+Add reboot capability to the existing Guile lab tool:
+
+```scheme
+;; lab/reboot.scm - New module for machine reboots
+(define-module (lab reboot)
+  #:use-module (ice-9 format)
+  #:use-module (ice-9 popen)
+  #:use-module (ice-9 rdelim)
+  #:use-module (utils logging)
+  #:use-module (lab machines)
+  #:export (reboot-machine))
+
+(define (shell-quote str)
+  "Return STR quoted as a single POSIX shell word."
+  ;; Wrap in single quotes; each embedded ' becomes '\'' (close, escaped
+  ;; quote, reopen).
+  (string-append "'" (string-join (string-split str #\') "'\\''") "'"))
+
+(define (execute-ssh-command hostname command)
+  "Run COMMAND as root on HOSTNAME via SSH.
+Returns the combined stdout/stderr of ssh as a string when ssh exits 0.
+Otherwise throws `ssh-error' with arguments (HOSTNAME EXIT-CODE OUTPUT);
+EXIT-CODE is #f when ssh was killed by a signal."
+  (let* ((ssh-cmd (format #f "ssh ~a ~a 2>&1"
+                          (shell-quote (string-append "root@" hostname))
+                          (shell-quote command)))
+         (port (open-input-pipe ssh-cmd))
+         ;; 2>&1 above puts ssh's own diagnostics into OUTPUT as well, so
+         ;; callers can inspect why ssh failed.
+         (output (read-string port))
+         ;; close-pipe waits for the child and returns its wait status.
+         (exit-code (status:exit-val (close-pipe port))))
+    (if (eqv? exit-code 0)
+        output
+        (throw 'ssh-error hostname exit-code output))))
+
+(define (session-dropped? hostname output)
+  "True when OUTPUT shows an established SSH session to HOSTNAME being cut by
+the remote end, as opposed to a failure to connect in the first place."
+  ;; NOTE(review): these are the OpenSSH client messages for a session that
+  ;; ends underneath it; confirm against the ssh version in use.
+  (or (string-contains output
+                       (format #f "Connection to ~a closed by remote host"
+                               hostname))
+      (string-contains output "client_loop: send disconnect")))
+
+(define (reboot-machine machine-name)
+  "Reboot MACHINE-NAME, locally or via SSH.
+Returns #t when the reboot command was issued (including the case where the
+SSH session is cut by the reboot itself) and #f on any failure."
+  (log-info "Attempting to reboot machine: ~a" machine-name)
+
+  (if (validate-machine-name machine-name)
+      (let* ((ssh-config (get-ssh-config machine-name))
+             (hostname (if ssh-config
+                           (assoc-ref ssh-config 'hostname)
+                           machine-name))
+             (is-local (and ssh-config
+                            (assoc-ref ssh-config 'is-local))))
+
+        (cond
+         (is-local
+          (log-info "Rebooting local machine...")
+          ;; Report the real outcome instead of assuming success.
+          (let ((ok (eqv? 0 (status:exit-val
+                             (system "sudo systemctl reboot")))))
+            (unless ok
+              (log-error "Failed to reboot ~a: ~a" machine-name
+                         "local reboot command failed"))
+            ok))
+
+         (hostname
+          (log-info "Rebooting ~a via SSH..." hostname)
+          (catch #t
+            (lambda ()
+              ;; We log in as root, so no sudo is needed on the remote side.
+              (execute-ssh-command hostname "systemctl reboot")
+              (log-success "Reboot command sent to ~a" machine-name)
+              #t)
+            (lambda (key . args)
+              ;; A session cut by the reboot is expected. Anything else
+              ;; (unreachable host, refused connection, auth failure) is a
+              ;; real failure.
+              (if (and (eq? key 'ssh-error)
+                       (session-dropped? hostname (list-ref args 2)))
+                  (begin
+                    (log-info "Connection dropped (expected during reboot)")
+                    #t)
+                  (begin
+                    (log-error "Failed to reboot ~a: ~a" machine-name args)
+                    #f)))))
+
+         (else
+          (log-error "No hostname found for machine: ~a" machine-name)
+          #f)))
+
+      (begin
+        (log-error "Invalid machine name: ~a" machine-name)
+        #f)))
+```
+
+### 3. CLI Integration
+Update the main.scm dispatcher to include reboot command:
+
+```scheme
+;; main.scm (additions to command dispatcher)
+(use-modules ;; ...existing modules...
+             (lab reboot))
+
+;; Add to dispatch-command function
+(define (dispatch-command command args)
+  "Dispatch command with appropriate handler"
+  (match command
+    ;; ...existing cases...
+
+    ;; reboot <machine>: exits non-zero on a usage error or a failed reboot so
+    ;; that shell callers (e.g. `if lab reboot host; then ...`) can detect it.
+    ('reboot
+     (cond
+      ((null? args)
+       (log-error "reboot command requires machine name")
+       (format #t "Usage: lab reboot <machine>\n")
+       (exit 2))
+      ((reboot-machine (car args))
+       (log-success "Reboot initiated"))
+      (else
+       (log-error "Reboot failed")
+       (exit 1))))
+
+    ;; ...rest of existing cases...
+    ))
+
+;; Update help text to include reboot command
+;; Returns the help text as one string literal: usage line, command list and
+;; examples. The `reboot <machine>` entry and the `lab reboot sleeper-service`
+;; example document the new reboot command.
+(define (get-help-text)
+  "Pure function returning help text"
+  "Home Lab Tool - K.I.S.S Refactored Edition
+
+USAGE: lab <command> [args...]
+
+COMMANDS:
+  status              Show infrastructure status
+  machines            List all machines  
+  deploy <machine>    Deploy configuration to machine
+  deploy-all          Deploy to all machines
+  update              Update flake inputs
+  health [machine]    Check machine health (all if no machine specified)
+  ssh <machine>       SSH to machine
+  reboot <machine>    Reboot machine via SSH
+  test-modules        Test modular implementation
+  help                Show this help
+
+EXAMPLES:
+  lab status
+  lab machines
+  lab deploy congenital-optimist
+  lab deploy-all
+  lab update
+  lab health
+  lab health sleeper-service
+  lab ssh sleeper-service
+  lab reboot sleeper-service
+  lab test-modules
+")
+```
+
+### 4. Configuration
+Enable the service on this machine (the orchestrator):
+
+```nix
+# hosts/this-machine/configuration.nix
+# Pulls in the orchestrator module and switches on the nightly lab update.
+{
+  imports = [ ../../nix/modules/lab-orchestrator.nix ];
+
+  services.lab-orchestrator.enable = true;
+  services.lab-orchestrator.schedule = "02:00";  # 2 AM start
+  services.lab-orchestrator.user = "geir";
+}
+```
+
+## Timeline Breakdown
+
+### Nightly Execution (Starting 2:00 AM)
+```
+02:00 - Start orchestration
+02:00-02:15 - Update flake inputs (lab update)
+02:15-02:45 - Deploy to all machines (lab deploy-all)  
+02:45 - Reboot sleeper-service
+02:55 - Reboot grey-area (10 min later)
+03:05 - Reboot reverse-proxy (10 min later)
+03:15 - Reboot orchestrator machine (10 min later)
+03:20 - All machines back online and updated
+```
+
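+### Total Duration: ~1 hour 20 minutes
+- Flake update: ~15 minutes
+- Deployment: ~30 minutes
+- Staggered reboots: ~35 minutes (three 10-minute gaps plus ~5 minutes for the orchestrator to come back up)
+- Everything done by 3:20 AM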
+
+## Safety Features
+
+### Logging and Monitoring
+```bash
+# Check orchestrator logs
+sudo journalctl -u lab-orchestrator.service -f
+
+# Check orchestrator log file
+tail -f /var/log/lab-orchestrator.log
+
+# Check timer status
+systemctl status lab-orchestrator.timer
+```
+
+### Manual Controls
+```bash
+# Start update manually
+sudo systemctl start lab-orchestrator.service
+
+# Disable automatic updates
+sudo systemctl disable lab-orchestrator.timer
+
+# Check when next run is scheduled
+systemctl list-timers lab-orchestrator.timer
+```
+
+### Recovery Options
+```bash
+# If orchestration fails, machines can be individually managed
+lab deploy sleeper-service
+lab deploy grey-area
+lab deploy reverse-proxy
+
+# Emergency reboot sequence
+lab reboot sleeper-service
+sleep 600
+lab reboot grey-area
+sleep 600
+lab reboot reverse-proxy
+```
+
+## Machine Configuration Requirements
+
+### SSH Key Setup
+Ensure this machine can SSH to all target machines:
+```bash
+# Test connectivity
+ssh root@sleeper-service "echo 'Connection OK'"
+ssh root@grey-area "echo 'Connection OK'"  
+ssh root@reverse-proxy "echo 'Connection OK'"
+```
+
+### Lab Tool Configuration
+Ensure lab.yaml includes all machines:
+```yaml
+machines:
+  sleeper-service:
+    host: sleeper-service.local
+    user: root
+  grey-area:
+    host: grey-area.local
+    user: root
+  reverse-proxy:
+    host: reverse-proxy.local
+    user: root
+```
+
+## Deployment Steps
+
+### 1. Create the Service Module
+Add the Nix module file and import it
+
+### 2. Extend Lab Tool
+Add reboot command functionality
+
+### 3. Test Components
+```bash
+# Build the lab tool first
+cd /home/geir/Home-lab
+nix build .#lab-tool
+
+# Test lab commands work
+./result/bin/lab update
+./result/bin/lab deploy-all
+./result/bin/lab machines
+./result/bin/lab reboot sleeper-service  # Test reboot (be careful!)
+```
+
+### 4. Enable Service
+```bash
+# Add to configuration and rebuild
+nixos-rebuild switch
+
+# Verify timer is active
+systemctl status lab-orchestrator.timer
+```
+
+### 5. Monitor First Run
+```bash
+# Watch the logs during first execution
+sudo journalctl -u lab-orchestrator.service -f
+```
+
+## Benefits
+
+### Morning Routine
+- Wake up to fully updated homelab
+- All services running latest versions
+- No manual intervention needed
+- Predictable update schedule
+
+### Reliability
+- Uses existing, tested lab tool commands
+- Proper error handling and logging
+- Graceful degradation if individual reboots fail
+- Easy to disable or modify timing
+
+### Visibility
+- Comprehensive logging of entire process
+- Clear timestamps for each phase
+- Easy troubleshooting if issues occur
+
+This gives you the "wake up to fresh lab" experience with minimal complexity, leveraging your existing infrastructure!