Geir Okkenhaug Jerstad 12fb56f35b some research and loose thoughts

2025-06-20 15:32:34 +02:00

12 KiB

Raw Blame History

Lab-Wide Auto-Update Service with Staggered Reboots

Overview

A NixOS service that runs on this machine (orchestrator) to update the entire homelab using existing lab tool commands, then perform staggered reboots to ensure you wake up to a freshly updated lab every morning.

Service Architecture

Central Orchestrator Approach

Runs on this machine (the controller)
Uses existing lab update and lab deploy-all commands
Orchestrates staggered reboots: sleeper-service → grey-area → reverse-proxy → self
10-minute delays between each machine reboot

Implementation

1. Nix Service Module

# /home/geir/Home-lab/nix/modules/lab-orchestrator.nix
#
# Nightly lab-wide update: refresh the flake inputs, deploy to every machine,
# reboot the remote machines one at a time, then reboot this host.
{ config, lib, pkgs, ... }:

with lib;

let
  cfg = config.services.lab-orchestrator;

  labPath = "/home/geir/Home-lab";

  # Pause between consecutive reboots, in seconds (the unit `sleep` expects).
  rebootDelaySeconds = toString (cfg.rebootDelayMinutes * 60);

  orchestratorScript = pkgs.writeShellScript "lab-orchestrator" ''
    # writeShellScript already emits the interpreter line; none is needed here.
    set -euo pipefail

    LOG_FILE="/var/log/lab-orchestrator.log"
    LAB_TOOL="${labPath}/result/bin/lab"

    # Timestamped message to both stdout (journal) and the log file.
    log() {
      echo "$(date '+%Y-%m-%d %H:%M:%S'): $1" | tee -a "$LOG_FILE"
    }

    # Announce and sit out the stagger delay before rebooting machine $1.
    wait_before() {
      log "Waiting ${toString cfg.rebootDelayMinutes} minutes before rebooting $1..."
      sleep ${rebootDelaySeconds}
    }

    # Ask the lab tool to reboot machine $1. A failure is only a warning so
    # that one unreachable machine does not stop the rest of the sequence.
    reboot_remote() {
      log "Rebooting $1..."
      if "$LAB_TOOL" reboot "$1"; then
        log "✓ $1 reboot initiated"
      else
        log "WARNING: Failed to reboot $1"
      fi
    }

    # Ensure lab tool is available
    if [[ ! -x "$LAB_TOOL" ]]; then
      log "ERROR: Lab tool not found at $LAB_TOOL"
      log "Building lab tool first..."
      cd "${labPath}"
      if ! nix build .#lab-tool; then
        log "ERROR: Failed to build lab tool"
        exit 1
      fi
    fi

    log "=== Starting Lab-Wide Update Orchestration ==="

    # Step 1: Update flake inputs
    log "Updating flake inputs..."
    cd "${labPath}"
    if ! "$LAB_TOOL" update; then
      log "ERROR: Failed to update flake inputs"
      exit 1
    fi
    log "Flake inputs updated successfully"

    # Step 2: Deploy to all machines
    log "Deploying to all machines..."
    if ! "$LAB_TOOL" deploy-all; then
      log "ERROR: Failed to deploy to all machines"
      exit 1
    fi
    log "Deployment completed successfully"

    # Step 3: Staggered reboots. The first machine goes down immediately,
    # every later one after the configured delay.
    log "Starting staggered reboot sequence..."

    is_first=1
    for machine in ${escapeShellArgs cfg.rebootOrder}; do
      if [[ "$is_first" -eq 0 ]]; then
        wait_before "$machine"
      fi
      is_first=0
      reboot_remote "$machine"
    done

    # This host goes last. The reboot itself is issued by the unit's
    # ExecStartPost, because this script runs as an unprivileged user.
    wait_before "self"
    log "Rebooting this machine (orchestrator)..."
    log "=== Lab Update Orchestration Completed ==="
  '';

in
{
  options.services.lab-orchestrator = {
    enable = mkEnableOption "Lab orchestrator auto-update service";

    schedule = mkOption {
      type = types.str;
      default = "02:00";
      description = "Time to start lab update (HH:MM format)";
    };

    user = mkOption {
      type = types.str;
      default = "geir";
      description = "User to run the lab tool as";
    };

    rebootOrder = mkOption {
      type = types.listOf types.str;
      default = [ "sleeper-service" "grey-area" "reverse-proxy" ];
      description = ''
        Remote machines to reboot, in order. The orchestrator itself is
        always rebooted last and must not be listed here.
      '';
    };

    rebootDelayMinutes = mkOption {
      type = types.ints.unsigned;
      default = 10;
      description = "Minutes to wait between consecutive machine reboots";
    };
  };

  config = mkIf cfg.enable {
    systemd.services.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator";

      # Ensure network is ready
      after = [ "network-online.target" ];
      wants = [ "network-online.target" ];

      # NixOS units start with a minimal PATH (coreutils, systemd, ...).
      # The script calls `nix build` directly; git and ssh are added on the
      # assumption that flake evaluation and the lab tool's remote commands
      # need them -- trim if the lab tool wraps its own dependencies.
      path = [ config.nix.package pkgs.git pkgs.openssh ];

      serviceConfig = {
        Type = "oneshot";
        User = cfg.user;
        Group = "users";
        WorkingDirectory = labPath;
        ExecStart = "${orchestratorScript}";
        # Runs only when ExecStart succeeded. The "+" prefix executes the
        # command with full privileges regardless of User=, which an
        # unprivileged `systemctl reboot` without a login session lacks.
        ExecStartPost = "+${config.systemd.package}/bin/systemctl reboot";
        # Give it plenty of time (2 hours)
        TimeoutStartSec = 7200;
      };
    };

    systemd.timers.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator Timer";
      timerConfig = {
        OnCalendar = "*-*-* ${cfg.schedule}:00";
        # Do not replay a missed run at boot: that would update and reboot
        # the whole lab at whatever time this host happens to be started.
        Persistent = false;
        # No randomization - we want predictable timing
      };
      wantedBy = [ "timers.target" ];
    };

    # Ensure the log file exists with proper permissions
    systemd.tmpfiles.rules = [
      "f /var/log/lab-orchestrator.log 0644 ${cfg.user} users -"
    ];
  };
}

2. Lab Tool Reboot Command Extension

Add reboot capability to the existing Guile lab tool:

;; lab/reboot.scm - New module for machine reboots
(define-module (lab reboot)
  #:use-module (ice-9 format)
  #:use-module (ice-9 popen)
  #:use-module (ice-9 rdelim)
  #:use-module (utils logging)
  #:use-module (lab machines)
  #:export (reboot-machine))

(define (execute-ssh-command hostname command)
  "Run COMMAND as root on HOSTNAME over SSH.

Returns two values: the text the remote command wrote to standard output,
and the raw wait status from `close-pipe' (decode with `status:exit-val')."
  ;; open-pipe* passes ssh its arguments as a list, so no local shell parses
  ;; HOSTNAME or COMMAND and quotes inside COMMAND cannot break the call.
  (let* ((port (open-pipe* OPEN_READ
                           "ssh"
                           (string-append "root@" hostname)
                           command))
         (output (read-string port))
         (status (close-pipe port)))
    (values output status)))

(define (reboot-machine machine-name)
  "Reboot MACHINE-NAME, locally or over SSH depending on its SSH config.

Returns #t when the reboot command was issued (or the SSH session was cut
off, which a rebooting host is expected to do) and #f when the machine name
is invalid, no hostname is known, or the command reported failure."
  (log-info "Attempting to reboot machine: ~a" machine-name)

  (if (validate-machine-name machine-name)
      (let* ((ssh-config (get-ssh-config machine-name))
             ;; Fall back to the machine name when there is no SSH config.
             (hostname (if ssh-config
                           (assoc-ref ssh-config 'hostname)
                           machine-name))
             (is-local (and ssh-config
                            (assoc-ref ssh-config 'is-local))))

        (cond
         (is-local
          (log-info "Rebooting local machine...")
          (let ((code (status:exit-val (system "sudo systemctl reboot"))))
            (or (eqv? code 0)
                (begin
                  (log-error "Local reboot command failed with status ~a" code)
                  #f))))

         (hostname
          (log-info "Rebooting ~a via SSH..." hostname)
          (catch #t
            (lambda ()
              (call-with-values
                  (lambda ()
                    (execute-ssh-command hostname "sudo systemctl reboot"))
                (lambda (output status)
                  ;; status:exit-val is #f when ssh was killed by a signal.
                  (let ((code (status:exit-val status)))
                    (cond
                     ((eqv? code 0)
                      (log-success "Reboot command sent to ~a" machine-name)
                      #t)
                     ;; ssh exits 255 when the connection goes away, which is
                     ;; what a rebooting host does. NOTE: 255 is also used
                     ;; when the connection could not be established at all,
                     ;; so this case cannot prove the host is going down.
                     ((eqv? code 255)
                      (log-info "Connection dropped (expected during reboot)")
                      #t)
                     (else
                      (log-error "Failed to reboot ~a: ssh exited with status ~a"
                                 machine-name code)
                      #f))))))
            ;; Only reached when ssh could not be run at all; a dropped
            ;; connection shows up as an exit status, not as an exception.
            (lambda (key . args)
              (log-error "Failed to reboot ~a: ~a" machine-name args)
              #f)))

         (else
          (log-error "No hostname found for machine: ~a" machine-name)
          #f)))

      (begin
        (log-error "Invalid machine name: ~a" machine-name)
        #f)))

3. CLI Integration

Update the main.scm dispatcher to include the reboot command:

;; main.scm (additions to command dispatcher)
(use-modules ;; ...existing modules...
             (lab reboot))

;; Add to dispatch-command function
(define (dispatch-command command args)
  "Dispatch command with appropriate handler"
  (match command
    ;; ...existing cases...
    
    ;; `lab reboot <machine>`: only the first element of ARGS is used as the
    ;; machine name. With no arguments an error is logged and a usage line
    ;; is printed instead.
    ('reboot
     (if (null? args)
         (begin
           (log-error "reboot command requires machine name")
           (format #t "Usage: lab reboot <machine>\n"))
         ;; reboot-machine's result is used here only to pick the log message.
         ;; NOTE(review): neither failure path visibly produces a non-zero
         ;; exit status, while the orchestrator shell script branches on the
         ;; exit code of `lab reboot` -- confirm how the caller of
         ;; dispatch-command turns this result into the process exit status.
         (let ((result (reboot-machine (car args))))
           (if result
               (log-success "Reboot initiated")
               (log-error "Reboot failed")))))
    
    ;; ...rest of existing cases...
    ))

;; Update help text to include reboot command
(define (get-help-text)
  "Return the CLI usage text (command list and examples) as one string."
  "Home Lab Tool - K.I.S.S Refactored Edition

USAGE: lab <command> [args...]

COMMANDS:
  status              Show infrastructure status
  machines            List all machines  
  deploy <machine>    Deploy configuration to machine
  deploy-all          Deploy to all machines
  update              Update flake inputs
  health [machine]    Check machine health (all if no machine specified)
  ssh <machine>       SSH to machine
  reboot <machine>    Reboot machine via SSH
  test-modules        Test modular implementation
  help                Show this help

EXAMPLES:
  lab status
  lab machines
  lab deploy congenital-optimist
  lab deploy-all
  lab update
  lab health
  lab health sleeper-service
  lab ssh sleeper-service
  lab reboot sleeper-service
  lab test-modules
")

### 4. Configuration
Enable the service on this machine (the orchestrator):

```nix
# hosts/this-machine/configuration.nix
# Pulls in the orchestrator module and switches it on for this host.
{
  imports = [ ../../nix/modules/lab-orchestrator.nix ];

  services.lab-orchestrator.enable = true;
  services.lab-orchestrator.schedule = "02:00"; # nightly start time (HH:MM)
  services.lab-orchestrator.user = "geir";
}

```

Timeline Breakdown

Nightly Execution (Starting 2:00 AM)

02:00 - Start orchestration
02:00-02:15 - Update flake inputs (lab update)
02:15-02:45 - Deploy to all machines (lab deploy-all)  
02:45 - Reboot sleeper-service
02:55 - Reboot grey-area (10 min later)
03:05 - Reboot reverse-proxy (10 min later)
03:15 - Reboot orchestrator machine (10 min later)
03:20 - All machines back online and updated

Total Duration: ~1 hour 20 minutes

Flake update: ~15 minutes
Deployment: ~30 minutes
Staggered reboots: ~35 minutes (02:45 to 03:20)
Everything done by 3:20 AM

Safety Features

Logging and Monitoring

# Check orchestrator logs
sudo journalctl -u lab-orchestrator.service -f

# Check orchestrator log file
tail -f /var/log/lab-orchestrator.log

# Check timer status
systemctl status lab-orchestrator.timer

Manual Controls

# Start update manually
sudo systemctl start lab-orchestrator.service

# Pause automatic updates until the next rebuild or reboot.
# NixOS manages unit enablement declaratively, so `systemctl disable` cannot
# remove the timer; to turn it off permanently set
# services.lab-orchestrator.enable = false and run nixos-rebuild switch.
sudo systemctl stop lab-orchestrator.timer

# Check when next run is scheduled
systemctl list-timers lab-orchestrator.timer

Recovery Options

# If orchestration fails, machines can be individually managed
lab deploy sleeper-service
lab deploy grey-area
lab deploy reverse-proxy

# Emergency reboot sequence
lab reboot sleeper-service
sleep 600
lab reboot grey-area
sleep 600
lab reboot reverse-proxy

Machine Configuration Requirements

SSH Key Setup

Ensure this machine can SSH to all target machines:

# Test connectivity
ssh root@sleeper-service "echo 'Connection OK'"
ssh root@grey-area "echo 'Connection OK'"  
ssh root@reverse-proxy "echo 'Connection OK'"

Lab Tool Configuration

Ensure lab.yaml includes all machines:

machines:
  sleeper-service:
    host: sleeper-service.local
    user: root
  grey-area:
    host: grey-area.local
    user: root
  reverse-proxy:
    host: reverse-proxy.local
    user: root

Deployment Steps

1. Create the Service Module

Add the Nix module file and import it

2. Extend Lab Tool

Add reboot command functionality

3. Test Components

# Build the lab tool first
cd /home/geir/Home-lab
nix build .#lab-tool

# Test lab commands work
./result/bin/lab update
./result/bin/lab deploy-all
./result/bin/lab machines
./result/bin/lab reboot sleeper-service  # Test reboot (be careful!)

4. Enable Service

# Add to configuration and rebuild
nixos-rebuild switch

# Verify timer is active
systemctl status lab-orchestrator.timer

5. Monitor First Run

# Watch the logs during first execution
sudo journalctl -u lab-orchestrator.service -f

Benefits

Morning Routine

Wake up to fully updated homelab
All services running latest versions
No manual intervention needed
Predictable update schedule

Reliability

Uses existing, tested lab tool commands
Proper error handling and logging
Graceful degradation if individual reboots fail
Easy to disable or modify timing

Visibility

Comprehensive logging of entire process
Clear timestamps for each phase
Easy troubleshooting if issues occur

This gives you the "wake up to fresh lab" experience with minimal complexity, leveraging your existing infrastructure!

12 KiB Raw Blame History