home-lab/research/lab-orchestrator-service.md
2025-06-20 15:32:34 +02:00

12 KiB

Lab-Wide Auto-Update Service with Staggered Reboots

Overview

A NixOS service that runs on this machine (orchestrator) to update the entire homelab using existing lab tool commands, then perform staggered reboots to ensure you wake up to a freshly updated lab every morning.

Service Architecture

Central Orchestrator Approach

  • Runs on this machine (the controller)
  • Uses existing lab update and lab deploy-all commands
  • Orchestrates staggered reboots: sleeper-service → grey-area → reverse-proxy → self
  • A 10-minute delay between consecutive machine reboots

Implementation

1. Nix Service Module

# /home/geir/Home-lab/nix/modules/lab-orchestrator.nix
#
# NixOS module: nightly lab-wide update followed by staggered reboots.
# A systemd timer starts a oneshot service that runs `lab update`,
# `lab deploy-all`, reboots the remote machines one after another and finally
# reboots the orchestrator itself.
{ config, lib, pkgs, ... }:

with lib;

let
  cfg = config.services.lab-orchestrator;

  labPath = cfg.labPath;

  # Remote machines to reboot, in order. `delay` is the offset in seconds,
  # measured from the start of the reboot phase, at which the machine is
  # rebooted. The script below is generated from this list, so this is the
  # single place to change the order or the spacing.
  rebootSequence = [
    { machine = "sleeper-service"; delay = 0; }
    { machine = "grey-area"; delay = 600; }       # 10 minutes
    { machine = "reverse-proxy"; delay = 1200; }  # 20 minutes total
  ];

  # Offset (same time base as above) at which the orchestrator reboots itself.
  selfRebootDelay = 1800;  # 30 minutes total

  # One `reboot_remote <machine> <offset>` shell call per remote machine.
  rebootSteps = concatMapStrings
    (step: "reboot_remote ${escapeShellArg step.machine} ${toString step.delay}\n")
    rebootSequence;

  # writeShellScript already emits the interpreter line, so the script body
  # starts directly with the shell options.
  orchestratorScript = pkgs.writeShellScript "lab-orchestrator" ''
    set -euo pipefail

    LOG_FILE="/var/log/lab-orchestrator.log"
    LAB_TOOL="${labPath}/result/bin/lab"

    # Timestamped message to stdout (journal) and to the log file.
    log() {
      echo "$(date '+%Y-%m-%d %H:%M:%S'): $1" | tee -a "$LOG_FILE"
    }

    # Seconds already waited since the reboot phase began.
    elapsed=0

    # wait_until NAME OFFSET: sleep until OFFSET seconds into the reboot phase.
    wait_until() {
      local target="$1" offset="$2"
      local pause=$(( offset - elapsed ))
      if (( pause > 0 )); then
        log "Waiting $(( pause / 60 )) minutes before rebooting $target..."
        sleep "$pause"
        elapsed="$offset"
      fi
    }

    # reboot_remote MACHINE OFFSET: wait for the machine's slot, then reboot it.
    # A failed reboot is logged as a warning and does not abort the sequence.
    reboot_remote() {
      local machine="$1" offset="$2"
      wait_until "$machine" "$offset"
      log "Rebooting $machine..."
      if "$LAB_TOOL" reboot "$machine"; then
        log "✓ $machine reboot initiated"
      else
        log "WARNING: Failed to reboot $machine"
      fi
    }

    # Ensure lab tool is available; build it once if the result link is missing
    if [[ ! -x "$LAB_TOOL" ]]; then
      log "WARNING: Lab tool not found at $LAB_TOOL"
      log "Building lab tool first..."
      cd "${labPath}"
      if ! nix build .#lab-tool || [[ ! -x "$LAB_TOOL" ]]; then
        log "ERROR: Failed to build lab tool"
        exit 1
      fi
    fi

    log "=== Starting Lab-Wide Update Orchestration ==="

    # Step 1: Update flake inputs
    log "Updating flake inputs..."
    cd "${labPath}"
    if ! "$LAB_TOOL" update; then
      log "ERROR: Failed to update flake inputs"
      exit 1
    fi
    log "Flake inputs updated successfully"

    # Step 2: Deploy to all machines
    log "Deploying to all machines..."
    if ! "$LAB_TOOL" deploy-all; then
      log "ERROR: Failed to deploy to all machines"
      exit 1
    fi
    log "Deployment completed successfully"

    # Step 3: Staggered reboots of the remote machines
    log "Starting staggered reboot sequence..."
    ${rebootSteps}
    # Step 4: this machine goes last. The reboot itself is issued by the
    # unit's ExecStartPost, because this script runs as an unprivileged user.
    wait_until self ${toString selfRebootDelay}
    log "Rebooting this machine (orchestrator)..."
    log "=== Lab Update Orchestration Completed ==="
  '';

in
{
  options.services.lab-orchestrator = {
    enable = mkEnableOption "Lab orchestrator auto-update service";

    schedule = mkOption {
      # Reject anything that would produce an invalid OnCalendar expression.
      type = types.strMatching "([01][0-9]|2[0-3]):[0-5][0-9]";
      default = "02:00";
      description = "Time to start lab update (HH:MM format)";
    };

    user = mkOption {
      type = types.str;
      default = "geir";
      description = "User to run the lab tool as";
    };

    labPath = mkOption {
      type = types.str;
      default = "/home/geir/Home-lab";
      description = "Path of the Home-lab checkout that contains the flake and the built lab tool";
    };
  };

  config = mkIf cfg.enable {
    systemd.services.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator";
      # The unit's default PATH does not contain these; the script calls
      # `nix build` directly and the lab tool is expected to need git and ssh.
      path = [ config.nix.package pkgs.git pkgs.openssh ];
      serviceConfig = {
        Type = "oneshot";
        User = cfg.user;
        Group = "users";
        WorkingDirectory = labPath;
        ExecStart = "${orchestratorScript}";
        # Runs only after ExecStart succeeded. The "+" prefix executes the
        # command with full privileges, which the unprivileged service user
        # does not have for a reboot.
        ExecStartPost = "+${config.systemd.package}/bin/systemctl --no-block reboot";
        # Give it plenty of time (2 hours)
        TimeoutStartSec = 7200;
      };
      # Ensure network is ready
      after = [ "network-online.target" ];
      wants = [ "network-online.target" ];
    };

    systemd.timers.lab-orchestrator = {
      description = "Lab-Wide Update Orchestrator Timer";
      timerConfig = {
        OnCalendar = "*-*-* ${cfg.schedule}:00";
        # Do not catch up on a missed run: a catch-up would update and reboot
        # the whole lab at whatever time the orchestrator next boots.
        Persistent = false;
        # No randomization - we want predictable timing
      };
      wantedBy = [ "timers.target" ];
    };

    # Ensure log directory and file exist with proper permissions
    systemd.tmpfiles.rules = [
      "f /var/log/lab-orchestrator.log 0644 ${cfg.user} users -"
    ];
  };
}

2. Lab Tool Reboot Command Extension

Add reboot capability to the existing Guile lab tool:

;; lab/reboot.scm - New module for machine reboots
(define-module (lab reboot)
  #:use-module (ice-9 format)
  #:use-module (ice-9 popen)
  #:use-module (ice-9 rdelim)
  #:use-module (utils logging)
  #:use-module (lab machines)
  #:export (reboot-machine))

;; Text the remote side prints immediately before rebooting. Seeing it in the
;; captured output proves that the SSH login succeeded and the command ran,
;; which separates "connection dropped because the host is going down" from
;; "could not connect at all" (ssh exits with 255 in both cases).
(define %reboot-marker "lab-reboot-requested")

(define (execute-ssh-command hostname command)
  "Run COMMAND on HOSTNAME as root over SSH.
Returns two values: the command's standard output as a string, and the exit
status of ssh (#f if ssh was terminated by a signal).  A caller that only
wants the output can keep using the first value."
  ;; open-pipe* passes the arguments to ssh directly, so HOSTNAME and COMMAND
  ;; are never interpreted by a local shell.  BatchMode makes ssh fail instead
  ;; of prompting for a password; ConnectTimeout bounds an unreachable host.
  (let* ((port (open-pipe* OPEN_READ "ssh"
                           "-o" "BatchMode=yes"
                           "-o" "ConnectTimeout=10"
                           (string-append "root@" hostname)
                           command))
         (output (read-string port))
         (status (close-pipe port)))
    (values output (status:exit-val status))))

(define (reboot-machine machine-name)
  "Reboot MACHINE-NAME, locally or via SSH.
Returns #t when the reboot was initiated and #f when the name is invalid, no
hostname is known, or the reboot command failed."
  (log-info "Attempting to reboot machine: ~a" machine-name)

  (if (not (validate-machine-name machine-name))
      (begin
        (log-error "Invalid machine name: ~a" machine-name)
        #f)
      (let* ((ssh-config (get-ssh-config machine-name))
             ;; Without an SSH config entry, fall back to the machine name.
             (hostname (if ssh-config
                           (assoc-ref ssh-config 'hostname)
                           machine-name))
             (is-local (if ssh-config
                           (assoc-ref ssh-config 'is-local)
                           #f)))
        (cond
         (is-local
          (log-info "Rebooting local machine...")
          ;; eqv? rather than zero?: the exit value is #f when the command
          ;; was killed by a signal.
          (let ((ok (eqv? 0 (status:exit-val
                             (system* "sudo" "systemctl" "reboot")))))
            (unless ok
              (log-error "Failed to reboot ~a: local reboot command failed"
                         machine-name))
            ok))

         (hostname
          (log-info "Rebooting ~a via SSH..." hostname)
          (catch #t
            (lambda ()
              (call-with-values
                  (lambda ()
                    (execute-ssh-command
                     hostname
                     (string-append "echo " %reboot-marker
                                    " && sudo systemctl reboot")))
                (lambda (output status)
                  (cond
                   ((eqv? status 0)
                    (log-success "Reboot command sent to ~a" machine-name)
                    #t)
                   ;; ssh reports 255 when the connection is lost; that is
                   ;; expected here only if the remote command had started.
                   ((and (eqv? status 255)
                         (string? output)
                         (string-contains output %reboot-marker))
                    (log-info "Connection dropped (expected during reboot)")
                    #t)
                   (else
                    (log-error "Failed to reboot ~a: ssh exited with status ~a"
                               machine-name status)
                    #f)))))
            (lambda (key . args)
              ;; Anything thrown while spawning or reading from ssh.
              (log-error "Failed to reboot ~a: ~a" machine-name args)
              #f)))

         (else
          (log-error "No hostname found for machine: ~a" machine-name)
          #f)))))

3. CLI Integration

Update the main.scm dispatcher to include reboot command:

;; main.scm (additions to command dispatcher)
(use-modules ;; ...existing modules...
             (lab reboot))

;; Add to dispatch-command function
(define (dispatch-command command args)
  "Dispatch command with appropriate handler"
  (match command
    ;; ...existing cases...

    ;; reboot <machine>: exits with status 1 on a usage error or a failed
    ;; reboot, so that shell callers testing `lab reboot <machine>` (such as
    ;; the orchestrator script) can detect the failure.
    ('reboot
     (cond
      ((null? args)
       (log-error "reboot command requires machine name")
       (format #t "Usage: lab reboot <machine>\n")
       (exit 1))
      ((reboot-machine (car args))
       (log-success "Reboot initiated"))
      (else
       (log-error "Reboot failed")
       (exit 1))))

    ;; ...rest of existing cases...
    ))

;; Help text, now listing the reboot command. The text is assembled from one
;; string per output line; every line, including the last, ends in a newline.
(define (get-help-text)
  "Return the usage/help text of the lab tool as a single string."
  (string-join
   (list "Home Lab Tool - K.I.S.S Refactored Edition"
         ""
         "USAGE: lab <command> [args...]"
         ""
         "COMMANDS:"
         "  status              Show infrastructure status"
         "  machines            List all machines  "
         "  deploy <machine>    Deploy configuration to machine"
         "  deploy-all          Deploy to all machines"
         "  update              Update flake inputs"
         "  health [machine]    Check machine health (all if no machine specified)"
         "  ssh <machine>       SSH to machine"
         "  reboot <machine>    Reboot machine via SSH"
         "  test-modules        Test modular implementation"
         "  help                Show this help"
         ""
         "EXAMPLES:"
         "  lab status"
         "  lab machines"
         "  lab deploy congenital-optimist"
         "  lab deploy-all"
         "  lab update"
         "  lab health"
         "  lab health sleeper-service"
         "  lab ssh sleeper-service"
         "  lab reboot sleeper-service"
         "  lab test-modules")
   "\n"
   'suffix))

4. Configuration

Enable the service on this machine (the orchestrator):

# hosts/this-machine/configuration.nix
# Imports the orchestrator module and switches the nightly run on.
{
  imports = [ ../../nix/modules/lab-orchestrator.nix ];

  services.lab-orchestrator.enable = true;
  services.lab-orchestrator.schedule = "02:00";  # 2 AM start
  services.lab-orchestrator.user = "geir";
}

Timeline Breakdown

Nightly Execution (Starting 2:00 AM)

02:00 - Start orchestration
02:00-02:15 - Update flake inputs (lab update)
02:15-02:45 - Deploy to all machines (lab deploy-all)  
02:45 - Reboot sleeper-service
02:55 - Reboot grey-area (10 min later)
03:05 - Reboot reverse-proxy (10 min later)
03:15 - Reboot orchestrator machine (10 min later)
03:20 - All machines back online and updated

Total Duration: ~1 hour 20 minutes

  • Update and deployment: ~45 minutes
  • Staggered reboots: ~35 minutes
  • Everything done by 3:20 AM

Safety Features

Logging and Monitoring

# Follow the service's journal output
sudo journalctl --follow --unit=lab-orchestrator.service

# Follow the orchestrator's own log file
tail --follow /var/log/lab-orchestrator.log

# Show the timer's current state
systemctl status lab-orchestrator.timer

Manual Controls

# Start update manually
sudo systemctl start lab-orchestrator.service

# Pause automatic updates until the next boot or nixos-rebuild.
# `systemctl disable` cannot modify units managed by NixOS; to turn the
# service off permanently, set services.lab-orchestrator.enable = false
# and run nixos-rebuild switch.
sudo systemctl stop lab-orchestrator.timer

# Check when next run is scheduled
systemctl list-timers lab-orchestrator.timer

Recovery Options

# Redeploy the machines one at a time if the orchestration run failed
for machine in sleeper-service grey-area reverse-proxy; do
  lab deploy "$machine"
done

# Manual staggered reboot, 10 minutes apart
lab reboot sleeper-service
sleep 10m
lab reboot grey-area
sleep 10m
lab reboot reverse-proxy

Machine Configuration Requirements

SSH Key Setup

Ensure this machine can SSH to all target machines:

# Test connectivity. BatchMode makes ssh fail instead of prompting for a
# password, so the test only passes if key-based login works, which is
# what the unattended service needs.
for host in sleeper-service grey-area reverse-proxy; do
  ssh -o BatchMode=yes "root@$host" "echo 'Connection OK'"
done

Lab Tool Configuration

Ensure lab.yaml includes all machines:

# One entry per machine: the host to connect to and the login user.
machines:
  sleeper-service: { host: sleeper-service.local, user: root }
  grey-area: { host: grey-area.local, user: root }
  reverse-proxy: { host: reverse-proxy.local, user: root }

Deployment Steps

1. Create the Service Module

Add the Nix module file and import it

2. Extend Lab Tool

Add reboot command functionality

3. Test Components

# Build the lab tool, then run each command the orchestrator relies on
cd /home/geir/Home-lab
nix build .#lab-tool

lab=./result/bin/lab
"$lab" update
"$lab" deploy-all
"$lab" machines
"$lab" reboot sleeper-service  # Test reboot (be careful!)

4. Enable Service

# Add to configuration and rebuild (switching the system configuration
# requires root)
sudo nixos-rebuild switch

# Verify timer is active
systemctl status lab-orchestrator.timer

5. Monitor First Run

# Follow the journal while the first scheduled run executes
sudo journalctl --follow --unit=lab-orchestrator.service

Benefits

Morning Routine

  • Wake up to fully updated homelab
  • All services running latest versions
  • No manual intervention needed
  • Predictable update schedule

Reliability

  • Uses existing, tested lab tool commands
  • Proper error handling and logging
  • Graceful degradation if individual reboots fail
  • Easy to disable or modify timing

Visibility

  • Comprehensive logging of entire process
  • Clear timestamps for each phase
  • Easy troubleshooting if issues occur

This gives you the "wake up to fresh lab" experience with minimal complexity, leveraging your existing infrastructure!