455 lines
No EOL
12 KiB
Markdown
455 lines
No EOL
12 KiB
Markdown
# Lab-Wide Auto-Update Service with Staggered Reboots
|
|
|
|
## Overview
|
|
A NixOS service that runs on this machine (orchestrator) to update the entire homelab using existing lab tool commands, then perform staggered reboots to ensure you wake up to a freshly updated lab every morning.
|
|
|
|
## Service Architecture
|
|
|
|
### Central Orchestrator Approach
|
|
- Runs on this machine (the controller)
|
|
- Uses existing `lab update` and `lab deploy-all` commands
|
|
- Orchestrates staggered reboots: sleeper-service → grey-area → reverse-proxy → self
|
|
- 10-minute delays between each machine reboot
|
|
|
|
## Implementation
|
|
|
|
### 1. Nix Service Module
|
|
```nix
|
|
# /home/geir/Home-lab/nix/modules/lab-orchestrator.nix
|
|
{ config, lib, pkgs, ... }:
|
|
|
|
with lib;
|
|
|
|
let
|
|
cfg = config.services.lab-orchestrator;
|
|
|
|
labPath = "/home/geir/Home-lab";
|
|
|
|
# Machine reboot order with delays
|
|
rebootSequence = [
|
|
{ machine = "sleeper-service"; delay = 0; }
|
|
{ machine = "grey-area"; delay = 600; } # 10 minutes
|
|
{ machine = "reverse-proxy"; delay = 1200; } # 20 minutes total
|
|
{ machine = "self"; delay = 1800; } # 30 minutes total
|
|
];
|
|
|
|
orchestratorScript = pkgs.writeShellScript "lab-orchestrator" ''
|
|
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
LOG_FILE="/var/log/lab-orchestrator.log"
|
|
LAB_TOOL="${labPath}/result/bin/lab"
|
|
|
|
log() {
|
|
echo "$(date '+%Y-%m-%d %H:%M:%S'): $1" | tee -a "$LOG_FILE"
|
|
}
|
|
|
|
# Ensure lab tool is available
|
|
if [[ ! -x "$LAB_TOOL" ]]; then
|
|
log "ERROR: Lab tool not found at $LAB_TOOL"
|
|
log "Building lab tool first..."
|
|
cd "${labPath}"
|
|
if ! nix build .#lab-tool; then
|
|
log "ERROR: Failed to build lab tool"
|
|
exit 1
|
|
fi
|
|
fi
|
|
|
|
log "=== Starting Lab-Wide Update Orchestration ==="
|
|
|
|
# Step 1: Update flake inputs
|
|
log "Updating flake inputs..."
|
|
cd "${labPath}"
|
|
if ! $LAB_TOOL update; then
|
|
log "ERROR: Failed to update flake inputs"
|
|
exit 1
|
|
fi
|
|
log "Flake inputs updated successfully"
|
|
|
|
# Step 2: Deploy to all machines
|
|
log "Deploying to all machines..."
|
|
if ! $LAB_TOOL deploy-all; then
|
|
log "ERROR: Failed to deploy to all machines"
|
|
exit 1
|
|
fi
|
|
log "Deployment completed successfully"
|
|
|
|
# Step 3: Staggered reboots
|
|
log "Starting staggered reboot sequence..."
|
|
|
|
# Reboot sleeper-service immediately
|
|
log "Rebooting sleeper-service..."
|
|
if $LAB_TOOL reboot sleeper-service; then
|
|
log "✓ sleeper-service reboot initiated"
|
|
else
|
|
log "WARNING: Failed to reboot sleeper-service"
|
|
fi
|
|
|
|
# Wait 10 minutes, then reboot grey-area
|
|
log "Waiting 10 minutes before rebooting grey-area..."
|
|
sleep 600
|
|
log "Rebooting grey-area..."
|
|
if $LAB_TOOL reboot grey-area; then
|
|
log "✓ grey-area reboot initiated"
|
|
else
|
|
log "WARNING: Failed to reboot grey-area"
|
|
fi
|
|
|
|
# Wait 10 minutes, then reboot reverse-proxy
|
|
log "Waiting 10 minutes before rebooting reverse-proxy..."
|
|
sleep 600
|
|
log "Rebooting reverse-proxy..."
|
|
if $LAB_TOOL reboot reverse-proxy; then
|
|
log "✓ reverse-proxy reboot initiated"
|
|
else
|
|
log "WARNING: Failed to reboot reverse-proxy"
|
|
fi
|
|
|
|
# Wait 10 minutes, then reboot self
|
|
log "Waiting 10 minutes before rebooting self..."
|
|
sleep 600
|
|
log "Rebooting this machine (orchestrator)..."
|
|
log "=== Lab Update Orchestration Completed ==="
|
|
|
|
# Reboot this machine
|
|
systemctl reboot
|
|
'';
|
|
|
|
in
|
|
{
|
|
options.services.lab-orchestrator = {
|
|
enable = mkEnableOption "Lab orchestrator auto-update service";
|
|
|
|
schedule = mkOption {
|
|
type = types.str;
|
|
default = "02:00";
|
|
description = "Time to start lab update (HH:MM format)";
|
|
};
|
|
|
|
user = mkOption {
|
|
type = types.str;
|
|
default = "geir";
|
|
description = "User to run the lab tool as";
|
|
};
|
|
};
|
|
|
|
config = mkIf cfg.enable {
|
|
systemd.services.lab-orchestrator = {
|
|
description = "Lab-Wide Update Orchestrator";
|
|
serviceConfig = {
|
|
Type = "oneshot";
|
|
User = cfg.user;
|
|
Group = "users";
|
|
WorkingDirectory = labPath;
|
|
ExecStart = "${orchestratorScript}";
|
|
# Give it plenty of time (2 hours)
|
|
TimeoutStartSec = 7200;
|
|
};
|
|
# Ensure network is ready
|
|
after = [ "network-online.target" ];
|
|
wants = [ "network-online.target" ];
|
|
};
|
|
|
|
systemd.timers.lab-orchestrator = {
|
|
description = "Lab-Wide Update Orchestrator Timer";
|
|
timerConfig = {
|
|
OnCalendar = "*-*-* ${cfg.schedule}:00";
|
|
Persistent = true;
|
|
# No randomization - we want predictable timing
|
|
};
|
|
wantedBy = [ "timers.target" ];
|
|
};
|
|
|
|
# Ensure log directory and file exist with proper permissions
|
|
systemd.tmpfiles.rules = [
|
|
"f /var/log/lab-orchestrator.log 0644 ${cfg.user} users -"
|
|
];
|
|
};
|
|
}
|
|
```
|
|
|
|
### 2. Lab Tool Reboot Command Extension
|
|
Add reboot capability to the existing Guile lab tool:
|
|
|
|
```scheme
|
|
;; lab/reboot.scm - New module for machine reboots
|
|
(define-module (lab reboot)
|
|
#:use-module (ice-9 format)
|
|
#:use-module (ice-9 popen)
|
|
#:use-module (utils logging)
|
|
#:use-module (lab machines)
|
|
#:export (reboot-machine))
|
|
|
|
(define (execute-ssh-command hostname command)
|
|
"Execute command on remote machine via SSH"
|
|
(let* ((ssh-cmd (format #f "ssh root@~a '~a'" hostname command))
|
|
(port (open-input-pipe ssh-cmd))
|
|
(output (read-string port)))
|
|
(close-pipe port)
|
|
output))
|
|
|
|
(define (reboot-machine machine-name)
|
|
"Reboot a specific machine via SSH"
|
|
(log-info "Attempting to reboot machine: ~a" machine-name)
|
|
|
|
(if (validate-machine-name machine-name)
|
|
(let* ((ssh-config (get-ssh-config machine-name))
|
|
(hostname (if ssh-config
|
|
(assoc-ref ssh-config 'hostname)
|
|
machine-name))
|
|
(is-local (if ssh-config
|
|
(assoc-ref ssh-config 'is-local)
|
|
#f)))
|
|
|
|
(cond
|
|
(is-local
|
|
(log-info "Rebooting local machine...")
|
|
(system "sudo systemctl reboot")
|
|
#t)
|
|
|
|
(hostname
|
|
(log-info "Rebooting ~a via SSH..." hostname)
|
|
(catch #t
|
|
(lambda ()
|
|
;; Send reboot command - connection will drop
|
|
(execute-ssh-command hostname "sudo systemctl reboot")
|
|
(log-success "Reboot command sent to ~a" machine-name)
|
|
#t)
|
|
(lambda (key . args)
|
|
;; SSH connection drop is expected during reboot
|
|
(if (string-contains (format #f "~a" args) "Connection")
|
|
(begin
|
|
(log-info "Connection dropped (expected during reboot)")
|
|
#t)
|
|
(begin
|
|
(log-error "Failed to reboot ~a: ~a" machine-name args)
|
|
#f)))))
|
|
|
|
(else
|
|
(log-error "No hostname found for machine: ~a" machine-name)
|
|
#f)))
|
|
|
|
(begin
|
|
(log-error "Invalid machine name: ~a" machine-name)
|
|
#f)))
|
|
```
|
|
|
|
### 3. CLI Integration
|
|
Update the main.scm dispatcher to include reboot command:
|
|
|
|
```scheme
|
|
;; main.scm (additions to command dispatcher)
|
|
(use-modules ;; ...existing modules...
|
|
(lab reboot))
|
|
|
|
;; Add to dispatch-command function
|
|
(define (dispatch-command command args)
|
|
"Dispatch command with appropriate handler"
|
|
(match command
|
|
;; ...existing cases...
|
|
|
|
('reboot
|
|
(if (null? args)
|
|
(begin
|
|
(log-error "reboot command requires machine name")
|
|
(format #t "Usage: lab reboot <machine>\n"))
|
|
(let ((result (reboot-machine (car args))))
|
|
(if result
|
|
(log-success "Reboot initiated")
|
|
(log-error "Reboot failed")))))
|
|
|
|
;; ...rest of existing cases...
|
|
))
|
|
|
|
;; Update help text to include reboot command
|
|
(define (get-help-text)
|
|
"Pure function returning help text"
|
|
"Home Lab Tool - K.I.S.S Refactored Edition
|
|
|
|
USAGE: lab <command> [args...]
|
|
|
|
COMMANDS:
|
|
status Show infrastructure status
|
|
machines List all machines
|
|
deploy <machine> Deploy configuration to machine
|
|
deploy-all Deploy to all machines
|
|
update Update flake inputs
|
|
health [machine] Check machine health (all if no machine specified)
|
|
ssh <machine> SSH to machine
|
|
reboot <machine> Reboot machine via SSH
|
|
test-modules Test modular implementation
|
|
help Show this help
|
|
|
|
EXAMPLES:
|
|
lab status
|
|
lab machines
|
|
lab deploy congenital-optimist
|
|
lab deploy-all
|
|
lab update
|
|
lab health
|
|
lab health sleeper-service
|
|
lab ssh sleeper-service
|
|
lab reboot sleeper-service
|
|
lab test-modules
|
|
")
|
|
|
|
### 4. Configuration
|
|
Enable the service on this machine (the orchestrator):
|
|
|
|
```nix
|
|
# hosts/this-machine/configuration.nix
|
|
{
|
|
imports = [
|
|
../../nix/modules/lab-orchestrator.nix
|
|
];
|
|
|
|
services.lab-orchestrator = {
|
|
enable = true;
|
|
schedule = "02:00"; # 2 AM start
|
|
user = "geir";
|
|
};
|
|
}
|
|
```
|
|
|
|
## Timeline Breakdown
|
|
|
|
### Nightly Execution (Starting 2:00 AM)
|
|
```
|
|
02:00 - Start orchestration
|
|
02:00-02:15 - Update flake inputs (lab update)
|
|
02:15-02:45 - Deploy to all machines (lab deploy-all)
|
|
02:45 - Reboot sleeper-service
|
|
02:55 - Reboot grey-area (10 min later)
|
|
03:05 - Reboot reverse-proxy (10 min later)
|
|
03:15 - Reboot orchestrator machine (10 min later)
|
|
03:20 - All machines back online and updated
|
|
```
|
|
|
|
### Total Duration: ~1 hour 20 minutes
|
|
- Deployment: ~30 minutes
|
|
- Staggered reboots: ~50 minutes
|
|
- Everything done by 3:20 AM
|
|
|
|
## Safety Features
|
|
|
|
### Logging and Monitoring
|
|
```bash
|
|
# Check orchestrator logs
|
|
sudo journalctl -u lab-orchestrator.service -f
|
|
|
|
# Check orchestrator log file
|
|
tail -f /var/log/lab-orchestrator.log
|
|
|
|
# Check timer status
|
|
systemctl status lab-orchestrator.timer
|
|
```
|
|
|
|
### Manual Controls
|
|
```bash
|
|
# Start update manually
|
|
sudo systemctl start lab-orchestrator.service
|
|
|
|
# Disable automatic updates
|
|
sudo systemctl disable lab-orchestrator.timer
|
|
|
|
# Check when next run is scheduled
|
|
systemctl list-timers lab-orchestrator.timer
|
|
```
|
|
|
|
### Recovery Options
|
|
```bash
|
|
# If orchestration fails, machines can be individually managed
|
|
lab deploy sleeper-service
|
|
lab deploy grey-area
|
|
lab deploy reverse-proxy
|
|
|
|
# Emergency reboot sequence
|
|
lab reboot sleeper-service
|
|
sleep 600
|
|
lab reboot grey-area
|
|
sleep 600
|
|
lab reboot reverse-proxy
|
|
```
|
|
|
|
## Machine Configuration Requirements
|
|
|
|
### SSH Key Setup
|
|
Ensure this machine can SSH to all target machines:
|
|
```bash
|
|
# Test connectivity
|
|
ssh root@sleeper-service "echo 'Connection OK'"
|
|
ssh root@grey-area "echo 'Connection OK'"
|
|
ssh root@reverse-proxy "echo 'Connection OK'"
|
|
```
|
|
|
|
### Lab Tool Configuration
|
|
Ensure lab.yaml includes all machines:
|
|
```yaml
|
|
machines:
|
|
sleeper-service:
|
|
host: sleeper-service.local
|
|
user: root
|
|
grey-area:
|
|
host: grey-area.local
|
|
user: root
|
|
reverse-proxy:
|
|
host: reverse-proxy.local
|
|
user: root
|
|
```
|
|
|
|
## Deployment Steps
|
|
|
|
### 1. Create the Service Module
|
|
Add the Nix module file and import it
|
|
|
|
### 2. Extend Lab Tool
|
|
Add reboot command functionality
|
|
|
|
### 3. Test Components
|
|
```bash
|
|
# Build the lab tool first
|
|
cd /home/geir/Home-lab
|
|
nix build .#lab-tool
|
|
|
|
# Test lab commands work
|
|
./result/bin/lab update
|
|
./result/bin/lab deploy-all
|
|
./result/bin/lab machines
|
|
./result/bin/lab reboot sleeper-service # Test reboot (be careful!)
|
|
```
|
|
|
|
### 4. Enable Service
|
|
```bash
|
|
# Add to configuration and rebuild
|
|
nixos-rebuild switch
|
|
|
|
# Verify timer is active
|
|
systemctl status lab-orchestrator.timer
|
|
```
|
|
|
|
### 5. Monitor First Run
|
|
```bash
|
|
# Watch the logs during first execution
|
|
sudo journalctl -u lab-orchestrator.service -f
|
|
```
|
|
|
|
## Benefits
|
|
|
|
### Morning Routine
|
|
- Wake up to fully updated homelab
|
|
- All services running latest versions
|
|
- No manual intervention needed
|
|
- Predictable update schedule
|
|
|
|
### Reliability
|
|
- Uses existing, tested lab tool commands
|
|
- Proper error handling and logging
|
|
- Graceful degradation if individual reboots fail
|
|
- Easy to disable or modify timing
|
|
|
|
### Visibility
|
|
- Comprehensive logging of entire process
|
|
- Clear timestamps for each phase
|
|
- Easy troubleshooting if issues occur
|
|
|
|
This gives you the "wake up to fresh lab" experience with minimal complexity, leveraging your existing infrastructure! |