home-lab/packages/home-lab-tools.nix
Geir Okkenhaug Jerstad 9f7c2640b5 feat: Complete deploy-rs integration with status monitoring
 Completed Tasks:
- Task 6: Successfully tested deploy-rs on all machines (grey-area, reverse-proxy, congenital-optimist)
- Task 7: Added deploy-rs status monitoring to lab tool

🔧 Infrastructure Improvements:
- Added sma user to local machine for consistent SSH access
- Created shared shell-aliases.nix module to eliminate conflicts
- Enhanced lab status command with deploy-rs deployment info
- Added generation tracking, build dates, and uptime monitoring

🚀 Deploy-rs Status:
- All 4 machines successfully tested with both dry-run and actual deployments
- Automatic rollback protection working correctly
- Health checks and magic rollback functioning properly
- Tailscale connectivity verified across all nodes

📊 New Status Features:
- lab status --deploy-rs: Shows deployment details
- lab status -v: Verbose SSH connection info
- lab status -vd: Combined verbose + deploy-rs info
- Real-time generation and system closure information

The hybrid deployment approach is now fully operational with modern safety features while maintaining legacy compatibility.
2025-06-15 10:51:36 +02:00

471 lines
16 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
lib,
stdenv,
writeShellScriptBin,
rsync,
openssh,
...
}:
writeShellScriptBin "lab" ''
#!/usr/bin/env bash
# Home-lab administration tools
# Deploy and manage NixOS configurations across home lab infrastructure
set -euo pipefail
# Configuration
HOMELAB_ROOT="/home/geir/Home-lab"
TEMP_CONFIG_DIR="/tmp/home-lab-config"
# Color output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
log() {
echo -e "''${BLUE}[lab]''${NC} $1"
}
success() {
echo -e "''${GREEN}[lab]''${NC} $1"
}
warn() {
echo -e "''${YELLOW}[lab]''${NC} $1"
}
error() {
echo -e "''${RED}[lab]''${NC} $1" >&2
}
# Deployment function
deploy_machine() {
local machine="$1"
local mode="''${2:-boot}" # boot, test, or switch
case "$machine" in
"congenital-optimist")
# Local deployment - no SSH needed
log "Deploying $machine (mode: $mode) locally"
# Deploy the configuration locally
log "Running nixos-rebuild $mode locally..."
if ! sudo nixos-rebuild $mode --flake "$HOMELAB_ROOT#$machine"; then
error "Failed to deploy configuration to $machine"
exit 1
fi
success "Successfully deployed $machine"
return 0
;;
"sleeper-service")
local target_host="sma@sleeper-service"
;;
"grey-area")
local target_host="sma@grey-area"
;;
"reverse-proxy")
local target_host="sma@reverse-proxy.tail807ea.ts.net"
;;
*)
error "Unknown machine: $machine"
error "Available machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
exit 1
;;
esac
log "Deploying $machine (mode: $mode)"
# Sync configuration to target machine
log "Syncing configuration to $target_host..."
if ! ${rsync}/bin/rsync -av --delete "$HOMELAB_ROOT/" "$target_host:$TEMP_CONFIG_DIR/"; then
error "Failed to sync configuration to $machine"
exit 1
fi
# Deploy the configuration
log "Running nixos-rebuild $mode on $machine..."
if ! ${openssh}/bin/ssh "$target_host" "cd $TEMP_CONFIG_DIR && sudo nixos-rebuild $mode --flake .#$machine"; then
error "Failed to deploy configuration to $machine"
exit 1
fi
success "Successfully deployed $machine"
}
# Deploy with deploy-rs function
deploy_rs_machine() {
local machine="$1"
local dry_run="''${2:-false}"
log "Using deploy-rs for $machine deployment"
cd "$HOMELAB_ROOT"
if [[ "$dry_run" == "true" ]]; then
log "Running dry-run deployment..."
if ! nix run github:serokell/deploy-rs -- ".#$machine" --dry-activate; then
error "Deploy-rs dry-run failed for $machine"
return 1
fi
success "Deploy-rs dry-run completed for $machine"
else
log "Running actual deployment..."
if ! nix run github:serokell/deploy-rs -- ".#$machine"; then
error "Deploy-rs deployment failed for $machine"
return 1
fi
success "Deploy-rs deployment completed for $machine"
fi
}
# Update flake inputs function
update_flake() {
log "Updating flake inputs..."
cd "$HOMELAB_ROOT"
if ! nix flake update; then
error "Failed to update flake inputs"
return 1
fi
log "Checking updated flake configuration..."
if ! nix flake check; then
error "Flake check failed after update"
return 1
fi
success "Flake inputs updated successfully"
# Show what changed
log "Flake lock changes:"
git diff --no-index /dev/null flake.lock | grep "+" | head -10 || true
}
# Hybrid update: flake update + deploy-rs deployment
hybrid_update() {
local target="''${1:-all}"
local dry_run="''${2:-false}"
log "Starting hybrid update process (target: $target, dry-run: $dry_run)"
# Step 1: Update flake inputs
if ! update_flake; then
error "Failed to update flake - aborting hybrid update"
return 1
fi
# Step 2: Deploy with deploy-rs
if [[ "$target" == "all" ]]; then
local machines=("sleeper-service" "grey-area" "reverse-proxy" "congenital-optimist")
local failed_machines=()
for machine in "''${machines[@]}"; do
log "Deploying updated configuration to $machine..."
if deploy_rs_machine "$machine" "$dry_run"; then
success " $machine updated successfully"
else
error " Failed to update $machine"
failed_machines+=("$machine")
fi
echo ""
done
if [[ ''${#failed_machines[@]} -eq 0 ]]; then
success "All machines updated successfully with hybrid approach!"
else
error "Failed to update: ''${failed_machines[*]}"
return 1
fi
else
deploy_rs_machine "$target" "$dry_run"
fi
}
# Update all machines function (legacy method)
update_all_machines() {
local mode="''${1:-boot}" # boot, test, or switch
local machines=("congenital-optimist" "sleeper-service" "grey-area" "reverse-proxy")
local failed_machines=()
log "Starting update of all machines (mode: $mode) - using legacy method"
for machine in "''${machines[@]}"; do
log "Updating $machine..."
if deploy_machine "$machine" "$mode"; then
success " $machine updated successfully"
else
error " Failed to update $machine"
failed_machines+=("$machine")
fi
echo "" # Add spacing between machines
done
if [[ ''${#failed_machines[@]} -eq 0 ]]; then
success "All machines updated successfully!"
else
error "Failed to update: ''${failed_machines[*]}"
exit 1
fi
}
# Show deployment status
show_status() {
log "Home-lab infrastructure status:"
# Check if -v (verbose) flag is passed for deploy-rs details
local verbose=0
local show_deploy_rs=0
for arg in "$@"; do
case "$arg" in
"-v"|"--verbose") verbose=1 ;;
"--deploy-rs") show_deploy_rs=1 ;;
"-vd"|"--verbose-deploy-rs") verbose=1; show_deploy_rs=1 ;;
esac
done
# Check congenital-optimist (local)
if /run/current-system/sw/bin/systemctl is-active --quiet tailscaled; then
success " congenital-optimist: Online (local)"
if [[ $show_deploy_rs -eq 1 ]]; then
show_machine_deploy_info "congenital-optimist" "local"
fi
else
warn " congenital-optimist: Tailscale inactive"
fi
# Check remote machines
for machine in sleeper-service grey-area reverse-proxy; do
local ssh_user="sma" # Using sma as the admin user for remote machines
local connection_type=""
# Test SSH connectivity with debug info if in verbose mode
if [[ $verbose -eq 1 ]]; then
log "Testing SSH connection to $machine (LAN)..."
${openssh}/bin/ssh -v -o ConnectTimeout=5 -o BatchMode=yes "$ssh_user@$machine" "echo SSH connection to $machine successful" 2>&1
# Use specific hostname for reverse-proxy
if [[ "$machine" == "reverse-proxy" ]]; then
log "Testing SSH connection to reverse-proxy.tail807ea.ts.net (Tailscale)..."
${openssh}/bin/ssh -v -o ConnectTimeout=5 -o BatchMode=yes "$ssh_user@reverse-proxy.tail807ea.ts.net" "echo SSH connection to reverse-proxy.tail807ea.ts.net successful" 2>&1
else
log "Testing SSH connection to $machine.tailnet (Tailscale)..."
${openssh}/bin/ssh -v -o ConnectTimeout=5 -o BatchMode=yes "$ssh_user@$machine.tailnet" "echo SSH connection to $machine.tailnet successful" 2>&1
fi
fi
# For reverse-proxy, try Tailscale first as it's likely only accessible that way
if [[ "$machine" == "reverse-proxy" ]]; then
# Use the specific Tailscale hostname for reverse-proxy
if ${openssh}/bin/ssh -o ConnectTimeout=5 -o BatchMode=yes "$ssh_user@reverse-proxy.tail807ea.ts.net" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (Tailscale)"
connection_type="reverse-proxy.tail807ea.ts.net"
elif ${openssh}/bin/ssh -o ConnectTimeout=2 -o BatchMode=yes "$ssh_user@$machine" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (LAN)"
connection_type="$machine"
else
warn " $machine: Unreachable"
if [[ $verbose -eq 1 ]]; then
log " Note: reverse-proxy is likely only accessible via Tailscale"
log " Check if Tailscale is running on both machines and if the SSH service is active"
fi
fi
# For other machines, try LAN first then Tailscale as fallback
else
if ${openssh}/bin/ssh -o ConnectTimeout=2 -o BatchMode=yes "$ssh_user@$machine" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (LAN)"
connection_type="$machine"
# Try with Tailscale hostname as fallback
elif ${openssh}/bin/ssh -o ConnectTimeout=3 -o BatchMode=yes "$ssh_user@$machine.tailnet" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (Tailscale)"
connection_type="$machine.tailnet"
else
warn " $machine: Unreachable"
fi
fi
# Show deploy-rs information if requested and machine is reachable
if [[ $show_deploy_rs -eq 1 && -n "$connection_type" ]]; then
show_machine_deploy_info "$machine" "$connection_type"
fi
done
if [[ $show_deploy_rs -eq 1 ]]; then
echo ""
log "💡 Use 'lab status --deploy-rs' to see deployment details"
log "💡 Use 'lab status -vd' for verbose deploy-rs information"
fi
}
# Show deploy-rs deployment information for a machine
show_machine_deploy_info() {
local machine="$1"
local connection="$2"
if [[ "$connection" == "local" ]]; then
# Local machine - get info directly
local current_gen=$(readlink /nix/var/nix/profiles/system | sed 's/.*system-\([0-9]*\)-link/\1/')
local system_closure=$(readlink -f /run/current-system)
local build_date=$(stat -c %y "$system_closure" 2>/dev/null | cut -d' ' -f1 2>/dev/null || echo "unknown")
echo " 📦 Generation: $current_gen"
echo " 📅 Build Date: $build_date"
echo " 📍 Store Path: $system_closure"
else
# Remote machine - get info via SSH
local ssh_user="sma"
local ssh_host="$connection"
local remote_info=$(${openssh}/bin/ssh -o ConnectTimeout=3 -o BatchMode=yes "$ssh_user@$ssh_host" "
current_gen=\$(readlink /nix/var/nix/profiles/system 2>/dev/null | sed 's/.*system-\([0-9]*\)-link/\1/' 2>/dev/null || echo 'unknown')
system_closure=\$(readlink -f /run/current-system 2>/dev/null || echo 'unknown')
build_date=\$(stat -c %y \$system_closure 2>/dev/null | cut -d' ' -f1 2>/dev/null || echo 'unknown')
uptime=\$(uptime -s 2>/dev/null || echo 'unknown')
echo \"gen:\$current_gen|path:\$system_closure|date:\$build_date|uptime:\$uptime\"
" 2>/dev/null)
if [[ -n "$remote_info" ]]; then
local gen=$(echo "$remote_info" | cut -d'|' -f1 | cut -d':' -f2)
local path=$(echo "$remote_info" | cut -d'|' -f2 | cut -d':' -f2)
local date=$(echo "$remote_info" | cut -d'|' -f3 | cut -d':' -f2)
local uptime=$(echo "$remote_info" | cut -d'|' -f4 | cut -d':' -f2)
echo " 📦 Generation: $gen"
echo " 📅 Build Date: $date"
echo " Boot Time: $uptime"
echo " 📍 Store Path: $(basename "$path")"
else
echo " Unable to retrieve deployment info"
fi
fi
}
# Main command handling
case "''${1:-}" in
"deploy")
if [[ $# -lt 2 ]]; then
error "Usage: lab deploy <machine> [mode]"
error "Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
error "Modes: boot (default), test, switch"
exit 1
fi
machine="$2"
mode="''${3:-boot}"
if [[ ! "$mode" =~ ^(boot|test|switch)$ ]]; then
error "Invalid mode: $mode. Use boot, test, or switch"
exit 1
fi
deploy_machine "$machine" "$mode"
;;
"deploy-rs")
if [[ $# -lt 2 ]]; then
error "Usage: lab deploy-rs <machine> [--dry-run]"
error "Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
exit 1
fi
machine="$2"
dry_run="false"
if [[ "''${3:-}" == "--dry-run" ]]; then
dry_run="true"
fi
deploy_rs_machine "$machine" "$dry_run"
;;
"update-flake")
update_flake
;;
"hybrid-update")
target="''${2:-all}"
dry_run="false"
if [[ "''${3:-}" == "--dry-run" ]]; then
dry_run="true"
fi
hybrid_update "$target" "$dry_run"
;;
"status")
shift # Remove "status" from arguments
show_status "$@" # Pass all remaining arguments to show_status
;;
"update")
mode="''${2:-boot}"
if [[ ! "$mode" =~ ^(boot|test|switch)$ ]]; then
error "Invalid mode: $mode. Use boot, test, or switch"
exit 1
fi
update_all_machines "$mode"
;;
*)
echo "Home-lab Management Tool"
echo ""
echo "Usage: lab <command> [options]"
echo ""
echo "Available commands:"
echo " deploy <machine> [mode] - Deploy configuration to a machine (legacy method)"
echo " Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
echo " Modes: boot (default), test, switch"
echo " deploy-rs <machine> [opts] - Deploy using deploy-rs (modern method)"
echo " Options: --dry-run"
echo " update [mode] - Update all machines (legacy method)"
echo " Modes: boot (default), test, switch"
echo " update-flake - Update flake inputs and check configuration"
echo " hybrid-update [target] [opts] - Update flake + deploy with deploy-rs"
echo " Target: machine name or 'all' (default)"
echo " Options: --dry-run"
echo " status [options] - Check infrastructure connectivity"
echo " Options: -v (verbose), --deploy-rs (show deployment info)"
echo " -vd (verbose + deploy-rs info)"
echo ""
echo "Deployment Methods:"
echo " Legacy (SSH + rsync): Reliable, tested, slower"
echo " Deploy-rs: Modern, automatic rollback, parallel deployment"
echo " Hybrid: Combines flake updates with deploy-rs safety"
echo ""
echo "Ollama AI Tools (when available):"
echo " ollama-cli <command> - Manage Ollama service and models"
echo " monitor-ollama [opts] - Monitor Ollama service health"
echo ""
echo "Examples:"
echo " # Legacy deployment"
echo " lab deploy sleeper-service boot # Deploy and set for next boot"
echo " lab deploy grey-area switch # Deploy and switch immediately"
echo " lab update boot # Update all machines for next boot"
echo ""
echo " # Modern deploy-rs deployment"
echo " lab deploy-rs sleeper-service # Deploy with automatic rollback"
echo " lab deploy-rs grey-area --dry-run # Test deployment without applying"
echo ""
echo " # Hybrid approach (recommended for updates)"
echo " lab hybrid-update sleeper-service # Update flake + deploy specific machine"
echo " lab hybrid-update all --dry-run # Test update all machines"
echo " lab update-flake # Just update flake inputs"
echo ""
echo " # Status and monitoring"
echo " lab status # Check all machines"
echo " lab status --deploy-rs # Show deployment details"
echo " lab status -vd # Verbose with deploy-rs info"
echo ""
echo " # Ollama AI tools"
echo " ollama-cli status # Check Ollama service status"
echo " ollama-cli models # List installed AI models"
echo " monitor-ollama --test-inference # Full Ollama health check"
;;
esac
''