home-lab/packages/home-lab-tools.nix
Geir Okkenhaug Jerstad 39df6f2fcc Fix SSH connectivity in lab tool using admin aliases
- Updated lab status command to use admin SSH aliases (admin-sleeper, admin-grey, admin-reverse)
- Fixed SSH authentication issues by using correct admin keys
- Improved verbose mode to show detailed connection attempts
- Updated legacy deployment to use admin aliases for consistency
- Now properly connects to sleeper-service and grey-area via admin access
- reverse-proxy showing as unreachable due to fail2ban (expected security behavior)

Resolves SSH connectivity issues that were blocking task completion assessment.
2025-06-15 11:18:13 +02:00

396 lines
12 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
lib,
stdenv,
writeShellScriptBin,
rsync,
openssh,
...
}:
writeShellScriptBin "lab" ''
#!/usr/bin/env bash
# Home-lab administration tools
# Deploy and manage NixOS configurations across home lab infrastructure
set -euo pipefail
# Configuration
HOMELAB_ROOT="/home/geir/Home-lab"
TEMP_CONFIG_DIR="/tmp/home-lab-config"
# Color output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
log() {
echo -e "''${BLUE}[lab]''${NC} $1"
}
success() {
echo -e "''${GREEN}[lab]''${NC} $1"
}
warn() {
echo -e "''${YELLOW}[lab]''${NC} $1"
}
error() {
echo -e "''${RED}[lab]''${NC} $1" >&2
}
# Deployment function
deploy_machine() {
local machine="$1"
local mode="''${2:-boot}" # boot, test, or switch
case "$machine" in
"congenital-optimist")
# Local deployment - no SSH needed
log "Deploying $machine (mode: $mode) locally"
# Deploy the configuration locally
log "Running nixos-rebuild $mode locally..."
if ! sudo nixos-rebuild $mode --flake "$HOMELAB_ROOT#$machine"; then
error "Failed to deploy configuration to $machine"
exit 1
fi
success "Successfully deployed $machine"
return 0
;;
"sleeper-service")
local target_host="admin-sleeper"
;;
"grey-area")
local target_host="admin-grey"
;;
"reverse-proxy")
local target_host="admin-reverse"
;;
*)
error "Unknown machine: $machine"
error "Available machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
exit 1
;;
esac
log "Deploying $machine (mode: $mode)"
# Sync configuration to target machine
log "Syncing configuration to $target_host..."
if ! ${rsync}/bin/rsync -av --delete "$HOMELAB_ROOT/" "$target_host:$TEMP_CONFIG_DIR/"; then
error "Failed to sync configuration to $machine"
exit 1
fi
# Deploy the configuration
log "Running nixos-rebuild $mode on $machine..."
if ! ${openssh}/bin/ssh "$target_host" "cd $TEMP_CONFIG_DIR && sudo nixos-rebuild $mode --flake .#$machine"; then
error "Failed to deploy configuration to $machine"
exit 1
fi
success "Successfully deployed $machine"
}
# Deploy with deploy-rs function
deploy_rs_machine() {
local machine="$1"
local dry_run="''${2:-false}"
log "Using deploy-rs for $machine deployment"
cd "$HOMELAB_ROOT"
if [[ "$dry_run" == "true" ]]; then
log "Running dry-run deployment..."
if ! nix run github:serokell/deploy-rs -- ".#$machine" --dry-activate; then
error "Deploy-rs dry-run failed for $machine"
return 1
fi
success "Deploy-rs dry-run completed for $machine"
else
log "Running actual deployment..."
if ! nix run github:serokell/deploy-rs -- ".#$machine"; then
error "Deploy-rs deployment failed for $machine"
return 1
fi
success "Deploy-rs deployment completed for $machine"
fi
}
# Update flake inputs function
update_flake() {
log "Updating flake inputs..."
cd "$HOMELAB_ROOT"
if ! nix flake update; then
error "Failed to update flake inputs"
return 1
fi
log "Checking updated flake configuration..."
if ! nix flake check; then
error "Flake check failed after update"
return 1
fi
success "Flake inputs updated successfully"
# Show what changed
log "Flake lock changes:"
git diff --no-index /dev/null flake.lock | grep "+" | head -10 || true
}
# Hybrid update: flake update + deploy-rs deployment
hybrid_update() {
local target="''${1:-all}"
local dry_run="''${2:-false}"
log "Starting hybrid update process (target: $target, dry-run: $dry_run)"
# Step 1: Update flake inputs
if ! update_flake; then
error "Failed to update flake - aborting hybrid update"
return 1
fi
# Step 2: Deploy with deploy-rs
if [[ "$target" == "all" ]]; then
local machines=("sleeper-service" "grey-area" "reverse-proxy" "congenital-optimist")
local failed_machines=()
for machine in "''${machines[@]}"; do
log "Deploying updated configuration to $machine..."
if deploy_rs_machine "$machine" "$dry_run"; then
success " $machine updated successfully"
else
error " Failed to update $machine"
failed_machines+=("$machine")
fi
echo ""
done
if [[ ''${#failed_machines[@]} -eq 0 ]]; then
success "All machines updated successfully with hybrid approach!"
else
error "Failed to update: ''${failed_machines[*]}"
return 1
fi
else
deploy_rs_machine "$target" "$dry_run"
fi
}
# Update all machines function (legacy method)
update_all_machines() {
local mode="''${1:-boot}" # boot, test, or switch
local machines=("congenital-optimist" "sleeper-service" "grey-area" "reverse-proxy")
local failed_machines=()
log "Starting update of all machines (mode: $mode) - using legacy method"
for machine in "''${machines[@]}"; do
log "Updating $machine..."
if deploy_machine "$machine" "$mode"; then
success " $machine updated successfully"
else
error " Failed to update $machine"
failed_machines+=("$machine")
fi
echo "" # Add spacing between machines
done
if [[ ''${#failed_machines[@]} -eq 0 ]]; then
success "All machines updated successfully!"
else
error "Failed to update: ''${failed_machines[*]}"
exit 1
fi
}
# Show deployment status
show_status() {
log "Home-lab infrastructure status:"
# Check congenital-optimist (local)
if /run/current-system/sw/bin/systemctl is-active --quiet tailscaled; then
success " congenital-optimist: Online (local)"
else
warn " congenital-optimist: Tailscale inactive"
fi
# Check if -v (verbose) flag is passed
local verbose=0
if [[ "''${1:-}" == "-v" ]]; then
verbose=1
fi
# Check remote machines
for machine in sleeper-service grey-area reverse-proxy; do
# Set admin alias for SSH connection
local admin_alias
case "$machine" in
"sleeper-service")
admin_alias="admin-sleeper"
tailscale_hostname="sleeper-service.tail807ea.ts.net"
;;
"grey-area")
admin_alias="admin-grey"
tailscale_hostname="grey-area.tail807ea.ts.net"
;;
"reverse-proxy")
admin_alias="admin-reverse"
tailscale_hostname="reverse-proxy.tail807ea.ts.net"
;;
esac
# Test SSH connectivity with debug info if in verbose mode
if [[ $verbose -eq 1 ]]; then
log "Testing SSH connection to $admin_alias (admin alias)..."
${openssh}/bin/ssh -v -o ConnectTimeout=5 -o BatchMode=yes "$admin_alias" "echo SSH connection to $admin_alias successful" 2>&1
log "Testing SSH connection to sma@$tailscale_hostname (Tailscale direct)..."
${openssh}/bin/ssh -v -o ConnectTimeout=5 -o BatchMode=yes -i ~/.ssh/id_ed25519_admin "sma@$tailscale_hostname" "echo SSH connection to $tailscale_hostname successful" 2>&1
fi
# Try admin alias first (should work for all machines)
if ${openssh}/bin/ssh -o ConnectTimeout=3 -o BatchMode=yes "$admin_alias" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (admin access)"
# Fallback to direct Tailscale connection with admin key
elif ${openssh}/bin/ssh -o ConnectTimeout=5 -o BatchMode=yes -i ~/.ssh/id_ed25519_admin "sma@$tailscale_hostname" "echo OK" >/dev/null 2>&1; then
success " $machine: Online (Tailscale)"
else
warn " $machine: Unreachable"
if [[ $verbose -eq 1 ]]; then
log " Note: Tried both admin alias ($admin_alias) and direct Tailscale connection"
log " Check if machine is online and SSH service is running"
fi
fi
done
}
# Main command handling
case "''${1:-}" in
"deploy")
if [[ $# -lt 2 ]]; then
error "Usage: lab deploy <machine> [mode]"
error "Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
error "Modes: boot (default), test, switch"
exit 1
fi
machine="$2"
mode="''${3:-boot}"
if [[ ! "$mode" =~ ^(boot|test|switch)$ ]]; then
error "Invalid mode: $mode. Use boot, test, or switch"
exit 1
fi
deploy_machine "$machine" "$mode"
;;
"deploy-rs")
if [[ $# -lt 2 ]]; then
error "Usage: lab deploy-rs <machine> [--dry-run]"
error "Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
exit 1
fi
machine="$2"
dry_run="false"
if [[ "''${3:-}" == "--dry-run" ]]; then
dry_run="true"
fi
deploy_rs_machine "$machine" "$dry_run"
;;
"update-flake")
update_flake
;;
"hybrid-update")
target="''${2:-all}"
dry_run="false"
if [[ "''${3:-}" == "--dry-run" ]]; then
dry_run="true"
fi
hybrid_update "$target" "$dry_run"
;;
"status")
show_status "''${2:-}"
;;
"update")
mode="''${2:-boot}"
if [[ ! "$mode" =~ ^(boot|test|switch)$ ]]; then
error "Invalid mode: $mode. Use boot, test, or switch"
exit 1
fi
update_all_machines "$mode"
;;
*)
echo "Home-lab Management Tool"
echo ""
echo "Usage: lab <command> [options]"
echo ""
echo "Available commands:"
echo " deploy <machine> [mode] - Deploy configuration to a machine (legacy method)"
echo " Machines: congenital-optimist, sleeper-service, grey-area, reverse-proxy"
echo " Modes: boot (default), test, switch"
echo " deploy-rs <machine> [opts] - Deploy using deploy-rs (modern method)"
echo " Options: --dry-run"
echo " update [mode] - Update all machines (legacy method)"
echo " Modes: boot (default), test, switch"
echo " update-flake - Update flake inputs and check configuration"
echo " hybrid-update [target] [opts] - Update flake + deploy with deploy-rs"
echo " Target: machine name or 'all' (default)"
echo " Options: --dry-run"
echo " status - Check infrastructure connectivity"
echo ""
echo "Deployment Methods:"
echo " Legacy (SSH + rsync): Reliable, tested, slower"
echo " Deploy-rs: Modern, automatic rollback, parallel deployment"
echo " Hybrid: Combines flake updates with deploy-rs safety"
echo ""
echo "Ollama AI Tools (when available):"
echo " ollama-cli <command> - Manage Ollama service and models"
echo " monitor-ollama [opts] - Monitor Ollama service health"
echo ""
echo "Examples:"
echo " # Legacy deployment"
echo " lab deploy sleeper-service boot # Deploy and set for next boot"
echo " lab deploy grey-area switch # Deploy and switch immediately"
echo " lab update boot # Update all machines for next boot"
echo ""
echo " # Modern deploy-rs deployment"
echo " lab deploy-rs sleeper-service # Deploy with automatic rollback"
echo " lab deploy-rs grey-area --dry-run # Test deployment without applying"
echo ""
echo " # Hybrid approach (recommended for updates)"
echo " lab hybrid-update sleeper-service # Update flake + deploy specific machine"
echo " lab hybrid-update all --dry-run # Test update all machines"
echo " lab update-flake # Just update flake inputs"
echo ""
echo " # Status and monitoring"
echo " lab status # Check all machines"
echo ""
echo " # Ollama AI tools"
echo " ollama-cli status # Check Ollama service status"
echo " ollama-cli models # List installed AI models"
echo " monitor-ollama --test-inference # Full Ollama health check"
;;
esac
''