feat: implement orchestrated auto-update system and fix deployment
- Add sma user module to little-rascal configuration for passwordless deployment
- Replace cosmic-greeter with greetd on both congenital-optimist and little-rascal
- Implement staggered auto-update system that updates remote machines first
- Add proper SSH user configuration for secure deployments
- Fix deployment permission issues by configuring admin user access
- Ensure orchestrator machine (congenital-optimist) reboots last to prevent SSH disconnection
- Add comprehensive error handling and update reporting
- Successfully tested lab tool deployment and auto-update on all machines

Fixes the critical issue where orchestrator reboot could break SSH connections during multi-machine updates.
This commit is contained in:
parent
0465c56305
commit
5f65abc2cc
7 changed files with 161 additions and 39 deletions
6
flake.lock
generated
6
flake.lock
generated
|
@ -70,11 +70,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs_2": {
|
"nixpkgs_2": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1750969886,
|
"lastModified": 1751211869,
|
||||||
"narHash": "sha256-zW/OFnotiz/ndPFdebpo3X0CrbVNf22n4DjN2vxlb58=",
|
"narHash": "sha256-1Cu92i1KSPbhPCKxoiVG5qnoRiKTgR5CcGSRyLpOd7Y=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "a676066377a2fe7457369dd37c31fd2263b662f4",
|
"rev": "b43c397f6c213918d6cfe6e3550abfe79b5d1c51",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
@ -58,7 +58,20 @@
|
||||||
path = "/boot";
|
path = "/boot";
|
||||||
}
|
}
|
||||||
];
|
];
|
||||||
}; # ZFS services for this machine
|
};
|
||||||
|
|
||||||
|
# Display manager - use greetd instead of cosmic-greeter
|
||||||
|
# greetd login manager; the default session runs the tuigreet greeter
# under the dedicated "greeter" account.
services.greetd.enable = true;
services.greetd.settings.default_session = {
  user = "greeter";
  # --time shows a clock on the greeter; --cmd is the command tuigreet
  # launches for the user after a successful login (here: a zsh shell).
  # NOTE(review): this starts zsh rather than a desktop session — confirm
  # that is intended on machines that enable the Cosmic desktop.
  command = "${pkgs.greetd.tuigreet}/bin/tuigreet --time --cmd ${pkgs.zsh}/bin/zsh";
};
|
||||||
|
|
||||||
|
# ZFS services for this machine
|
||||||
services.zfs = {
|
services.zfs = {
|
||||||
autoScrub.enable = true;
|
autoScrub.enable = true;
|
||||||
trim.enable = true;
|
trim.enable = true;
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
|
|
||||||
# Users
|
# Users
|
||||||
../../modules/users/geir.nix
|
../../modules/users/geir.nix
|
||||||
|
../../modules/users/sma.nix
|
||||||
../../modules/users/common.nix
|
../../modules/users/common.nix
|
||||||
../../modules/users/shell-aliases.nix
|
../../modules/users/shell-aliases.nix
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
{ config, pkgs, ... }: {
|
{
|
||||||
|
config,
|
||||||
|
pkgs,
|
||||||
|
...
|
||||||
|
}: {
|
||||||
# Cosmic Desktop Environment (System76's new Rust-based DE)
|
# Cosmic Desktop Environment (System76's new Rust-based DE)
|
||||||
services.desktopManager.cosmic.enable = true;
|
services.desktopManager.cosmic.enable = true;
|
||||||
services.displayManager.cosmic-greeter.enable = true;
|
|
||||||
services.desktopManager.cosmic.xwayland.enable = true;
|
services.desktopManager.cosmic.xwayland.enable = true;
|
||||||
|
|
||||||
# Cosmic-specific packages
|
# Cosmic-specific packages
|
||||||
environment.systemPackages = with pkgs; [
|
environment.systemPackages = with pkgs; [
|
||||||
# Cosmic is still in development, most packages come with the DE
|
# Cosmic is still in development, most packages come with the DE
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
0
packages/claude-code-latest.nix
Normal file
0
packages/claude-code-latest.nix
Normal file
26
packages/lab-tool/config/lab-auto-update.service
Normal file
26
packages/lab-tool/config/lab-auto-update.service
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# systemd unit that runs `lab auto-update` as a one-shot job.
[Unit]
|
||||||
|
Description=Home Lab Auto-Update Service
|
||||||
|
# Order this unit after network-online.target and pull that target in,
# so the update does not start before the network is reported online.
After=network-online.target
|
||||||
|
Wants=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
# oneshot: systemd waits for ExecStart to exit before treating the unit as started.
Type=oneshot
|
||||||
|
User=root
|
||||||
|
# NOTE(review): hard-coded checkout path in another user's home directory —
# the unit fails to start if this directory does not exist on the host.
WorkingDirectory=/home/geir/Home-lab
|
||||||
|
ExecStart=/run/current-system/sw/bin/lab auto-update
|
||||||
|
Environment=HOME=/root
|
||||||
|
# NOTE(review): /usr/bin and /bin are listed in addition to the NixOS system
# profile; confirm they are needed on the target hosts.
Environment=PATH=/run/current-system/sw/bin:/usr/bin:/bin
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
# stdout/stderr go to the journal, tagged with the identifier below
# (query with: journalctl -t lab-auto-update).
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=lab-auto-update
|
||||||
|
|
||||||
|
# Security settings
|
||||||
|
NoNewPrivileges=true
|
||||||
|
# ProtectSystem=false and ProtectHome=false are the systemd defaults, so these
# two lines add no hardening; they only make the lack of sandboxing explicit.
ProtectSystem=false
|
||||||
|
ProtectHome=false
|
||||||
|
PrivateTmp=true
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
# NOTE(review): when enabled, multi-user.target starts this unit on every boot.
# If `lab auto-update` schedules a reboot after a successful update, that could
# repeat on each boot — confirm whether a timer unit should trigger it instead.
WantedBy=multi-user.target
|
|
@ -13,7 +13,15 @@
|
||||||
#:export (auto-update-system
|
#:export (auto-update-system
|
||||||
schedule-auto-update
|
schedule-auto-update
|
||||||
check-update-health
|
check-update-health
|
||||||
auto-update-status))
|
auto-update-status
|
||||||
|
get-update-order
|
||||||
|
update-single-machine))
|
||||||
|
|
||||||
|
;; Helper function for option handling (duplicated from deployment module)
|
||||||
|
(define (option-ref options key default)
  "Return the value stored under KEY in the association list OPTIONS,
or DEFAULT when OPTIONS has no entry for KEY.  An entry whose value is
#f is returned as #f; it does not fall back to DEFAULT."
  ;; Look up the entry pair instead of the value.  `assoc-ref' returns #f
  ;; both for a missing key and for a key explicitly set to #f, so the old
  ;; `(if value value default)' made it impossible to override a #t default
  ;; (e.g. 'auto-reboot) with #f.  `sloppy-assoc' keeps assoc-ref's
  ;; tolerance of malformed option lists.
  (let ((entry (sloppy-assoc key options)))
    (if (pair? entry)
        (cdr entry)
        default)))
|
||||||
|
|
||||||
;; Pure function: Generate update log entry
|
;; Pure function: Generate update log entry
|
||||||
(define (format-update-log-entry timestamp operation status details)
|
(define (format-update-log-entry timestamp operation status details)
|
||||||
|
@ -102,24 +110,80 @@
|
||||||
(lambda (key . args)
|
(lambda (key . args)
|
||||||
(log-error "Failed to write update log: ~a" args)))))
|
(log-error "Failed to write update log: ~a" args)))))
|
||||||
|
|
||||||
;; Impure function: Main auto-update routine
|
;; Determine update order for machines (result depends on get-all-machines and get-hostname)
|
||||||
|
(define (get-update-order)
  "Get machines in update order - orchestrator last"
  (let* ((machines (get-all-machines))
         (this-host (get-hostname))
         ;; Machine identifiers may be symbols or strings; compare them as strings.
         (name-of (lambda (m)
                    (if (symbol? m) (symbol->string m) m)))
         (local-type? (lambda (cfg)
                        (eq? 'local (assoc-ref cfg 'type))))
         ;; Remote: has a config, is not this host, and is not of type 'local.
         (remote? (lambda (m)
                    (let ((cfg (get-machine-config m)))
                      (not (or (not cfg)
                               (equal? (name-of m) this-host)
                               (local-type? cfg))))))
         ;; Local: this host itself, or any machine whose config type is 'local.
         (local? (lambda (m)
                   (let ((cfg (get-machine-config m)))
                     (if (equal? (name-of m) this-host)
                         #t
                         (local-type? cfg))))))
    ;; Remote machines come first, then the local/orchestrator group, so the
    ;; machine driving the update is handled after the remote ones.
    (append (filter remote? machines)
            (filter local? machines))))
|
||||||
|
|
||||||
|
;; Impure function: Update a single machine with error handling
|
||||||
|
(define (update-single-machine machine-name options)
  "Deploy MACHINE-NAME (a symbol or string) in \"switch\" mode, passing
OPTIONS through to deploy-machine.  Returns #t when the deployment
reports success, and #f when it reports failure or raises; every
outcome is logged and recorded via write-update-log."
  ;; The previous version also bound an `is-local' flag from (get-hostname)
  ;; that was never used; it has been removed.
  (let ((machine-str (if (symbol? machine-name)
                         (symbol->string machine-name)
                         machine-name)))

    (log-info "Updating machine: ~a" machine-str)
    (write-update-log "machine-update" "started" machine-str)

    ;; Catch every exception so that one failing machine yields #f instead
    ;; of aborting the caller's loop over the remaining machines.
    (catch #t
      (lambda ()
        (let ((deploy-result (deploy-machine machine-str "switch" options)))
          (if deploy-result
              (begin
                (log-success "Successfully updated ~a" machine-str)
                (write-update-log "machine-update" "success" machine-str)
                #t)
              (begin
                (log-error "Failed to update ~a" machine-str)
                (write-update-log "machine-update" "failed" machine-str)
                #f))))
      (lambda (key . args)
        (log-error "Exception updating ~a: ~a ~a" machine-str key args)
        (write-update-log "machine-update" "error" (format #f "~a: ~a" machine-str key))
        #f))))
|
||||||
|
|
||||||
|
;; Impure function: Orchestrated auto-update routine
|
||||||
(define (auto-update-system . args)
|
(define (auto-update-system . args)
|
||||||
"Perform automatic system update (impure - modifies system)"
|
"Perform orchestrated automatic system update (impure - modifies system)"
|
||||||
(let* ((options (if (null? args) '() (car args)))
|
(let* ((options (if (null? args) '() (car args)))
|
||||||
(auto-reboot (option-ref options 'auto-reboot #t))
|
(auto-reboot (option-ref options 'auto-reboot #t))
|
||||||
(dry-run (option-ref options 'dry-run #f))
|
(dry-run (option-ref options 'dry-run #f))
|
||||||
(machine-name (get-hostname)))
|
(parallel (option-ref options 'parallel #f))
|
||||||
|
(current-machine (get-hostname))
|
||||||
|
(update-order (get-update-order)))
|
||||||
|
|
||||||
(log-info "Starting auto-update for machine: ~a" machine-name)
|
(log-info "Starting orchestrated auto-update from: ~a" current-machine)
|
||||||
(write-update-log "auto-update" "started" machine-name)
|
(log-info "Update order: ~a" (map (lambda (m) (if (symbol? m) (symbol->string m) m)) update-order))
|
||||||
|
(write-update-log "orchestrated-update" "started" current-machine)
|
||||||
|
|
||||||
(if (not (check-update-health))
|
(if (not (check-update-health))
|
||||||
(begin
|
(begin
|
||||||
(log-error "System health check failed - aborting update")
|
(log-error "System health check failed - aborting update")
|
||||||
(write-update-log "auto-update" "aborted" "health check failed")
|
(write-update-log "orchestrated-update" "aborted" "health check failed")
|
||||||
#f)
|
#f)
|
||||||
(begin
|
(begin
|
||||||
;; Update flake inputs
|
;; Update flake inputs first
|
||||||
(log-info "Updating flake inputs...")
|
(log-info "Updating flake inputs...")
|
||||||
(let ((flake-result (update-flake options)))
|
(let ((flake-result (update-flake options)))
|
||||||
(if flake-result
|
(if flake-result
|
||||||
|
@ -127,29 +191,44 @@
|
||||||
(log-success "Flake update completed")
|
(log-success "Flake update completed")
|
||||||
(write-update-log "flake-update" "success" "")
|
(write-update-log "flake-update" "success" "")
|
||||||
|
|
||||||
;; Deploy configuration
|
;; Update machines in order
|
||||||
(log-info "Deploying updated configuration...")
|
(let ((update-results (map (lambda (machine)
|
||||||
(let ((deploy-result (deploy-machine machine-name "switch" options)))
|
(update-single-machine machine options))
|
||||||
(if deploy-result
|
update-order)))
|
||||||
(begin
|
|
||||||
(log-success "Configuration deployment completed")
|
(let* ((successful-updates (filter identity update-results))
|
||||||
(write-update-log "deployment" "success" "switch mode")
|
(failed-updates (- (length update-results) (length successful-updates)))
|
||||||
|
(all-success (= failed-updates 0)))
|
||||||
;; Schedule reboot if enabled
|
|
||||||
(if (and auto-reboot (not dry-run))
|
(log-info "Update summary: ~a successful, ~a failed"
|
||||||
(begin
|
(length successful-updates) failed-updates)
|
||||||
(log-info "Scheduling system reboot in 2 minutes...")
|
|
||||||
(write-update-log "reboot" "scheduled" "2 minutes")
|
(if all-success
|
||||||
(system "shutdown -r +2 'Auto-update completed - rebooting'")
|
(begin
|
||||||
#t)
|
(log-success "All machines updated successfully")
|
||||||
(begin
|
(write-update-log "orchestrated-update" "success"
|
||||||
(log-info "Auto-reboot disabled - update complete")
|
(format #f "~a machines" (length successful-updates)))
|
||||||
(write-update-log "auto-update" "completed" "no reboot")
|
|
||||||
#t)))
|
;; Schedule reboot of orchestrator if enabled and it was updated
|
||||||
(begin
|
(if (and auto-reboot (not dry-run)
|
||||||
(log-error "Configuration deployment failed")
|
(member current-machine
|
||||||
(write-update-log "deployment" "failed" "switch mode")
|
(map (lambda (m) (if (symbol? m) (symbol->string m) m))
|
||||||
#f))))
|
update-order)))
|
||||||
|
(begin
|
||||||
|
(log-info "Scheduling orchestrator reboot in 2 minutes...")
|
||||||
|
(write-update-log "reboot" "scheduled" "orchestrator - 2 minutes")
|
||||||
|
(system "shutdown -r +2 'Orchestrated auto-update completed - rebooting'")
|
||||||
|
#t)
|
||||||
|
(begin
|
||||||
|
(log-info "Orchestrated update complete - no reboot needed")
|
||||||
|
(write-update-log "orchestrated-update" "completed" "no reboot")
|
||||||
|
#t)))
|
||||||
|
(begin
|
||||||
|
(log-warn "Some machines failed to update (~a failures)" failed-updates)
|
||||||
|
(write-update-log "orchestrated-update" "partial-failure"
|
||||||
|
(format #f "~a failures" failed-updates))
|
||||||
|
;; Don't reboot orchestrator if there were failures
|
||||||
|
#f)))))
|
||||||
(begin
|
(begin
|
||||||
(log-error "Flake update failed")
|
(log-error "Flake update failed")
|
||||||
(write-update-log "flake-update" "failed" "")
|
(write-update-log "flake-update" "failed" "")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue