diff --git a/dotfiles/README.md b/dotfiles/README.md index 6309110..fa6f187 100644 --- a/dotfiles/README.md +++ b/dotfiles/README.md @@ -5,7 +5,9 @@ This directory contains per-user configurations and dotfiles for the Home-lab in ## Directory Organization ### `geir/` + Primary user configuration for geir: + - `user.nix` - NixOS user configuration (packages, groups, shell) - `dotfiles/` - Literate programming dotfiles using org-mode - `README.org` - Main literate configuration file @@ -14,7 +16,9 @@ Primary user configuration for geir: - `editors/` - Editor configurations (neovim, vscode) ### Future Users + Additional user directories will follow the same pattern: + - `admin/` - Administrative user for system management - `service/` - Service accounts for automation - `guest/` - Temporary/guest user configurations @@ -22,21 +26,27 @@ Additional user directories will follow the same pattern: ## User Configuration Philosophy ### NixOS Integration + Each user has a `user.nix` file that defines: + - User account settings (shell, groups, home directory) - User-specific packages - System-level user configurations - Integration with home lab services ### Literate Dotfiles + Each user's `dotfiles/README.org` serves as: + - Single source of truth for all user configurations - Self-documenting setup with rationale - Auto-tangling to generate actual dotfiles - Version-controlled configuration history ### Multi-Machine Consistency + User configurations are designed to work across machines: + - congenital-optimist: Full development environment - sleeper-service: Minimal server access - Future machines: Consistent user experience @@ -44,7 +54,9 @@ User configurations are designed to work across machines: ## Dotfiles Structure ### `dotfiles/README.org` + Main literate configuration file containing: + - Shell configuration (zsh, starship, aliases) - Editor configurations (emacs, neovim) - Development tool settings @@ -52,6 +64,7 @@ Main literate configuration file containing: - Machine-specific customizations ### Subdirectories + - `emacs/` - Generated Emacs configuration files - `shell/` - Generated shell configuration files - `editors/` - Generated editor configuration files @@ -59,6 +72,7 @@ Main literate configuration file containing: ## Usage Examples ### Importing User Configuration + ```nix # In machine configuration imports = [ @@ -67,12 +81,14 @@ imports = [ ``` ### Adding New User + 1. Create user directory: `users/newuser/` 2. Copy and adapt `user.nix` template 3. Create `dotfiles/README.org` with user-specific configs 4. Import in machine configurations as needed ### Tangling Dotfiles + ```bash # From user's dotfiles directory cd users/geir/dotfiles @@ -98,4 +114,4 @@ emacs --batch -l org --eval "(org-babel-tangle-file \"README.org\")" - **User Directories**: lowercase (e.g., `geir/`, `admin/`) - **Configuration Files**: descriptive names (e.g., `user.nix`, `README.org`) -- **Generated Files**: follow target application conventions \ No newline at end of file +- **Generated Files**: follow target application conventions diff --git a/modules/sound/disable-auto-rnnoise.nix b/modules/sound/disable-auto-rnnoise.nix new file mode 100644 index 0000000..1d908d3 --- /dev/null +++ b/modules/sound/disable-auto-rnnoise.nix @@ -0,0 +1,25 @@ +{ + config, + lib, + pkgs, + ... 
+}: { + # Optional configuration to disable automatic RNNoise filter + # This can be imported if the automatic filter causes distortion + + services.pipewire = { + extraConfig.pipewire."15-disable-auto-rnnoise" = { + "context.modules" = [ + # Commenting out the automatic RNNoise filter + # Users should use EasyEffects for manual noise suppression instead + # { + # name = "libpipewire-module-filter-chain"; + # args = { + # "node.description" = "Noise Canceling Source"; + # # ... rest of RNNoise config + # }; + # } + ]; + }; + }; +} diff --git a/modules/sound/pipewire.nix b/modules/sound/pipewire.nix index ea50a81..b72de34 100644 --- a/modules/sound/pipewire.nix +++ b/modules/sound/pipewire.nix @@ -24,8 +24,8 @@ "context.properties" = { "default.clock.rate" = 48000; "default.clock.quantum" = 1024; - "default.clock.min-quantum" = 32; - "default.clock.max-quantum" = 2048; + "default.clock.min-quantum" = 64; + "default.clock.max-quantum" = 8192; }; "context.modules" = [ @@ -40,10 +40,10 @@ type = "ladspa"; name = "rnnoise"; plugin = "${pkgs.rnnoise-plugin}/lib/ladspa/librnnoise_ladspa.so"; - label = "noise_suppressor_stereo"; + label = "noise_suppressor_mono"; control = { - "VAD Threshold (%)" = 50.0; - "VAD Grace Period (ms)" = 200; + "VAD Threshold (%)" = 95.0; + "VAD Grace Period (ms)" = 100; "Retroactive VAD Grace (ms)" = 0; }; } @@ -85,6 +85,9 @@ # Validation script (writeShellScriptBin "validate-audio" (builtins.readFile ./validate-audio.sh)) + # Troubleshoot script for voice distortion + (writeShellScriptBin "troubleshoot-voice-distortion" (builtins.readFile ./troubleshoot-voice-distortion.sh)) + # Optional: Professional audio tools # qjackctl # JACK control GUI (for JACK applications) # carla # Audio plugin host diff --git a/modules/sound/troubleshoot-voice-distortion.sh b/modules/sound/troubleshoot-voice-distortion.sh new file mode 100755 index 0000000..ccdedf7 --- /dev/null +++ b/modules/sound/troubleshoot-voice-distortion.sh @@ -0,0 +1,322 @@ +#!/usr/bin/env bash + +# Voice Distortion Troubleshoot Script +# This script helps diagnose and fix voice distortion issues in PipeWire + +# Use safer error handling - don't exit on all errors +set -uo pipefail + +echo "🎀 Voice Distortion Troubleshoot Tool" +echo "====================================" +echo "" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +success() { + echo -e "${GREEN}βœ… $1${NC}" +} + +warning() { + echo -e "${YELLOW}⚠️ $1${NC}" +} + +error() { + echo -e "${RED}❌ $1${NC}" +} + +info() { + echo -e "${BLUE}ℹ️ $1${NC}" +} + +highlight() { + echo -e "${CYAN}πŸ”§ $1${NC}" +} + +echo "Let's diagnose your voice distortion issue step by step..." +echo "" + +# 1. Check current audio settings +echo "1. 
Current Audio Configuration" +echo "==============================" + +if command -v wpctl >/dev/null 2>&1; then + echo "Default devices:" + wpctl status | head -20 + echo "" + + # Get default source + DEFAULT_SOURCE=$(wpctl inspect @DEFAULT_AUDIO_SOURCE@ 2>/dev/null | grep "node.name" | head -1 | sed 's/.*"\(.*\)".*/\1/' || echo "unknown") + info "Current default source: $DEFAULT_SOURCE" + + # Check sample rate + CURRENT_RATE=$(pw-metadata -n settings | grep "clock.rate" | awk '{print $3}' || echo "unknown") + info "Current sample rate: $CURRENT_RATE Hz" + + # Check buffer size + CURRENT_QUANTUM=$(pw-metadata -n settings | grep "clock.quantum" | awk '{print $3}' || echo "unknown") + info "Current buffer size: $CURRENT_QUANTUM samples" + +else + error "wpctl not available" +fi + +echo "" + +# 2. Check for common distortion causes +echo "2. Distortion Diagnosis" +echo "======================" + +# Check if using RNNoise filter +if command -v pw-dump >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then + if pw-dump 2>/dev/null | jq -r '.[] | select(.info.props."node.name" == "rnnoise_source")' 2>/dev/null | grep -q "rnnoise" 2>/dev/null; then + warning "You're using the RNNoise filter chain - this might be causing distortion" + echo " The automatic filter chain can sometimes cause artifacts" + else + info "Not using automatic RNNoise filter" + fi +else + warning "Cannot check RNNoise filter status (pw-dump or jq not available)" +fi + +# Check for high CPU usage +if command -v pw-top >/dev/null 2>&1; then + highlight "Checking PipeWire performance (5 seconds)..." + if timeout 5 pw-top --batch-mode 2>/dev/null | tail -10 2>/dev/null; then + info "Performance check completed" + else + warning "Could not check performance - pw-top failed" + fi +else + info "pw-top not available for performance checking" +fi + +# Check input levels +if command -v wpctl >/dev/null 2>&1; then + echo "" + echo "Current microphone volume levels:" + if wpctl get-volume @DEFAULT_AUDIO_SOURCE@ 2>/dev/null; then + info "Volume check completed" + else + warning "Could not get volume info - no default audio source?" + fi +else + warning "wpctl not available for volume checking" +fi + +echo "" + +# 3. Quick fixes +echo "3. Quick Fixes to Try" +echo "====================" +echo "" + +echo "Choose a solution to try:" +echo "" +echo "A) Disable automatic RNNoise filter (recommended first step)" +echo "B) Lower microphone input gain" +echo "C) Reduce buffer size for lower latency" +echo "D) Use EasyEffects instead of filter chain" +echo "E) Reset to safe audio settings" +echo "F) Test different sample rates" +echo "G) Monitor audio in real-time" +echo "H) All of the above (comprehensive fix)" +echo "" + +read -p "Enter your choice (A-H): " choice + +case $choice in + A|a) + echo "" + highlight "Disabling automatic RNNoise filter..." + if command -v pw-dump >/dev/null 2>&1 && command -v jq >/dev/null 2>&1 && command -v pw-cli >/dev/null 2>&1; then + # Find and remove RNNoise filter nodes + FILTER_IDS=$(pw-dump 2>/dev/null | jq -r '.[] | select(.info.props."node.name" == "rnnoise_source") | .id' 2>/dev/null || echo "") + if [ -n "$FILTER_IDS" ]; then + echo "$FILTER_IDS" | while read -r id; do + if [ -n "$id" ]; then + echo "Removing filter node $id" + pw-cli destroy "$id" 2>/dev/null || warning "Could not remove filter $id" + fi + done + success "RNNoise filter removal attempted" + else + info "No RNNoise filter found to remove" + fi + echo "Try speaking now. 
If distortion is gone, use EasyEffects for noise suppression instead." + else + warning "Required tools not available (pw-dump, jq, pw-cli)" + echo "Try manually: systemctl --user restart pipewire" + fi + ;; + + B|b) + echo "" + highlight "Lowering microphone input gain to 50%..." + wpctl set-volume @DEFAULT_AUDIO_SOURCE@ 50% + success "Microphone gain reduced to 50%" + echo "Test your voice now. Adjust further if needed with: wpctl set-volume @DEFAULT_AUDIO_SOURCE@ X%" + ;; + + C|c) + echo "" + highlight "Setting lower buffer size for reduced latency..." + pw-metadata -n settings 0 clock.force-quantum 512 + success "Buffer size set to 512 samples" + echo "This should reduce latency but may increase CPU usage" + ;; + + D|d) + echo "" + highlight "Launching EasyEffects for manual noise suppression..." + if command -v easyeffects >/dev/null 2>&1; then + easyeffects & + success "EasyEffects launched" + echo "" + echo "In EasyEffects:" + echo "1. Go to 'Input' tab" + echo "2. Add 'RNNoise' effect" + echo "3. Set 'VAD Threshold' to 95% (very conservative)" + echo "4. Set 'Wet' signal to 50-70% (not 100%)" + echo "5. Disable any other aggressive processing" + else + error "EasyEffects not available" + fi + ;; + + E|e) + echo "" + highlight "Resetting to safe audio settings..." + # Reset quantum + pw-metadata -n settings 0 clock.force-quantum 0 + # Reset rate + pw-metadata -n settings 0 clock.force-rate 0 + # Set reasonable volume + wpctl set-volume @DEFAULT_AUDIO_SOURCE@ 70% + # Restart audio services + systemctl --user restart pipewire pipewire-pulse wireplumber + success "Audio settings reset to defaults" + echo "Wait 5 seconds for services to restart, then test your voice" + ;; + + F|f) + echo "" + highlight "Testing different sample rates..." + echo "Current rate: $(pw-metadata -n settings | grep clock.rate | awk '{print $3}' || echo 'default')" + echo "" + echo "Trying 44100 Hz..." + pw-metadata -n settings 0 clock.force-rate 44100 + sleep 2 + echo "Test your voice now. Press Enter to continue..." + read + echo "Trying 48000 Hz..." + pw-metadata -n settings 0 clock.force-rate 48000 + sleep 2 + echo "Test your voice now. Press Enter to continue..." + read + echo "Back to automatic rate..." + pw-metadata -n settings 0 clock.force-rate 0 + success "Rate testing complete" + ;; + + G|g) + echo "" + highlight "Starting real-time audio monitoring..." + echo "Press Ctrl+C to stop monitoring" + echo "" + if command -v pw-top >/dev/null 2>&1; then + pw-top + else + echo "Monitoring with wpctl status (updating every 2 seconds):" + while true; do + clear + echo "=== PipeWire Status ===" + wpctl status + echo "" + echo "=== Microphone Volume ===" + wpctl get-volume @DEFAULT_AUDIO_SOURCE@ + echo "" + echo "Press Ctrl+C to stop" + sleep 2 + done + fi + ;; + + H|h) + echo "" + highlight "Running comprehensive fix..." + + # Step 1: Disable RNNoise filter + echo "1/6: Disabling automatic RNNoise filter..." + if command -v pw-dump >/dev/null 2>&1 && command -v jq >/dev/null 2>&1; then + FILTER_IDS=$(pw-dump 2>/dev/null | jq -r '.[] | select(.info.props."node.name" == "rnnoise_source") | .id' 2>/dev/null || echo "") + if [ -n "$FILTER_IDS" ]; then + echo "$FILTER_IDS" | while read -r id; do + if [ -n "$id" ]; then + pw-cli destroy "$id" 2>/dev/null || true + fi + done + fi + fi + + # Step 2: Reset audio settings + echo "2/6: Resetting audio settings..." 
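+        # Setting clock.force-quantum and clock.force-rate to 0 clears any forced
+        # overrides, so PipeWire falls back to the values from its configuration
+        # (48 kHz clock, dynamically sized quantum).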
+ pw-metadata -n settings 0 clock.force-quantum 0 2>/dev/null || true + pw-metadata -n settings 0 clock.force-rate 0 2>/dev/null || true + + # Step 3: Set conservative volume + echo "3/6: Setting conservative microphone gain..." + wpctl set-volume @DEFAULT_AUDIO_SOURCE@ 60% 2>/dev/null || warning "Could not set volume" + + # Step 4: Restart services + echo "4/6: Restarting audio services..." + systemctl --user restart pipewire pipewire-pulse wireplumber 2>/dev/null || warning "Could not restart services" + + # Step 5: Wait for restart + echo "5/6: Waiting for services to stabilize..." + sleep 5 + + # Step 6: Launch EasyEffects + echo "6/6: Launching EasyEffects for manual control..." + if command -v easyeffects >/dev/null 2>&1; then + easyeffects & + success "Comprehensive fix applied!" + echo "" + echo "Next steps:" + echo "1. Test your voice without any effects first" + echo "2. In EasyEffects, gradually add noise suppression:" + echo " - Start with RNNoise at 50% wet signal" + echo " - Use VAD threshold of 95% or higher" + echo " - Avoid aggressive compression or EQ" + echo "3. If still distorted, try lowering input gain further" + else + warning "EasyEffects not available for manual control" + fi + ;; + + *) + error "Invalid choice" + ;; +esac + +echo "" +echo "🎯 Additional Tips to Prevent Distortion:" +echo "=========================================" +echo "" +echo "β€’ Keep microphone gain below 80% to avoid clipping" +echo "β€’ Use RNNoise conservatively (50-70% wet signal, not 100%)" +echo "β€’ Check for background applications using audio" +echo "β€’ Ensure your microphone hardware supports 48kHz" +echo "β€’ Consider using a better quality microphone" +echo "β€’ Avoid stacking multiple noise reduction effects" +echo "" + +echo "Run this script again anytime with: troubleshoot-voice-distortion" +echo "" +echo "βœ… Script completed successfully!" +exit 0 diff --git a/research/netdata-home-lab-research.md b/research/netdata-home-lab-research.md new file mode 100644 index 0000000..2830f61 --- /dev/null +++ b/research/netdata-home-lab-research.md @@ -0,0 +1,607 @@ +# Netdata Research: Metrics Aggregation for Home Lab + +*Research conducted June 19, 2025* + +## Executive Summary + +Netdata is a highly viable metrics aggregation solution for your home lab infrastructure. It offers real-time monitoring with per-second granularity, minimal resource usage, and excellent scalability through its Parent-Child architecture. The recent addition of a beta MCP (Model Context Protocol) server makes it particularly interesting for integration with AI tooling and your existing workflow. + +## Key Advantages for Home Lab Use + +### 1. **Real-Time Monitoring Excellence** + +- **Per-second metrics collection** - True real-time visibility +- **1-second dashboard latency** - Instant feedback for troubleshooting +- **Zero sampling** - Complete data fidelity +- **800+ integrations** out of the box + +### 2. **Resource Efficiency** + +- **Most energy-efficient monitoring tool** according to University of Amsterdam study +- **40x better storage efficiency** compared to traditional solutions +- **22x faster responses** than alternatives +- **Uses only 15% of resources** compared to similar tools + +### 3. 
**Perfect Home Lab Architecture** + +- **Zero-configuration deployment** - Auto-discovers services +- **Distributed by design** - No centralized data collection required +- **Edge-based ML** - Anomaly detection runs locally on each node +- **Parent-Child streaming** - Centralize dashboards while keeping data local + +### 4. **Advanced Features** + +- **Built-in ML anomaly detection** - One model per metric, trained locally +- **Pre-configured alerts** - 400+ ready-to-use alert templates +- **Multiple notification channels** - Slack, Discord, email, PagerDuty, etc. +- **Export capabilities** - Prometheus, InfluxDB, Graphite integration + +## Architecture Options for Home Lab + +### Option 1: Standalone Deployment (Simple) + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Machine 1 β”‚ β”‚ Machine 2 β”‚ β”‚ Machine N β”‚ +β”‚ (Netdata β”‚ β”‚ (Netdata β”‚ β”‚ (Netdata β”‚ +β”‚ Agent) β”‚ β”‚ Agent) β”‚ β”‚ Agent) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Netdata Cloud β”‚ + β”‚ (Optional) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Benefits:** + +- Simple setup and maintenance +- Each node retains its own data +- No single point of failure +- Perfect for learning and small deployments + +### Option 2: Parent-Child Architecture (Recommended) + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Netdata Parent β”‚ + β”‚ (Central Hub) β”‚ + β”‚ - Dashboards β”‚ + β”‚ - Long retentionβ”‚ + β”‚ - Alerts β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Netdata Child β”‚ β”‚ Netdata Child β”‚ β”‚ Netdata Child β”‚ + β”‚ (NixOS VMs) β”‚ β”‚ (Containers) β”‚ β”‚ (IoT devices) β”‚ + β”‚ - Thin mode β”‚ β”‚ - Thin mode β”‚ β”‚ - Thin mode β”‚ + β”‚ - Local buffer β”‚ β”‚ - Local buffer β”‚ β”‚ - Local buffer β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Benefits:** + +- Centralized dashboards and alerting +- Extended retention on Parent node +- Reduced resource usage on Child nodes +- Better for production-like home lab setups + +### Option 3: High Availability Cluster (Advanced) + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ Netdata Parent 1│◄───►│ Netdata Parent 2β”‚ + β”‚ (Primary) β”‚ β”‚ (Backup) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ β”‚ β”‚ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” 
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚Child 1 β”‚ β”‚Child 2 β”‚ β”‚Child 3 β”‚ β”‚Child N β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +**Benefits:** + +- No single point of failure +- Automatic failover +- Load distribution +- Production-grade reliability + +## Integration with Your NixOS Infrastructure + +### NixOS Configuration + +```nix +# In your NixOS configuration.nix +{ + services.netdata = { + enable = true; + config = { + global = { + "default port" = "19999"; + "memory mode" = "ram"; # For children + # "memory mode" = "save"; # For parents + }; + + # For Parent nodes + streaming = { + enabled = "yes"; + "allow from" = "*"; + "default memory mode" = "ram"; + }; + + # For Child nodes + stream = { + enabled = "yes"; + destination = "parent.yourdomain.local"; + "api key" = "your-api-key"; + }; + }; + }; + + # Open firewall for Netdata + networking.firewall.allowedTCPPorts = [ 19999 ]; +} +``` + +### Deployment Strategy for Your Lab + +1. **Reverse Proxy** (grey-area): Netdata Parent + Nginx reverse proxy +2. **Sleeper Service** (NFS): Netdata Child with storage monitoring +3. **Congenital Optimist**: Netdata Child with system monitoring +4. **VM workloads**: Netdata Children in thin mode + +## MCP Server Integration (Beta Feature) + +Netdata recently introduced an **MCP (Model Context Protocol) server in beta**. This is particularly relevant for your AI-integrated workflow: + +### What It Offers + +- **AI-powered metric analysis** through standardized MCP interface +- **Integration with Claude, ChatGPT, and other LLMs** for intelligent monitoring +- **Natural language queries** about your infrastructure metrics +- **Automated root cause analysis** using AI reasoning +- **Contextual alerting** with AI-generated insights + +### Potential Use Cases + +```bash +# Example MCP interactions (conceptual) +"What's causing high CPU on sleeper-service?" +"Show me network anomalies from the last hour" +"Compare current metrics to last week's baseline" +"Generate a performance report for grey-area" +``` + +### Integration with Your Existing MCP Setup + +Since you're already using MCP servers (TaskMaster, Context7), adding Netdata's MCP server would create a powerful monitoring-AI pipeline: + +``` +Your Infrastructure β†’ Netdata β†’ MCP Server β†’ AI Analysis β†’ Insights +``` + +## Comparison with Alternatives + +### vs. Prometheus + Grafana + +| Feature | Netdata | Prometheus + Grafana | +|---------|---------|---------------------| +| Setup Complexity | Zero-config | Complex setup | +| Real-time Data | 1-second | 15-second minimum | +| Resource Usage | Very low | Higher | +| Built-in ML | Yes | No | +| Dashboards | Auto-generated | Manual creation | +| Storage Efficiency | 40x better | Standard | + +### vs. Zabbix + +| Feature | Netdata | Zabbix | +|---------|---------|---------| +| Agent Overhead | Minimal | Higher | +| Configuration | Auto-discovery | Manual setup | +| Scalability | Horizontal | Vertical | +| Modern UI | Yes | Traditional | +| Cloud Integration | Native | Limited | + +### vs. 
DataDog/Commercial SaaS + +| Feature | Netdata | Commercial SaaS | +|---------|---------|-----------------| +| Cost | Open Source | Expensive | +| Data Sovereignty | Local | Vendor-hosted | +| Customization | Full control | Limited | +| Lock-in Risk | None | High | + +## Implementation Roadmap + +### Phase 1: Basic Deployment (Week 1) + +1. Deploy Netdata Parent on **grey-area** +2. Install Netdata Children on main nodes +3. Configure basic streaming +4. Set up reverse proxy for external access + +### Phase 2: Integration (Week 2-3) + +1. Configure alerts and notifications +2. Set up Prometheus export for existing tools +3. Integrate with your existing monitoring stack +4. Configure retention policies + +### Phase 3: Advanced Features (Week 4+) + +1. Enable MCP server (beta) +2. Set up high availability if needed +3. Custom dashboard creation +4. Advanced alert tuning + +## Potential Challenges + +### 1. **Learning Curve** + +- New terminology (Parent/Child vs traditional) +- Different approach to metrics storage +- **Mitigation**: Excellent documentation and active community + +### 2. **Beta MCP Server** + +- Still in beta development +- Limited documentation +- **Mitigation**: Conservative adoption, wait for stability + +### 3. **Integration Complexity** + +- May need adaptation of existing monitoring workflows +- **Mitigation**: Gradual migration, parallel running during transition + +## Resource Requirements + +### Minimal Setup (Per Node) + +- **CPU**: 1-2% of a single core +- **RAM**: 20-100MB depending on metrics count +- **Disk**: 100MB for agent + retention data +- **Network**: Minimal bandwidth for streaming + +### Parent Node (Centralized) + +- **CPU**: 2-4 cores for 10-20 children +- **RAM**: 2-4GB for extended retention +- **Disk**: 10-50GB depending on retention period +- **Network**: Higher bandwidth for ingesting streams + +## Recommendations + +### For Your Home Lab: **Strong Yes** + +1. **Start with Parent-Child architecture** on grey-area as Parent +2. **Deploy gradually** - begin with critical nodes +3. **Integrate with existing Prometheus** via export +4. **Monitor MCP server development** for AI integration +5. **Consider as primary monitoring solution** due to superior efficiency + +### Specific Benefits for Your Use Case + +- **Perfect fit for NixOS** - declarative configuration +- **Complements your AI workflow** - MCP integration potential +- **Scales with lab growth** - from single nodes to complex topologies +- **Energy efficient** - important for home lab power consumption +- **Real-time visibility** - excellent for development and testing + +## Next Steps + +1. **Proof of Concept**: Deploy on grey-area as standalone +2. **Evaluate**: Run for 1-2 weeks alongside current monitoring +3. **Expand**: Add children nodes if satisfied +4. **Integrate**: Connect with existing toolchain +5. **MCP Beta**: Request early access to MCP server + +## Conclusion + +Netdata represents a modern, efficient approach to infrastructure monitoring that aligns well with your home lab's goals. Its combination of real-time capabilities, minimal resource usage, and emerging AI integration through MCP makes it an excellent choice for sophisticated home lab environments. The Parent-Child architecture provides enterprise-grade capabilities while maintaining the simplicity needed for home lab management. + +The addition of MCP server support positions Netdata at the forefront of AI-integrated monitoring, making it particularly appealing given your existing investment in MCP-based tooling. 
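+
+One practical note before the references: the Prometheus export called for in Phase 2 of the roadmap does not require a separate exporter. Every Netdata agent can serve its metrics in Prometheus text format through the same REST API that is covered in detail later in this document. A minimal sanity check against a local agent might look like the sketch below; it assumes the default port 19999 used throughout this document and relies on the `format=prometheus` option of the `allmetrics` endpoint.
+
+```bash
+# Fetch the current metrics in Prometheus exposition format and peek at the output.
+# The usual integration is a Prometheus scrape job pointed at /api/v1/allmetrics
+# with the query parameter format=prometheus on each node.
+curl -s "http://localhost:19999/api/v1/allmetrics?format=prometheus" | head -n 20
+```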
+ +## References + +- [Netdata GitHub Repository](https://github.com/netdata/netdata) +- [Netdata Documentation](https://learn.netdata.cloud/) +- [University of Amsterdam Energy Efficiency Study](https://www.ivanomalavolta.com/files/papers/ICSOC_2023.pdf) +- [Netdata vs Prometheus Comparison](https://www.netdata.cloud/blog/netdata-vs-prometheus-2025/) +- [Netdata MCP Server Documentation](https://github.com/netdata/netdata/blob/master/docs/mcp.md) (Beta) + +## Netdata API for Custom Web Dashboards + +Netdata provides a comprehensive REST API that makes it perfect for integrating with custom web dashboards. The API is exposed locally on each Netdata agent and can be used to fetch real-time metrics in various formats. + +### API Overview + +**Base URL**: `http://localhost:19999/api/v1/` + +**Primary Endpoints**: +- `/api/v1/data` - Query time-series data +- `/api/v1/charts` - Get available charts +- `/api/v1/allmetrics` - Get all metrics in shell-friendly format +- `/api/v1/badge.svg` - Generate SVG badges + +### Key API Features for Dashboard Integration + +1. **Multiple Output Formats** + - JSON (default) + - CSV + - TSV + - JSONP + - Plain text + - Shell variables + +2. **Real-Time Data Access** + - Per-second granularity + - Live streaming capabilities + - Historical data queries + +3. **Flexible Query Parameters** + - Time range selection + - Data grouping and aggregation + - Dimension filtering + - Custom sampling intervals + +### API Query Examples + +#### Basic Data Query +```bash +# Get CPU system data for the last 60 seconds +curl "http://localhost:19999/api/v1/data?chart=system.cpu&after=-60&dimensions=system" + +# Response format: +{ + "api": 1, + "id": "system.cpu", + "name": "system.cpu", + "update_every": 1, + "first_entry": 1640995200, + "last_entry": 1640995260, + "before": 1640995260, + "after": 1640995200, + "dimension_names": ["guest_nice", "guest", "steal", "softirq", "irq", "system", "user", "nice", "iowait"], + "dimension_ids": ["guest_nice", "guest", "steal", "softirq", "irq", "system", "user", "nice", "iowait"], + "latest_values": [0, 0, 0, 0.502513, 0, 2.512563, 5.025126, 0, 0.502513], + "view_update_every": 1, + "dimensions": 9, + "points": 61, + "format": "json", + "result": { + "data": [ + [1640995201, 0, 0, 0, 0.0025, 0, 0.0125, 0.025, 0, 0.0025], + [1640995202, 0, 0, 0, 0.005, 0, 0.0275, 0.0525, 0, 0.005] + // ... more data points + ] + } +} +``` + +#### Available Charts Discovery +```bash +# Get all available charts +curl "http://localhost:19999/api/v1/charts" + +# Returns JSON with all chart definitions including: +# - Chart IDs and names +# - Available dimensions +# - Update frequencies +# - Chart types and units +``` + +#### Memory Usage Example +```bash +# Get memory usage data with specific grouping +curl "http://localhost:19999/api/v1/data?chart=system.ram&after=-300&points=60&group=average" +``` + +#### Network Interface Metrics +```bash +# Get network traffic for specific interface +curl "http://localhost:19999/api/v1/data?chart=net.eth0&after=-60&dimensions=received,sent" +``` + +#### All Metrics in Shell Format +```bash +# Perfect for scripting and automation +curl "http://localhost:19999/api/v1/allmetrics" + +# Example output: +NETDATA_SYSTEM_CPU_USER=2.5 +NETDATA_SYSTEM_CPU_SYSTEM=1.2 +NETDATA_SYSTEM_RAM_USED=4096 +# ... 
all metrics as shell variables +``` + +### Advanced Query Parameters + +| Parameter | Description | Example | +|-----------|-------------|---------| +| `chart` | Chart ID to query | `system.cpu` | +| `after` | Start time (unix timestamp or relative) | `-60` (60 seconds ago) | +| `before` | End time (unix timestamp or relative) | `-30` (30 seconds ago) | +| `points` | Number of data points to return | `100` | +| `group` | Grouping method | `average`, `max`, `min`, `sum` | +| `gtime` | Group time in seconds | `60` (1-minute averages) | +| `dimensions` | Specific dimensions to include | `user,system,iowait` | +| `format` | Output format | `json`, `csv`, `jsonp` | +| `options` | Query options | `unaligned`, `percentage` | + +### Web Dashboard Integration Strategies + +#### 1. Direct AJAX Calls +```javascript +// Fetch CPU data for dashboard widget +fetch('http://localhost:19999/api/v1/data?chart=system.cpu&after=-60&points=60') + .then(response => response.json()) + .then(data => { + // Process data for chart library (Chart.js, D3, etc.) + updateCPUChart(data.result.data); + }); +``` + +#### 2. Server-Side Proxy +```javascript +// Proxy through your web server to avoid CORS issues +fetch('/api/netdata/system.cpu?after=-60') + .then(response => response.json()) + .then(data => updateWidget(data)); +``` + +#### 3. Real-Time Updates +```javascript +// Poll for updates every second +setInterval(() => { + fetch('http://localhost:19999/api/v1/data?chart=system.cpu&after=-1&points=1') + .then(response => response.json()) + .then(data => updateRealTimeMetrics(data)); +}, 1000); +``` + +### Custom Dashboard Implementation Example + +```html + + + + Home Lab Dashboard + + + +
+<!-- Reconstructed sketch: the original example markup was lost in formatting.   -->
+<!-- It assumes a local Netdata agent on port 19999; element IDs are illustrative. -->
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Home Lab Dashboard</title>
+</head>
+<body>
+    <h1>Home Lab Dashboard</h1>
+    <div id="cpu">CPU: loading...</div>
+    <div id="ram">RAM: loading...</div>
+
+    <script>
+        // Poll the local Netdata API once per second and show the latest values.
+        // Cross-origin requests are allowed by Netdata by default (see the
+        // Integration Considerations section below).
+        const base = 'http://localhost:19999/api/v1/data';
+        async function refresh() {
+            const cpu = await (await fetch(`${base}?chart=system.cpu&after=-1&points=1`)).json();
+            const ram = await (await fetch(`${base}?chart=system.ram&after=-1&points=1&dimensions=used`)).json();
+            // latest_values holds one value per dimension, in dimension_names order.
+            const cpuTotal = cpu.latest_values.reduce((a, b) => a + b, 0);
+            document.getElementById('cpu').textContent = 'CPU: ' + cpuTotal.toFixed(1) + ' %';
+            document.getElementById('ram').textContent = 'RAM used: ' + ram.latest_values[0].toFixed(0) + ' MiB';
+        }
+        refresh();
+        setInterval(refresh, 1000);
+    </script>
+</body>
+</html>
+ + + + +``` + +### Integration Considerations + +#### 1. **CORS Handling** +- Netdata allows cross-origin requests by default +- For production, consider proxying through your web server +- Use server-side API calls for sensitive environments + +#### 2. **Performance Optimization** +- Cache frequently accessed chart definitions +- Use appropriate `points` parameter to limit data transfer +- Implement efficient polling strategies +- Consider WebSocket connections for real-time updates + +#### 3. **Data Processing** +- Netdata returns timestamps and values as arrays +- Convert to your chart library's expected format +- Handle missing data points gracefully +- Implement data aggregation for longer time ranges + +#### 4. **Error Handling** +```javascript +async function safeNetdataFetch(endpoint) { + try { + const response = await fetch(endpoint); + if (!response.ok) throw new Error(`HTTP ${response.status}`); + return await response.json(); + } catch (error) { + console.error('Netdata API error:', error); + return null; + } +} +``` + +### Multi-Node Dashboard + +For Parent-Child deployments, you can create a unified dashboard: + +```javascript +class MultiNodeDashboard { + constructor(nodes) { + this.nodes = nodes; // [{ name: 'server1', url: 'http://server1:19999' }, ...] + } + + async fetchFromAllNodes(chart) { + const promises = this.nodes.map(async node => { + const data = await fetch(`${node.url}/api/v1/data?chart=${chart}&after=-60`); + return { node: node.name, data: await data.json() }; + }); + return Promise.all(promises); + } +} +``` + +### API Documentation Resources + +- **Swagger Documentation**: https://learn.netdata.cloud/api +- **OpenAPI Spec**: https://raw.githubusercontent.com/netdata/netdata/master/src/web/api/netdata-swagger.yaml +- **Query Documentation**: https://learn.netdata.cloud/docs/developer-and-contributor-corner/rest-api/queries/ + +### Conclusion + +Netdata's REST API provides excellent capabilities for custom web dashboard integration: + +βœ… **Real-time data access** with sub-second latency +βœ… **Multiple output formats** including JSON and CSV +βœ… **Flexible query parameters** for precise data selection +βœ… **No authentication required** for local access +βœ… **CORS-friendly** for web applications +βœ… **Well-documented** with OpenAPI specification + +The API is production-ready and provides all the data access patterns needed for sophisticated custom dashboards, making it an excellent choice for integrating Netdata metrics into your existing home lab web interfaces.