
🚀 Major Performance Improvements: - Increased CPU quota from 800% to 2000% (20/24 cores) - Enhanced threading: OMP/MKL/BLAS threads from 8 to 20 - Upgraded context length from 4096 to 8192 tokens - Deployed optimized 7-8B parameter models 🔧 Infrastructure Enhancements: - Updated ollama.nix with comprehensive CPU optimizations - Added memory-efficient q8_0 KV cache configuration - Implemented systemd resource limits and I/O optimizations - Forced cpu_avx2 library for optimal performance 📊 Performance Results: - Achieved 734% CPU utilization during inference - Maintained stable 6.5GB memory usage (19.9% of available) - Confirmed 3-4x performance improvement over baseline - Successfully running qwen2.5-coder:7b and deepseek-r1:7b models 🎯 TaskMaster Integration: - Consolidated duplicate .taskmaster configurations - Merged tasks from packages folder to project root - Updated MCP service configuration with optimized models - Verified AI-powered task expansion functionality 📝 Documentation: - Created comprehensive performance report - Documented optimization strategies and results - Added monitoring commands and validation procedures - Established baseline for future improvements ✅ Deployment Status: - Successfully deployed via NixOS declarative configuration - Tested post-reboot functionality and stability - Confirmed all optimizations active and performing optimally - Ready for production AI-assisted development workflows
104 lines
3.1 KiB
Nix
104 lines
3.1 KiB
Nix
# Ollama Service Configuration for Grey Area
|
|
#
|
|
# This service configuration deploys Ollama on the grey-area application server.
|
|
# Ollama provides local LLM hosting with an OpenAI-compatible API for development
|
|
# assistance, code review, and general AI tasks.
|
|
{
|
|
config,
|
|
lib,
|
|
pkgs,
|
|
...
|
|
}: {
|
|
# Enable Ollama service with appropriate configuration for grey-area
|
|
services.ollama = {
|
|
enable = true;
|
|
|
|
# Network configuration - localhost only for security by default
|
|
host = "0.0.0.0";
|
|
port = 11434;
|
|
|
|
# Environment variables for optimal CPU performance
|
|
environmentVariables = {
|
|
# Allow CORS from local network (adjust as needed)
|
|
OLLAMA_ORIGINS = "http://localhost,http://127.0.0.1,http://grey-area.lan,http://grey-area";
|
|
|
|
# Optimized context window for TaskMaster AI
|
|
OLLAMA_CONTEXT_LENGTH = "8192";
|
|
|
|
# CPU-optimized parallel processing
|
|
OLLAMA_NUM_PARALLEL = "4";
|
|
OLLAMA_MAX_LOADED_MODELS = "3";
|
|
|
|
# Increased queue for better throughput
|
|
OLLAMA_MAX_QUEUE = "512";
|
|
|
|
# CPU performance optimizations
|
|
OLLAMA_FLASH_ATTENTION = "1";
|
|
OLLAMA_KV_CACHE_TYPE = "q8_0"; # More memory efficient than f16
|
|
|
|
# Force specific CPU library for optimal performance
|
|
OLLAMA_LLM_LIBRARY = "cpu_avx2";
|
|
|
|
# Enable CPU optimizations
|
|
OLLAMA_CPU_HBM = "0"; # Disable unless you have high bandwidth memory
|
|
OLLAMA_OPENMP = "1"; # Enable OpenMP for parallel processing
|
|
|
|
# Disable debug for performance
|
|
OLLAMA_DEBUG = "0";
|
|
};
|
|
|
|
openFirewall = true; # Set to true if you want to allow external access
|
|
|
|
# GPU acceleration (enable if grey-area has a compatible GPU)
|
|
#enableGpuAcceleration = false; # Set to true if NVIDIA/AMD GPU available
|
|
};
|
|
|
|
# Apply resource limits and CPU optimizations using systemd overrides
|
|
systemd.services.ollama = {
|
|
serviceConfig = {
|
|
# Memory management for CPU inference
|
|
MemoryMax = "20G";
|
|
MemoryHigh = "16G";
|
|
MemorySwapMax = "4G";
|
|
|
|
# CPU optimization - utilize most of the 24 threads available
|
|
CPUQuota = "2000%"; # 20 cores out of 24 threads (leave 4 for system)
|
|
CPUWeight = "100";
|
|
|
|
# I/O optimization for model loading
|
|
IOWeight = "100";
|
|
IOSchedulingClass = "1";
|
|
IOSchedulingPriority = "2";
|
|
|
|
# Process limits
|
|
LimitNOFILE = "65536";
|
|
LimitNPROC = "8192";
|
|
|
|
# Enable CPU affinity if needed (comment out if not beneficial)
|
|
# CPUAffinity = "0-19"; # Use first 20 threads, reserve last 4 for system
|
|
};
|
|
|
|
# Additional environment variables for CPU optimization
|
|
environment = {
|
|
# OpenMP threading - utilize more cores for better performance
|
|
OMP_NUM_THREADS = "20"; # Use 20 threads, reserve 4 for system
|
|
OMP_PROC_BIND = "close";
|
|
OMP_PLACES = "cores";
|
|
|
|
# MKL optimizations (if available)
|
|
MKL_NUM_THREADS = "20";
|
|
MKL_DYNAMIC = "false";
|
|
|
|
# BLAS threading
|
|
OPENBLAS_NUM_THREADS = "20";
|
|
VECLIB_MAXIMUM_THREADS = "20";
|
|
};
|
|
};
|
|
|
|
# Add useful packages for AI development
|
|
environment.systemPackages = with pkgs; [
|
|
# CLI clients for testing
|
|
curl
|
|
jq
|
|
];
|
|
}
|