#!/usr/bin/env python3 """ Pre-dispatch capacity checking system. Prevents OOM by validating system resources before launching new agents. """ import json import subprocess from pathlib import Path from dataclasses import dataclass @dataclass class SystemCapacity: """System resource status.""" memory_available_mb: int swap_available_mb: int memory_percent_used: int swap_percent_used: int load_1m: float load_5m: float load_15m: float active_agents: int def can_dispatch(self, min_memory_mb=500, max_memory_percent=85, max_swap_percent=90, max_agents=4): """Check if system can safely dispatch a new agent.""" checks = { "sufficient_memory": self.memory_available_mb >= min_memory_mb, "memory_not_swapping": self.memory_percent_used <= max_memory_percent, "swap_healthy": self.swap_percent_used <= max_swap_percent, "capacity_available": self.active_agents < max_agents, "load_reasonable": self.load_1m < (4 * 0.8), # 80% of CPU count } return all(checks.values()), checks def get_system_capacity(): """Gather current system capacity metrics.""" import psutil # Memory metrics mem = psutil.virtual_memory() swap = psutil.swap_memory() # CPU metrics cpu_count = psutil.cpu_count() load_avg = psutil.getloadavg() # Count active agents (running jobs) jobs_dir = Path("/var/log/luz-orchestrator/jobs") active_agents = 0 for job_dir in jobs_dir.iterdir(): if job_dir.is_dir(): meta_file = job_dir / "meta.json" if meta_file.exists(): try: with open(meta_file) as f: meta = json.load(f) if meta.get("status") == "running": pid_file = job_dir / "pid" if pid_file.exists(): try: pid = int(pid_file.read_text().strip()) import os os.kill(pid, 0) # Check if alive active_agents += 1 except: pass except: pass return SystemCapacity( memory_available_mb=int(mem.available / 1024 / 1024), swap_available_mb=int(swap.free / 1024 / 1024), memory_percent_used=int(mem.percent), swap_percent_used=int(swap.percent), load_1m=load_avg[0], load_5m=load_avg[1], load_15m=load_avg[2], active_agents=active_agents, ) def check_dispatch_safety(): """Pre-dispatch safety check.""" capacity = get_system_capacity() can_dispatch, checks = capacity.can_dispatch() return { "can_dispatch": can_dispatch, "capacity": capacity.__dict__, "checks": checks, } if __name__ == "__main__": import sys result = check_dispatch_safety() print(json.dumps(result, indent=2)) sys.exit(0 if result["can_dispatch"] else 1)