Files
luzia/lib/capacity_checker.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00

98 lines
3.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Pre-dispatch capacity checking system.
Prevents OOM by validating system resources before launching new agents.
"""
import json
import subprocess
from pathlib import Path
from dataclasses import dataclass
@dataclass
class SystemCapacity:
    """Point-in-time snapshot of the resources that gate agent dispatch."""

    # Memory still available to new processes, in megabytes.
    memory_available_mb: int
    # Free swap space, in megabytes.
    swap_available_mb: int
    # Share of physical memory in use, as a whole-number percentage.
    memory_percent_used: int
    # Share of swap in use, as a whole-number percentage.
    swap_percent_used: int
    # System load averages over the last 1, 5 and 15 minutes.
    load_1m: float
    load_5m: float
    load_15m: float
    # Number of agents counted as currently running.
    active_agents: int

    def can_dispatch(self, min_memory_mb=500, max_memory_percent=85,
                     max_swap_percent=90, max_agents=4, cpu_count=None):
        """Check if system can safely dispatch a new agent.

        Args:
            min_memory_mb: Minimum available memory (MB) required.
            max_memory_percent: Highest tolerated memory usage percentage.
            max_swap_percent: Highest tolerated swap usage percentage.
            max_agents: Dispatch is refused once this many agents are active.
            cpu_count: Number of CPUs used to scale the load threshold. When
                None, it is detected with ``os.cpu_count()``; if detection
                fails, 4 is used (the value that was previously hard-coded).

        Returns:
            A ``(ok, checks)`` tuple. ``checks`` maps each check name to a
            bool, and ``ok`` is True only when every check passed.
        """
        if cpu_count is None:
            import os

            # os.cpu_count() may return None when the count is undeterminable.
            cpu_count = os.cpu_count() or 4

        checks = {
            "sufficient_memory": self.memory_available_mb >= min_memory_mb,
            "memory_not_swapping": self.memory_percent_used <= max_memory_percent,
            "swap_healthy": self.swap_percent_used <= max_swap_percent,
            "capacity_available": self.active_agents < max_agents,
            # The 1-minute load must stay below 80% of the CPU count.
            "load_reasonable": self.load_1m < (cpu_count * 0.8),
        }
        return all(checks.values()), checks
def _pid_is_alive(pid):
    """Return True if a process with the given PID currently exists."""
    import os

    # PIDs <= 0 address process groups (or every process) in kill(2), so a
    # successful probe would not identify one specific agent process.
    if pid <= 0:
        return False
    try:
        os.kill(pid, 0)  # Signal 0 performs only the existence/permission check.
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but belongs to another user; it still counts.
        return True
    except (OSError, OverflowError):
        # Any other failure (including a PID too large for the platform) is
        # treated as "not alive".
        return False
    return True


def _count_active_agents(jobs_dir):
    """Count jobs under jobs_dir marked "running" whose recorded PID is alive."""
    try:
        job_dirs = [entry for entry in jobs_dir.iterdir() if entry.is_dir()]
    except FileNotFoundError:
        # No jobs directory yet means no job has been recorded.
        return 0

    active = 0
    for job_dir in job_dirs:
        try:
            meta = json.loads((job_dir / "meta.json").read_text())
            if not isinstance(meta, dict) or meta.get("status") != "running":
                continue
            pid = int((job_dir / "pid").read_text().strip())
        except (OSError, ValueError):
            # Missing, unreadable or malformed job files are skipped on
            # purpose: counting is best-effort and must not abort the check.
            continue
        if _pid_is_alive(pid):
            active += 1
    return active


def get_system_capacity():
    """Gather current system capacity metrics.

    Reads memory, swap and load-average figures through psutil and counts
    the jobs in /var/log/luz-orchestrator/jobs whose meta.json reports
    status "running" and whose pid file names a live process.

    Returns:
        A SystemCapacity snapshot. Memory and swap sizes are converted from
        bytes to megabytes and percentages are truncated to integers.
    """
    # Imported here, as in the rest of this module, so that importing the
    # module does not itself require psutil.
    import psutil

    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()
    load_1m, load_5m, load_15m = psutil.getloadavg()

    active_agents = _count_active_agents(Path("/var/log/luz-orchestrator/jobs"))

    return SystemCapacity(
        memory_available_mb=int(mem.available / 1024 / 1024),
        swap_available_mb=int(swap.free / 1024 / 1024),
        memory_percent_used=int(mem.percent),
        swap_percent_used=int(swap.percent),
        load_1m=load_1m,
        load_5m=load_5m,
        load_15m=load_15m,
        active_agents=active_agents,
    )
def check_dispatch_safety():
    """Run the pre-dispatch gate and summarise the outcome as a plain dict.

    Returns:
        A dict with three keys: "can_dispatch" (overall verdict),
        "capacity" (the attribute dict of the capacity snapshot) and
        "checks" (the individual check results from can_dispatch()).
    """
    snapshot = get_system_capacity()
    verdict, per_check = snapshot.can_dispatch()

    report = {}
    report["can_dispatch"] = verdict
    # vars() hands back the instance's own __dict__.
    report["capacity"] = vars(snapshot)
    report["checks"] = per_check
    return report
if __name__ == "__main__":
    import sys

    # Print the safety report as JSON. The exit status mirrors the verdict:
    # 0 when dispatch is allowed, 1 when it is not.
    outcome = check_dispatch_safety()
    print(json.dumps(outcome, indent=2))
    if outcome["can_dispatch"]:
        sys.exit(0)
    sys.exit(1)