Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:
- Adds a DockerTmuxController class for robust tmux session management
- Implements send_keys() with a configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
97
lib/capacity_checker.py
Executable file
97
lib/capacity_checker.py
Executable file
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Pre-dispatch capacity checking system.
|
||||
Prevents OOM by validating system resources before launching new agents.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class SystemCapacity:
    """Snapshot of system resource status used to gate agent dispatch."""

    memory_available_mb: int   # available physical memory, in megabytes
    swap_available_mb: int     # free swap space, in megabytes
    memory_percent_used: int   # physical memory in use, as a percentage
    swap_percent_used: int     # swap in use, as a percentage
    load_1m: float             # 1-minute load average
    load_5m: float             # 5-minute load average
    load_15m: float            # 15-minute load average
    active_agents: int         # number of agents currently counted as running

    def can_dispatch(self, min_memory_mb=500, max_memory_percent=85,
                     max_swap_percent=90, max_agents=4, cpu_count=4,
                     max_load_per_cpu=0.8):
        """Check if the system can safely dispatch a new agent.

        Args:
            min_memory_mb: Minimum available memory required (inclusive).
            max_memory_percent: Highest acceptable memory usage (inclusive).
            max_swap_percent: Highest acceptable swap usage (inclusive).
            max_agents: Agent limit; dispatch needs ``active_agents`` to be
                strictly below it.
            cpu_count: Number of CPUs used to scale the load threshold.
                Defaults to 4, the value that was previously hard-coded.
            max_load_per_cpu: Allowed 1-minute load per CPU. The load must be
                strictly below ``cpu_count * max_load_per_cpu``.

        Returns:
            A ``(ok, checks)`` tuple: ``ok`` is True only when every check
            passed, and ``checks`` maps each check name to its boolean result.
        """
        load_limit = cpu_count * max_load_per_cpu
        checks = {
            "sufficient_memory": self.memory_available_mb >= min_memory_mb,
            # Key name is historical: this compares memory usage, not swap.
            "memory_not_swapping": self.memory_percent_used <= max_memory_percent,
            "swap_healthy": self.swap_percent_used <= max_swap_percent,
            "capacity_available": self.active_agents < max_agents,
            "load_reasonable": self.load_1m < load_limit,
        }

        return all(checks.values()), checks
|
||||
|
||||
def get_system_capacity():
    """Gather current system capacity metrics.

    Reads memory, swap and load averages through psutil, then counts active
    agents by scanning the job directories under
    ``/var/log/luz-orchestrator/jobs``. A job counts as active when its
    ``meta.json`` has ``"status": "running"`` and the PID stored in its
    ``pid`` file refers to an existing process.

    Unreadable or malformed job entries are skipped, and a missing jobs
    directory is treated as zero active agents.

    Returns:
        A SystemCapacity populated with the current metrics.
    """
    import os

    import psutil

    # Memory metrics
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    # CPU metrics
    load_1m, load_5m, load_15m = psutil.getloadavg()

    # Count active agents (running jobs)
    jobs_dir = Path("/var/log/luz-orchestrator/jobs")
    try:
        job_dirs = [entry for entry in jobs_dir.iterdir() if entry.is_dir()]
    except FileNotFoundError:
        # No jobs directory yet means no job has been recorded.
        job_dirs = []

    active_agents = 0
    for job_dir in job_dirs:
        try:
            with open(job_dir / "meta.json") as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or invalid metadata: not a countable job.
            continue
        if not isinstance(meta, dict) or meta.get("status") != "running":
            continue

        try:
            pid = int((job_dir / "pid").read_text().strip())
        except (OSError, ValueError):
            continue
        if pid <= 0:
            # kill() with pid <= 0 targets process groups, not one process.
            continue

        try:
            os.kill(pid, 0)  # Signal 0 only probes for existence
        except PermissionError:
            # The process exists but belongs to another user: still alive.
            pass
        except (OSError, OverflowError):
            # No such process, or a PID outside the platform's range.
            continue
        active_agents += 1

    return SystemCapacity(
        memory_available_mb=int(mem.available / 1024 / 1024),
        swap_available_mb=int(swap.free / 1024 / 1024),
        memory_percent_used=int(mem.percent),
        swap_percent_used=int(swap.percent),
        load_1m=load_1m,
        load_5m=load_5m,
        load_15m=load_15m,
        active_agents=active_agents,
    )
|
||||
|
||||
def check_dispatch_safety():
    """Run the pre-dispatch safety check and package the outcome.

    Returns a dict with three entries: ``can_dispatch`` (overall verdict),
    ``capacity`` (the attribute dict of the gathered capacity snapshot) and
    ``checks`` (the per-check results).
    """
    snapshot = get_system_capacity()
    verdict, per_check = snapshot.can_dispatch()

    report = dict(
        can_dispatch=verdict,
        capacity=vars(snapshot),
        checks=per_check,
    )
    return report
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Print the safety report as JSON; the exit status mirrors the verdict
    # (0 when dispatch is allowed, 1 otherwise).
    report = check_dispatch_safety()
    print(json.dumps(report, indent=2))
    exit_code = 0 if report["can_dispatch"] else 1
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user