Based on claude-code-tools TmuxCLIController, this refactor:
- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
98 lines
3.1 KiB
Python
Executable File
98 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Pre-dispatch capacity checking system.
|
|
Prevents OOM by validating system resources before launching new agents.
|
|
"""
|
|
|
|
import json
|
|
import subprocess
|
|
from pathlib import Path
|
|
from dataclasses import dataclass
|
|
|
|
@dataclass
class SystemCapacity:
    """Snapshot of system resources used to gate the launch of a new agent."""

    memory_available_mb: int  # available RAM, in MiB
    swap_available_mb: int  # free swap, in MiB
    memory_percent_used: int  # RAM usage as a whole-number percentage
    swap_percent_used: int  # swap usage as a whole-number percentage
    load_1m: float  # load average over the last 1 minute
    load_5m: float  # load average over the last 5 minutes
    load_15m: float  # load average over the last 15 minutes
    active_agents: int  # number of agents currently counted as running

    def can_dispatch(
        self,
        min_memory_mb=500,
        max_memory_percent=85,
        max_swap_percent=90,
        max_agents=4,
        cpu_count=None,
    ):
        """Check if the system can safely dispatch a new agent.

        Args:
            min_memory_mb: Minimum available memory (MiB) required.
            max_memory_percent: Highest acceptable memory usage percentage.
            max_swap_percent: Highest acceptable swap usage percentage.
            max_agents: Dispatch is refused once this many agents are active.
            cpu_count: Number of CPUs used to scale the load threshold.
                When None, the machine's CPU count is detected; if that
                cannot be determined, 4 is assumed.

        Returns:
            A ``(ok, checks)`` tuple: ``ok`` is True only when every check
            passed, and ``checks`` maps each check name to its boolean result.
        """
        if cpu_count is None:
            import os

            # os.cpu_count() may return None on exotic platforms; keep the
            # historical assumption of 4 CPUs in that case.
            cpu_count = os.cpu_count() or 4

        # The 1-minute load must stay below 80% of the CPU count.
        load_limit = cpu_count * 0.8

        checks = {
            "sufficient_memory": self.memory_available_mb >= min_memory_mb,
            "memory_not_swapping": self.memory_percent_used <= max_memory_percent,
            "swap_healthy": self.swap_percent_used <= max_swap_percent,
            "capacity_available": self.active_agents < max_agents,
            "load_reasonable": self.load_1m < load_limit,
        }

        return all(checks.values()), checks
|
|
|
|
def _pid_is_alive(pid):
    """Return True if a process with the given PID currently exists."""
    import os

    if pid <= 0:
        # Non-positive PIDs address process groups in kill(), not one agent.
        return False
    try:
        os.kill(pid, 0)  # signal 0 only checks existence, sends nothing
    except ProcessLookupError:
        return False
    except PermissionError:
        # The process exists but is owned by another user.
        return True
    except (OSError, OverflowError):
        return False
    return True


def _count_active_agents(jobs_dir):
    """Count job directories whose meta.json says "running" and whose PID is alive."""
    try:
        job_dirs = [entry for entry in jobs_dir.iterdir() if entry.is_dir()]
    except OSError:
        # Missing or unreadable jobs directory: no agent can be confirmed.
        return 0

    active = 0
    for job_dir in job_dirs:
        try:
            with open(job_dir / "meta.json") as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # Absent, unreadable or malformed metadata: skip this job.
            continue
        if not isinstance(meta, dict) or meta.get("status") != "running":
            continue

        try:
            pid = int((job_dir / "pid").read_text().strip())
        except (OSError, ValueError):
            # No pid file, or its content is not an integer.
            continue
        if _pid_is_alive(pid):
            active += 1
    return active


def get_system_capacity(jobs_dir="/var/log/luz-orchestrator/jobs"):
    """Gather current system capacity metrics.

    Args:
        jobs_dir: Directory holding one sub-directory per job, each with a
            ``meta.json`` and a ``pid`` file. A job counts as an active agent
            when its metadata status is "running" and its PID is alive.

    Returns:
        A SystemCapacity snapshot with memory/swap figures in MiB and whole
        percentages, the 1/5/15-minute load averages and the active agent
        count.
    """
    import psutil

    # Memory metrics
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    # CPU load averages (1, 5 and 15 minutes)
    load_1m, load_5m, load_15m = psutil.getloadavg()

    mib = 1024 * 1024
    return SystemCapacity(
        memory_available_mb=int(mem.available / mib),
        swap_available_mb=int(swap.free / mib),
        memory_percent_used=int(mem.percent),
        swap_percent_used=int(swap.percent),
        load_1m=load_1m,
        load_5m=load_5m,
        load_15m=load_15m,
        active_agents=_count_active_agents(Path(jobs_dir)),
    )
|
|
|
|
def check_dispatch_safety():
    """Run the pre-dispatch check and package the outcome as a plain dict.

    Returns:
        A dict with three keys: "can_dispatch" (overall verdict),
        "capacity" (the capacity snapshot's attributes) and "checks"
        (the result of each individual check).
    """
    snapshot = get_system_capacity()
    verdict, per_check = snapshot.can_dispatch()

    report = {}
    report["can_dispatch"] = verdict
    report["capacity"] = vars(snapshot)
    report["checks"] = per_check
    return report
|
|
|
|
if __name__ == "__main__":
    import sys

    # Print the full report as JSON; the exit status mirrors the verdict
    # (0 when dispatch is allowed, 1 otherwise).
    report = check_dispatch_safety()
    print(json.dumps(report, indent=2))
    exit_code = 0 if report["can_dispatch"] else 1
    sys.exit(exit_code)
|