Based on claude-code-tools TmuxCLIController, this refactor:
- Adds a DockerTmuxController class for robust tmux session management
- Implements send_keys() with a configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
141 lines
4.4 KiB
Python
Executable File
141 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Emergency OOM recovery procedures.
|
|
Identifies and safely kills stuck processes, cleans up resources.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
|
|
def _read_proc_state(status_path):
    """Return the one-letter state code from a /proc-style status file, or None."""
    try:
        with open(status_path) as f:
            for line in f:
                if line.startswith('State:'):
                    fields = line.split()
                    # The value looks like "D (disk sleep)"; keep only the code letter.
                    return fields[1][:1] if len(fields) > 1 else None
    except OSError:
        # The process exited between `ps` and this read, or is not readable.
        return None
    return None


def get_stuck_processes(*, ps_output=None, proc_root='/proc'):
    """Identify stuck Claude processes.

    Scans ``ps aux`` output for lines that mention ``claude`` (lines that
    mention ``grep`` are ignored) and reads ``<proc_root>/<pid>/status`` for
    each candidate. A process is reported when its ``State:`` line says
    ``D`` (uninterruptible sleep) or ``Z`` (zombie).

    Args:
        ps_output: Optional pre-captured ``ps aux`` text. When ``None`` (the
            default) the command is executed.
        proc_root: Directory laid out like ``/proc``; defaults to ``/proc``.

    Returns:
        A list of dicts with the keys ``'pid'`` (int), ``'type'``
        (``'uninterruptible_sleep'`` or ``'zombie'``) and ``'user'`` (first
        column of the ``ps`` line). The list is empty when ``ps`` cannot be
        run.
    """
    if ps_output is None:
        try:
            ps_output = subprocess.run(
                ['ps', 'aux'], capture_output=True, text=True
            ).stdout
        except (OSError, subprocess.SubprocessError):
            # Best effort: without a process table there is nothing to report.
            return []

    # Classify from the State line only; searching the whole status text for
    # "D" would also hit unrelated fields such as "FDSize".
    state_labels = {'D': 'uninterruptible_sleep', 'Z': 'zombie'}

    stuck = []
    for line in ps_output.splitlines():
        if 'claude' not in line or 'grep' in line:
            continue

        parts = line.split()
        if len(parts) < 2:
            continue

        try:
            pid = int(parts[1])
        except ValueError:
            # Not a process row (e.g. a header); skip it instead of aborting the scan.
            continue

        state = _read_proc_state(os.path.join(proc_root, str(pid), 'status'))
        label = state_labels.get(state)
        if label is not None:
            stuck.append({
                'pid': pid,
                'type': label,
                'user': parts[0],
            })

    return stuck
|
|
|
|
def identify_zombie_jobs(jobs_dir=None):
    """Find jobs with dead processes still marked as running.

    Each sub-directory of the jobs directory is inspected. A job is reported
    when its ``meta.json`` has ``"status": "running"``, a ``pid`` file exists
    with a positive integer, and probing that PID with signal 0 raises
    ``ProcessLookupError`` (no such process).

    Args:
        jobs_dir: Directory that holds one sub-directory per job. Defaults to
            ``/var/log/luz-orchestrator/jobs``.

    Returns:
        A list of dicts with the keys ``'job_id'`` (directory name),
        ``'project'``, ``'pid'`` and ``'started'``, ordered by job directory.
        Jobs whose files are missing, unreadable or malformed are skipped.
        The list is empty when the jobs directory itself cannot be listed.
    """
    if jobs_dir is None:
        root = Path("/var/log/luz-orchestrator/jobs")
    else:
        root = Path(jobs_dir)

    zombies = []

    try:
        job_dirs = sorted(entry for entry in root.iterdir() if entry.is_dir())
    except OSError:
        # Missing or unreadable jobs directory: nothing can be inspected.
        return zombies

    for job_dir in job_dirs:
        meta_file = job_dir / "meta.json"
        pid_file = job_dir / "pid"

        try:
            with open(meta_file) as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # No metadata, unreadable file, or invalid JSON.
            continue

        if not isinstance(meta, dict) or meta.get("status") != "running":
            continue

        try:
            pid = int(pid_file.read_text().strip())
        except (OSError, ValueError):
            # No pid file, or its content is not an integer.
            continue

        if pid <= 0:
            # kill(2) treats 0 and negative values as process groups, not a
            # single process, so they cannot identify a job's process.
            continue

        try:
            os.kill(pid, 0)  # Signal 0 = existence check only, nothing is delivered
        except ProcessLookupError:
            zombies.append({
                'job_id': job_dir.name,
                'project': meta.get('project', 'unknown'),
                'pid': pid,
                'started': meta.get('started', 'unknown'),
            })
        except (OSError, OverflowError):
            # PermissionError means the process exists but is owned by someone
            # else; OverflowError means the value does not fit a pid_t.
            continue

    return zombies
|
|
|
|
def clean_swap_cache():
    """Flush dirty pages and ask the kernel to drop its caches (requires root).

    Runs ``sync`` followed by ``sysctl -w vm.drop_caches=3``. Per the kernel
    documentation, that setting releases the page cache plus dentries and
    inodes; it does not empty swap, despite this function's name.

    Returns:
        True when both commands ran and ``sysctl`` exited with status 0,
        False otherwise (command missing, ``sync`` failed, or ``sysctl``
        was refused, e.g. when not running as root).
    """
    try:
        subprocess.run(['sync'], check=True)
        outcome = subprocess.run(
            ['sysctl', '-w', 'vm.drop_caches=3'], check=False
        )
    except (OSError, subprocess.SubprocessError):
        return False

    # Report the real result instead of claiming success unconditionally.
    return outcome.returncode == 0
|
|
|
|
def emergency_kill_zombies(dry_run=True):
    """Mark zombie jobs as failed and report what was done.

    Zombie jobs come from ``identify_zombie_jobs()``. No signal is sent by
    this function; when ``dry_run`` is false it only rewrites each job's
    ``meta.json`` to record the failure.

    Args:
        dry_run: When True (the default) nothing is written and every action
            is reported with status ``'skipped'``.

    Returns:
        A dict with the keys ``'timestamp'`` (ISO-8601 string), ``'dry_run'``,
        ``'zombies_found'`` and ``'actions'``. Each action carries
        ``'job_id'``, ``'project'`` and ``'status'``: ``'skipped'`` for a dry
        run, ``'updated_metadata'`` after a successful rewrite, or
        ``'failed'`` together with an ``'error'`` message when the rewrite
        raised.
    """
    zombies = identify_zombie_jobs()

    report = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'zombies_found': len(zombies),
        'actions': [],
    }

    for zombie in zombies:
        action = {
            'job_id': zombie['job_id'],
            'project': zombie['project'],
            'status': 'skipped' if dry_run else 'killed',
        }

        if not dry_run:
            try:
                # Update job meta to reflect kill
                job_dir = Path(f"/var/log/luz-orchestrator/jobs/{zombie['job_id']}")
                meta_file = job_dir / "meta.json"

                with open(meta_file) as f:
                    meta = json.load(f)

                meta['status'] = 'failed'
                meta['exit_code'] = 137  # 128 + 9, the conventional exit code for SIGKILL
                meta['killed_by_emergency_recovery'] = True
                meta['recovery_timestamp'] = datetime.now().isoformat()

                with open(meta_file, 'w') as f:
                    json.dump(meta, f, indent=2)

                action['status'] = 'updated_metadata'
            except Exception as e:
                # Keep processing the remaining jobs, but do not leave the
                # optimistic 'killed' status on an action that did not succeed.
                action['status'] = 'failed'
                action['error'] = str(e)

        report['actions'].append(action)

    return report
|
|
|
|
if __name__ == "__main__":
    import sys

    # Writing to job metadata is opt-in: it happens only when the first
    # command-line argument is exactly "--kill". Anything else is a dry run.
    execute = sys.argv[1:2] == ["--kill"]

    banner = (
        "EMERGENCY RECOVERY: KILLING ZOMBIES"
        if execute
        else "EMERGENCY RECOVERY: DRY RUN (USE --kill TO EXECUTE)"
    )
    print(banner)

    report = emergency_kill_zombies(dry_run=not execute)

    # Emit the full report as indented JSON on stdout.
    print(json.dumps(report, indent=2))
|