Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:
- Adds a DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
140
lib/emergency_recovery.py
Executable file
140
lib/emergency_recovery.py
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Emergency OOM recovery procedures.
|
||||
Identifies and safely kills stuck processes, cleans up resources.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
def get_stuck_processes():
    """Identify Claude processes that are stuck or defunct.

    Scans the output of ``ps aux`` for lines that mention ``claude`` (and do
    not mention ``grep``), then reads ``/proc/<pid>/status`` for each match.

    Returns:
        A list of dicts with the keys ``pid`` (int), ``type``
        (``'uninterruptible_sleep'`` for state D, ``'zombie'`` for state Z)
        and ``user`` (the first ``ps`` column). The list is empty when
        ``ps`` cannot be run or nothing matches.
    """
    try:
        result = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    except (OSError, ValueError):
        # Best effort: without usable ``ps`` output there is nothing to report.
        return []

    stuck = []
    for line in result.stdout.splitlines():
        if 'claude' not in line or 'grep' in line:
            continue

        parts = line.split()
        if len(parts) < 2:
            continue

        try:
            pid = int(parts[1])
        except ValueError:
            # Not a process row; skip it instead of aborting the whole scan.
            continue

        try:
            with open(f'/proc/{pid}/status') as f:
                status = f.read()
        except (OSError, ValueError):
            # The process exited after ``ps`` ran, or /proc is unavailable.
            continue

        # Classify on the State field itself. A bare ``'D' in status`` would
        # match a capital D anywhere in the file (e.g. the FDSize field) and
        # mislabel zombies as uninterruptible sleepers.
        if 'State:\tD' in status:
            kind = 'uninterruptible_sleep'
        elif 'State:\tZ' in status:
            kind = 'zombie'
        else:
            continue

        stuck.append({
            'pid': pid,
            'type': kind,
            'user': parts[0],
        })

    return stuck
|
||||
|
||||
def identify_zombie_jobs(jobs_dir=None):
    """Find jobs still marked as running whose process no longer exists.

    A job is a sub-directory of the jobs directory containing ``meta.json``
    and a ``pid`` file. It is reported when ``meta.json`` has
    ``"status": "running"`` and signalling the recorded pid with signal 0
    raises ``ProcessLookupError``.

    Args:
        jobs_dir: Directory holding one sub-directory per job. Defaults to
            ``/var/log/luz-orchestrator/jobs`` when omitted.

    Returns:
        A list of dicts with the keys ``job_id``, ``project``, ``pid`` and
        ``started``, ordered by job directory path. The list is empty when
        the jobs directory is missing or unreadable.
    """
    root = Path("/var/log/luz-orchestrator/jobs") if jobs_dir is None else Path(jobs_dir)

    try:
        entries = sorted(root.iterdir())
    except OSError:
        # No jobs directory (or no access) simply means no zombie jobs.
        return []

    zombies = []
    for job_dir in entries:
        if not job_dir.is_dir():
            continue

        try:
            with open(job_dir / "meta.json") as f:
                meta = json.load(f)
            if not isinstance(meta, dict) or meta.get("status") != "running":
                continue
            pid = int((job_dir / "pid").read_text().strip())
        except (OSError, ValueError):
            # Missing/unreadable files or malformed JSON/pid: skip this job only.
            continue

        if pid <= 0:
            # os.kill() treats non-positive pids as process groups, not a process.
            continue

        try:
            os.kill(pid, 0)  # Signal 0 = existence check, nothing is delivered
        except ProcessLookupError:
            zombies.append({
                'job_id': job_dir.name,
                'project': meta.get('project', 'unknown'),
                'pid': pid,
                'started': meta.get('started', 'unknown'),
            })
        except (OSError, OverflowError):
            # e.g. PermissionError: the process exists but belongs to someone else.
            continue

    return zombies
|
||||
|
||||
def clean_swap_cache():
    """Flush dirty pages and ask the kernel to drop its caches (requires root).

    Runs ``sync`` followed by ``sysctl -w vm.drop_caches=3``. Despite the
    function name, ``vm.drop_caches=3`` releases the page cache, dentries
    and inodes; it does not empty swap.

    Returns:
        True only when both commands ran and exited with status 0, False
        otherwise (command missing, ``sync`` failed, or ``sysctl`` rejected
        the write, e.g. when not running as root).
    """
    try:
        subprocess.run(['sync'], check=True)
        dropped = subprocess.run(['sysctl', '-w', 'vm.drop_caches=3'], check=False)
    except (OSError, subprocess.CalledProcessError):
        return False

    # Report the real outcome instead of assuming success.
    return dropped.returncode == 0
|
||||
|
||||
def emergency_kill_zombies(dry_run=True):
    """Mark zombie jobs as failed in their metadata and report the outcome.

    The jobs come from ``identify_zombie_jobs()``, i.e. their processes are
    already gone. No signal is sent here; only each job's ``meta.json``
    under ``/var/log/luz-orchestrator/jobs/<job_id>/`` is rewritten.

    Args:
        dry_run: When True (the default) nothing is written and every
            action is reported with status ``'skipped'``.

    Returns:
        A dict with the keys ``timestamp``, ``dry_run``, ``zombies_found``
        and ``actions``. Each action holds ``job_id``, ``project`` and
        ``status`` (``'skipped'``, ``'updated_metadata'`` or ``'failed'``);
        failed actions also carry an ``error`` message.
    """
    zombies = identify_zombie_jobs()

    report = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'zombies_found': len(zombies),
        'actions': [],
    }

    for zombie in zombies:
        action = {
            'job_id': zombie['job_id'],
            'project': zombie['project'],
            'status': 'skipped' if dry_run else 'pending',
        }

        if not dry_run:
            job_dir = Path(f"/var/log/luz-orchestrator/jobs/{zombie['job_id']}")
            meta_file = job_dir / "meta.json"
            tmp_file = job_dir / "meta.json.tmp"

            try:
                with open(meta_file) as f:
                    meta = json.load(f)

                meta['status'] = 'failed'
                meta['exit_code'] = 137  # 128 + 9 (SIGKILL)
                meta['killed_by_emergency_recovery'] = True
                meta['recovery_timestamp'] = datetime.now().isoformat()

                # Write to a sibling file and swap it in, so a failed dump
                # can never leave a truncated meta.json behind.
                with open(tmp_file, 'w') as f:
                    json.dump(meta, f, indent=2)
                os.replace(tmp_file, meta_file)

                action['status'] = 'updated_metadata'
            except Exception as e:
                # Record the failure per job and keep going with the rest.
                action['status'] = 'failed'
                action['error'] = str(e)
                try:
                    tmp_file.unlink()
                except OSError:
                    # Nothing to clean up (temp file was never created).
                    pass

        report['actions'].append(action)

    return report
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Only an explicit --kill as the first argument leaves dry-run mode.
    execute = sys.argv[1:2] == ["--kill"]

    if execute:
        banner = "EMERGENCY RECOVERY: KILLING ZOMBIES"
    else:
        banner = "EMERGENCY RECOVERY: DRY RUN (USE --kill TO EXECUTE)"
    print(banner)

    report = emergency_kill_zombies(dry_run=not execute)

    # Emit the full report as pretty-printed JSON on stdout.
    print(json.dumps(report, indent=2))
|
||||
Reference in New Issue
Block a user