Files
luzia/lib/emergency_recovery.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00

141 lines
4.4 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Emergency OOM recovery procedures.
Identifies and safely kills stuck processes, cleans up resources.
"""
import json
import os
import signal
import subprocess
from pathlib import Path
from datetime import datetime, timedelta
def get_stuck_processes():
    """Identify stuck Claude processes.

    Scans ``ps aux`` output for lines containing ``claude`` (lines that also
    contain ``grep`` are ignored) and reads ``/proc/<pid>/status`` for each
    candidate.  A process is reported when its ``State:`` field is ``D``
    (uninterruptible sleep) or ``Z`` (zombie).

    Returns:
        list[dict]: One entry per stuck process with keys ``pid`` (int),
        ``type`` (``'uninterruptible_sleep'`` or ``'zombie'``) and ``user``
        (first column of the ``ps`` line).  Empty when ``ps`` cannot be run
        or nothing matches.
    """
    state_labels = {'D': 'uninterruptible_sleep', 'Z': 'zombie'}

    try:
        result = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # Best effort: without usable ``ps`` output there is nothing to scan.
        return []

    stuck = []
    for line in result.stdout.splitlines():
        if 'claude' not in line or 'grep' in line:
            continue
        parts = line.split()
        if len(parts) < 2:
            continue
        try:
            pid = int(parts[1])
        except ValueError:
            # Not a process row; skip it rather than aborting the whole scan.
            continue
        # Classify from the State field only.  Searching the whole status
        # text for 'D' would also match unrelated fields such as "FDSize".
        label = state_labels.get(_read_proc_state(pid))
        if label is not None:
            stuck.append({
                'pid': pid,
                'type': label,
                'user': parts[0],
            })
    return stuck


def _read_proc_state(pid):
    """Return the one-letter state code from /proc/<pid>/status, or None."""
    try:
        with open(f'/proc/{pid}/status', errors='replace') as f:
            for status_line in f:
                if status_line.startswith('State:'):
                    fields = status_line.split()
                    return fields[1] if len(fields) > 1 else None
    except OSError:
        # Process exited in the meantime, /proc is unavailable, or the
        # file is not readable by this user.
        return None
    return None
def identify_zombie_jobs(jobs_dir=None):
    """Find jobs with dead processes still marked as running.

    Each sub-directory of the jobs directory is treated as one job.  A job is
    reported when its ``meta.json`` has ``"status": "running"``, it has a
    ``pid`` file, and no process with that PID exists.

    Args:
        jobs_dir: Directory holding one sub-directory per job.  Defaults to
            ``/var/log/luz-orchestrator/jobs``.

    Returns:
        list[dict]: One entry per zombie job, ordered by job directory path,
        with keys ``job_id``, ``project``, ``pid`` and ``started``.  Empty
        when the jobs directory is missing or unreadable.
    """
    if jobs_dir is None:
        jobs_dir = "/var/log/luz-orchestrator/jobs"
    jobs_dir = Path(jobs_dir)

    try:
        job_dirs = sorted(p for p in jobs_dir.iterdir() if p.is_dir())
    except OSError:
        # No jobs directory (or no permission to list it): nothing to report.
        return []

    zombies = []
    for job_dir in job_dirs:
        meta_file = job_dir / "meta.json"
        pid_file = job_dir / "pid"
        try:
            with open(meta_file) as f:
                meta = json.load(f)
            if not isinstance(meta, dict) or meta.get("status") != "running":
                continue
            pid = int(pid_file.read_text().strip())
        except (OSError, ValueError):
            # Missing, unreadable or corrupt metadata/pid file: skip the job.
            # (JSON and Unicode decode errors are ValueError subclasses.)
            continue

        try:
            os.kill(pid, 0)  # Signal 0 = existence check only, nothing is sent
        except ProcessLookupError:
            zombies.append({
                'job_id': job_dir.name,
                'project': meta.get('project', 'unknown'),
                'pid': pid,
                'started': meta.get('started', 'unknown'),
            })
        except (OSError, OverflowError):
            # e.g. PermissionError: the process exists but belongs to another
            # user, so the job is not a zombie.
            continue
    return zombies
def clean_swap_cache():
    """Flush dirty pages and ask the kernel to drop its caches (requires root).

    Runs ``sync`` and then ``sysctl -w vm.drop_caches=3``.  Note that, despite
    the function name, ``vm.drop_caches`` releases page cache, dentries and
    inodes; it does not empty swap.

    Returns:
        bool: True only when both commands ran and exited successfully.
        False when either command is missing, ``sync`` fails, or ``sysctl``
        exits non-zero (for example when not running as root).
    """
    try:
        subprocess.run(['sync'], check=True)
        dropped = subprocess.run(
            ['sysctl', '-w', 'vm.drop_caches=3'], check=False
        )
    except (OSError, subprocess.SubprocessError):
        return False
    # Report the real outcome instead of claiming success unconditionally.
    return dropped.returncode == 0
def emergency_kill_zombies(dry_run=True):
    """Mark zombie jobs as failed and report what was done.

    Despite the name, no signal is sent: the jobs returned by
    ``identify_zombie_jobs()`` are those whose process no longer exists.  For
    each one, the job's ``meta.json`` is rewritten with ``status='failed'``,
    ``exit_code=137`` and recovery bookkeeping fields.

    Args:
        dry_run: When True (the default) nothing is modified and every action
            is reported with status ``'skipped'``.

    Returns:
        dict: Report with keys ``timestamp``, ``dry_run``, ``zombies_found``
        and ``actions``.  Each action has ``job_id``, ``project`` and
        ``status`` (``'skipped'``, ``'updated_metadata'`` or ``'failed'``);
        failed actions also carry an ``error`` message.
    """
    zombies = identify_zombie_jobs()
    report = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'zombies_found': len(zombies),
        'actions': [],
    }
    for zombie in zombies:
        action = {
            'job_id': zombie['job_id'],
            'project': zombie['project'],
            'status': 'skipped',
        }
        if not dry_run:
            try:
                # Update job meta to reflect that the process is gone
                job_dir = Path(f"/var/log/luz-orchestrator/jobs/{zombie['job_id']}")
                meta_file = job_dir / "meta.json"
                with open(meta_file) as f:
                    meta = json.load(f)
                meta['status'] = 'failed'
                meta['exit_code'] = 137  # 128 + SIGKILL
                meta['killed_by_emergency_recovery'] = True
                meta['recovery_timestamp'] = datetime.now().isoformat()
                # Serialize before opening for writing so a serialization
                # error cannot leave a truncated meta.json behind.
                payload = json.dumps(meta, indent=2)
                with open(meta_file, 'w') as f:
                    f.write(payload)
                action['status'] = 'updated_metadata'
            except Exception as e:
                # Keep going with the remaining jobs, but do not report this
                # one as handled: nothing was killed or updated.
                action['status'] = 'failed'
                action['error'] = str(e)
        report['actions'].append(action)
    return report
if __name__ == "__main__":
    import sys

    # Real changes happen only when the first CLI argument is exactly
    # "--kill"; anything else (or no argument) is a dry run.
    execute = sys.argv[1:2] == ["--kill"]
    if execute:
        banner = "EMERGENCY RECOVERY: KILLING ZOMBIES"
    else:
        banner = "EMERGENCY RECOVERY: DRY RUN (USE --kill TO EXECUTE)"
    print(banner)

    report = emergency_kill_zombies(dry_run=not execute)
    # Emit the full report as pretty-printed JSON on stdout.
    print(json.dumps(report, indent=2))