#!/usr/bin/env python3
"""
Emergency OOM recovery procedures.

Identifies stuck processes and job records whose process has died,
marks those jobs as failed, and offers a helper to drop kernel caches.
"""

import json
import os
import signal
import subprocess
from pathlib import Path
from datetime import datetime, timedelta

# Default location of the per-job directories; each one is expected to hold
# a ``meta.json`` file and, while the job runs, a ``pid`` file.
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")

# State letters from /proc/<pid>/status that count as "stuck", mapped to the
# label used in the report.
_STUCK_STATES = {
    'D': 'uninterruptible_sleep',
    'Z': 'zombie',
}


def _read_proc_state(pid):
    """Return the one-letter state from /proc/<pid>/status, or None if unreadable."""
    try:
        with open(f'/proc/{pid}/status') as f:
            for line in f:
                if line.startswith('State:'):
                    fields = line.split()
                    return fields[1][:1] if len(fields) > 1 else None
    except OSError:
        # Process exited in the meantime, /proc is unavailable, or access
        # was denied; in every case there is nothing to report.
        return None
    return None


def get_stuck_processes():
    """Identify stuck Claude processes.

    Scans ``ps aux`` for lines mentioning ``claude`` (ignoring ``grep``
    lines) and keeps the processes whose /proc state is ``D``
    (uninterruptible sleep) or ``Z`` (zombie).

    Returns:
        A list of dicts with keys ``pid``, ``type`` (``'uninterruptible_sleep'``
        or ``'zombie'``) and ``user``. Empty if ``ps`` cannot be run.
    """
    try:
        result = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    except (OSError, ValueError):
        # ps missing, or its output could not be decoded: best effort only.
        return []

    stuck = []
    for line in result.stdout.splitlines():
        if 'claude' not in line or 'grep' in line:
            continue
        parts = line.split()
        if len(parts) < 2:
            continue
        try:
            pid = int(parts[1])
        except ValueError:
            # A malformed row must not abort the scan of the remaining rows.
            continue
        # Classify from the State: line itself; searching the whole status
        # text for 'D' would also match unrelated fields such as "FDSize".
        label = _STUCK_STATES.get(_read_proc_state(pid))
        if label is not None:
            stuck.append({
                'pid': pid,
                'type': label,
                'user': parts[0],
            })
    return stuck


def identify_zombie_jobs(jobs_dir=None):
    """Find jobs with dead processes still marked as running.

    Args:
        jobs_dir: Directory containing one sub-directory per job. Defaults
            to ``JOBS_DIR``.

    Returns:
        A list of dicts with keys ``job_id``, ``project``, ``pid`` and
        ``started``, ordered by job directory name. Empty when the jobs
        directory is missing or unreadable.
    """
    root = JOBS_DIR if jobs_dir is None else Path(jobs_dir)
    try:
        entries = sorted(root.iterdir())
    except OSError:
        # No jobs directory (or no access) simply means no zombie jobs.
        return []

    zombies = []
    for job_dir in entries:
        if not job_dir.is_dir():
            continue
        meta_file = job_dir / "meta.json"
        pid_file = job_dir / "pid"
        if not meta_file.exists():
            continue

        try:
            with open(meta_file, encoding="utf-8") as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # Unreadable or corrupt metadata: skip this job, keep scanning.
            continue
        if not isinstance(meta, dict):
            continue
        if meta.get("status") != "running" or not pid_file.exists():
            continue

        try:
            pid = int(pid_file.read_text().strip())
        except (OSError, ValueError):
            continue
        if pid <= 0:
            # Non-positive values address process groups, not a single process.
            continue

        try:
            os.kill(pid, 0)  # Signal 0 = existence check only, nothing is sent
        except ProcessLookupError:
            zombies.append({
                'job_id': job_dir.name,
                'project': meta.get('project', 'unknown'),
                'pid': pid,
                'started': meta.get('started', 'unknown'),
            })
        except (OSError, OverflowError):
            # PermissionError means the process exists but belongs to someone
            # else; anything else is inconclusive. Either way: not a zombie.
            continue
    return zombies


def clean_swap_cache():
    """Flush dirty pages and ask the kernel to drop its caches (requires root).

    Runs ``sync`` followed by ``sysctl -w vm.drop_caches=3``. Despite the
    function's name, this releases page cache, dentries and inodes; it does
    not free swap space.

    Returns:
        True if ``sync`` succeeded and ``sysctl`` could be launched, False
        otherwise. A non-zero exit status from ``sysctl`` (for example when
        not running as root) is not treated as a failure.
    """
    try:
        subprocess.run(['sync'], check=True)
        subprocess.run(['sysctl', '-w', 'vm.drop_caches=3'], check=False)
        return True
    except (OSError, subprocess.SubprocessError):
        return False


def emergency_kill_zombies(dry_run=True, jobs_dir=None):
    """Mark jobs whose process has died as failed and report what was done.

    No signal is sent: the processes concerned are already gone. Only the
    job's ``meta.json`` is rewritten.

    Args:
        dry_run: When True (the default) nothing is modified and every action
            is reported with status ``'skipped'``.
        jobs_dir: Directory containing one sub-directory per job. Defaults
            to ``JOBS_DIR``.

    Returns:
        A report dict with keys ``timestamp``, ``dry_run``, ``zombies_found``
        and ``actions``. Each action has ``job_id``, ``project`` and a
        ``status`` of ``'skipped'``, ``'updated_metadata'`` or ``'error'``
        (the latter accompanied by an ``error`` message).
    """
    root = JOBS_DIR if jobs_dir is None else Path(jobs_dir)
    zombies = identify_zombie_jobs(root)

    report = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'zombies_found': len(zombies),
        'actions': [],
    }

    for zombie in zombies:
        action = {
            'job_id': zombie['job_id'],
            'project': zombie['project'],
            'status': 'skipped',
        }

        if not dry_run:
            meta_file = root / zombie['job_id'] / "meta.json"
            try:
                with open(meta_file, encoding="utf-8") as f:
                    meta = json.load(f)

                meta['status'] = 'failed'
                meta['exit_code'] = 137  # 128 + SIGKILL(9)
                meta['killed_by_emergency_recovery'] = True
                meta['recovery_timestamp'] = datetime.now().isoformat()

                with open(meta_file, 'w', encoding="utf-8") as f:
                    json.dump(meta, f, indent=2)

                action['status'] = 'updated_metadata'
            except (OSError, ValueError, TypeError) as e:
                # Report the failure honestly instead of leaving a status
                # that claims the job was handled.
                action['status'] = 'error'
                action['error'] = str(e)

        report['actions'].append(action)

    return report


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "--kill":
        print("EMERGENCY RECOVERY: KILLING ZOMBIES")
        report = emergency_kill_zombies(dry_run=False)
    else:
        print("EMERGENCY RECOVERY: DRY RUN (USE --kill TO EXECUTE)")
        report = emergency_kill_zombies(dry_run=True)

    print(json.dumps(report, indent=2))