Refactor cockpit to use DockerTmuxController pattern

Based on the TmuxCLIController from claude-code-tools, this refactor:

- Adds a DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
admin
2026-01-14 10:42:16 -03:00
commit ec33ac1936
265 changed files with 92011 additions and 0 deletions

140
lib/emergency_recovery.py Executable file
View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Emergency OOM recovery procedures.
Identifies and safely kills stuck processes, cleans up resources.
"""
import json
import os
import signal
import subprocess
from pathlib import Path
from datetime import datetime, timedelta
def _proc_state_code(status_text):
    """Return the one-letter state code from ``/proc/<pid>/status`` text, or None."""
    for line in status_text.splitlines():
        if line.startswith('State:'):
            fields = line.split()
            # "State:\tD (disk sleep)" -> fields == ['State:', 'D', '(disk', 'sleep)']
            return fields[1][:1] if len(fields) > 1 else None
    return None


def get_stuck_processes(*, ps_output=None, proc_root='/proc'):
    """Identify stuck Claude processes.

    A process is reported when its ``ps aux`` line contains ``claude`` (and
    not ``grep``) and its ``State:`` line in ``<proc_root>/<pid>/status`` is
    ``D`` (uninterruptible sleep) or ``Z`` (zombie).

    Args:
        ps_output: Text in ``ps aux`` format to scan. When None (the
            default), ``ps aux`` is executed to obtain it.
        proc_root: Directory holding the per-PID ``status`` files.

    Returns:
        A list of dicts with keys ``pid`` (int), ``type``
        (``'uninterruptible_sleep'`` or ``'zombie'``) and ``user`` (first
        column of the ``ps`` line). Empty when ``ps`` cannot be run.
    """
    if ps_output is None:
        try:
            ps_output = subprocess.run(
                ['ps', 'aux'], capture_output=True, text=True
            ).stdout
        except (OSError, ValueError, subprocess.SubprocessError):
            # Best effort: without a process listing there is nothing to report.
            return []

    state_types = {'D': 'uninterruptible_sleep', 'Z': 'zombie'}
    stuck = []
    for line in ps_output.splitlines():
        if 'claude' not in line or 'grep' in line:
            continue
        parts = line.split()
        if len(parts) < 2:
            continue
        try:
            pid = int(parts[1])
        except ValueError:
            # Skip just this line; one odd row must not abort the whole scan.
            continue
        try:
            with open(os.path.join(proc_root, str(pid), 'status')) as f:
                status = f.read()
        except (OSError, ValueError):
            # Process exited between the listing and the read, or is unreadable.
            continue
        # Classify from the State: line only. Searching the whole file for
        # 'D' would also match other fields such as "FDSize" and mislabel
        # zombies as uninterruptible sleep.
        kind = state_types.get(_proc_state_code(status))
        if kind is not None:
            stuck.append({
                'pid': pid,
                'type': kind,
                'user': parts[0],
            })
    return stuck
def identify_zombie_jobs(jobs_dir="/var/log/luz-orchestrator/jobs"):
    """Find jobs with dead processes still marked as running.

    Each sub-directory of ``jobs_dir`` is treated as one job. A job is
    reported when its ``meta.json`` has ``"status": "running"``, it has a
    ``pid`` file, and no process with that PID exists.

    Args:
        jobs_dir: Directory containing one sub-directory per job.

    Returns:
        A list of dicts (ordered by job directory name) with keys
        ``job_id``, ``project``, ``pid`` and ``started``. Empty when
        ``jobs_dir`` is missing or unreadable.
    """
    jobs_root = Path(jobs_dir)
    try:
        job_dirs = sorted(p for p in jobs_root.iterdir() if p.is_dir())
    except OSError:
        # No jobs directory (or no access) means there is nothing to report.
        return []

    zombies = []
    for job_dir in job_dirs:
        meta_file = job_dir / "meta.json"
        pid_file = job_dir / "pid"
        try:
            with open(meta_file) as f:
                meta = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or malformed metadata: skip this job.
            continue
        if not isinstance(meta, dict) or meta.get("status") != "running":
            continue
        try:
            pid = int(pid_file.read_text().strip())
        except (OSError, ValueError):
            # No pid file, or its content is not a number.
            continue
        try:
            os.kill(pid, 0)  # Signal 0 = existence check only, nothing is sent
        except ProcessLookupError:
            zombies.append({
                'job_id': job_dir.name,
                'project': meta.get('project', 'unknown'),
                'pid': pid,
                'started': meta.get('started', 'unknown'),
            })
        except (OSError, OverflowError):
            # e.g. PermissionError: the process exists but is not ours, so
            # the job is not a zombie.
            continue
    return zombies
def clean_swap_cache():
    """Flush dirty pages and ask the kernel to drop its caches (requires root).

    Runs ``sync`` followed by ``sysctl -w vm.drop_caches=3``.

    NOTE(review): despite the function name, ``vm.drop_caches`` releases
    page cache, dentries and inodes; it does not free swap space.

    Returns:
        True only if both commands ran and exited with status 0; False if
        either could not be executed or failed (for example when not root).
    """
    try:
        # Write dirty pages out first so more of the cache becomes droppable.
        subprocess.run(['sync'], check=True)
        dropped = subprocess.run(
            ['sysctl', '-w', 'vm.drop_caches=3'], check=False
        )
    except (OSError, subprocess.SubprocessError):
        return False
    # Report the real outcome instead of claiming success unconditionally.
    return dropped.returncode == 0
def emergency_kill_zombies(dry_run=True):
    """Mark zombie jobs as failed and report what was done.

    Zombie jobs are those returned by ``identify_zombie_jobs()``: their
    process is already gone, so no signal is sent here. When ``dry_run``
    is false, each job's ``meta.json`` is rewritten with a failed status.

    Args:
        dry_run: When true (the default), only list the zombie jobs and
            leave their metadata untouched.

    Returns:
        A dict with ``timestamp``, ``dry_run``, ``zombies_found`` and
        ``actions``. Each action has ``job_id``, ``project`` and a
        ``status`` of ``'skipped'`` (dry run), ``'updated_metadata'``
        (metadata rewritten) or ``'error'`` (update failed; the message is
        stored under ``error``).
    """
    zombies = identify_zombie_jobs()
    report = {
        'timestamp': datetime.now().isoformat(),
        'dry_run': dry_run,
        'zombies_found': len(zombies),
        'actions': [],
    }
    jobs_root = Path("/var/log/luz-orchestrator/jobs")
    for zombie in zombies:
        action = {
            'job_id': zombie['job_id'],
            'project': zombie['project'],
            'status': 'skipped',
        }
        if not dry_run:
            meta_file = jobs_root / zombie['job_id'] / "meta.json"
            try:
                with open(meta_file) as f:
                    meta = json.load(f)
                meta['status'] = 'failed'
                meta['exit_code'] = 137  # 128 + 9, the conventional SIGKILL exit code
                meta['killed_by_emergency_recovery'] = True
                meta['recovery_timestamp'] = datetime.now().isoformat()
                # Serialise before opening for writing: mode 'w' truncates
                # the file, so a serialisation error after that point would
                # leave an empty or partial meta.json behind.
                payload = json.dumps(meta, indent=2)
                with open(meta_file, 'w') as f:
                    f.write(payload)
                action['status'] = 'updated_metadata'
            except Exception as e:
                # Record the failure and carry on with the remaining jobs;
                # never report a job as handled when its update failed.
                action['status'] = 'error'
                action['error'] = str(e)
        report['actions'].append(action)
    return report
if __name__ == "__main__":
    import sys

    # Only an explicit "--kill" as the first argument leaves dry-run mode.
    execute = len(sys.argv) > 1 and sys.argv[1] == "--kill"
    if execute:
        print("EMERGENCY RECOVERY: KILLING ZOMBIES")
    else:
        print("EMERGENCY RECOVERY: DRY RUN (USE --kill TO EXECUTE)")
    report = emergency_kill_zombies(dry_run=not execute)
    # Emit the full report as JSON on stdout.
    print(json.dumps(report, indent=2))