Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:

- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
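The DockerTmuxController itself is not part of the file shown in this diff. A minimal sketch of the pattern the bullets above describe, assuming a docker-exec transport and a named tmux session (the container/session handling here is invented; only the method names come from the commit message). wait_for_shell_prompt() would be wait_for_prompt() specialized to a shell-prompt pattern such as "$ ".

    #!/usr/bin/env python3
    """Sketch of the DockerTmuxController pattern; not the committed code."""
    import hashlib
    import subprocess
    import time


    class DockerTmuxController:
        def __init__(self, container: str, session: str = 'main'):
            self.container = container
            self.session = session

        def _tmux(self, *args: str) -> str:
            """Run a tmux command inside the container via docker exec."""
            cmd = ['docker', 'exec', self.container, 'tmux', *args]
            return subprocess.run(
                cmd, capture_output=True, text=True, check=True
            ).stdout

        def send_keys(self, keys: str, delay_enter: float = 0.0) -> None:
            """Send keys to the session; optionally pause before Enter."""
            self._tmux('send-keys', '-t', self.session, keys)
            if delay_enter:
                time.sleep(delay_enter)
            self._tmux('send-keys', '-t', self.session, 'Enter')

        def capture_pane(self) -> str:
            """Return the current pane contents."""
            return self._tmux('capture-pane', '-t', self.session, '-p')

        def wait_for_prompt(self, pattern: str, timeout: float = 60.0) -> bool:
            """Poll pane output until `pattern` appears (completion detection)."""
            deadline = time.time() + timeout
            while time.time() < deadline:
                if pattern in self.capture_pane():
                    return True
                time.sleep(1)
            return False

        def wait_for_idle(self, quiet_secs: float = 5.0,
                          timeout: float = 300.0) -> bool:
            """Consider the pane idle once its content hash stops changing."""
            deadline = time.time() + timeout
            last_hash, stable_since = '', time.time()
            while time.time() < deadline:
                h = hashlib.sha256(self.capture_pane().encode()).hexdigest()
                if h != last_hash:
                    last_hash, stable_since = h, time.time()
                elif time.time() - stable_since >= quiet_secs:
                    return True
                time.sleep(1)
            return False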
lib/conductor_recovery.py (new file, 383 lines)
@@ -0,0 +1,383 @@
#!/usr/bin/env python3
"""
Conductor Task Recovery

Auto-recovery for stalled conductor tasks:
- Kill zombie processes
- Release task locks
- Update task status
- Move to failed directory if unrecoverable
"""

import json
import os
import shutil
import signal
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List


class ConductorRecovery:
    """Recover from stalled conductor tasks."""

    CONDUCTOR_ROOT = Path('/home/admin/conductor')
    HEARTBEAT_TIMEOUT_SECS = 300

    def __init__(self):
        """Initialize conductor recovery."""
        self.conductor_root = self.CONDUCTOR_ROOT
        self.active_dir = self.conductor_root / 'active'
        self.failed_dir = self.conductor_root / 'failed'

    def find_stalled_tasks(self) -> List[Dict]:
        """
        Find all stalled tasks in conductor/active.

        Returns:
            List of stalled task metadata dicts
        """
        stalled = []

        if not self.active_dir.exists():
            return stalled

        now = time.time()

        for task_dir in self.active_dir.iterdir():
            if not task_dir.is_dir():
                continue

            task_id = task_dir.name
            stall_reason = None
            stall_details = {}

            # Check heartbeat timeout
            heartbeat_file = task_dir / 'heartbeat.json'
            if heartbeat_file.exists():
                try:
                    hb = json.loads(heartbeat_file.read_text())
                    hb_age = now - hb.get('ts', 0)

                    if hb_age > self.HEARTBEAT_TIMEOUT_SECS:
                        stall_reason = 'heartbeat_timeout'
                        stall_details = {
                            'heartbeat_age_secs': int(hb_age),
                            'last_step': hb.get('step', 'unknown')
                        }
                except (OSError, json.JSONDecodeError):
                    # Unreadable heartbeat: fall through to the PID check
                    pass

            # Check if process exists
            pid_file = task_dir / 'pid'
            if pid_file.exists() and not stall_reason:
                try:
                    pid = int(pid_file.read_text().strip())
                    if not os.path.exists(f'/proc/{pid}'):
                        stall_reason = 'process_not_found'
                        stall_details = {'pid': pid}
                except (OSError, ValueError):
                    pass

            if stall_reason:
                stalled.append({
                    'task_id': task_id,
                    'task_dir': str(task_dir),
                    'stall_reason': stall_reason,
                    'details': stall_details,
                    'timestamp': now
                })

        return stalled

    def recover_stalled_task(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Attempt to recover a single stalled task.

        Args:
            task_id: Task ID to recover
            dry_run: If True, preview actions without making changes

        Returns:
            Dict with recovery result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        actions = []
        result_status = 'unknown'

        # 1. Kill zombie process (if exists)
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                if os.path.exists(f'/proc/{pid}'):
                    actions.append(f"Kill process {pid}")
                    if not dry_run:
                        try:
                            os.kill(pid, signal.SIGTERM)
                            time.sleep(1)
                            # Force kill if still exists
                            if os.path.exists(f'/proc/{pid}'):
                                os.kill(pid, signal.SIGKILL)
                        except OSError:
                            pass
                else:
                    actions.append(f"Process {pid} already terminated")
            except (OSError, ValueError):
                pass

        # 2. Update heartbeat to current time (signal recovery attempt)
        heartbeat_file = task_dir / 'heartbeat.json'
        actions.append("Update heartbeat to current time")
        if not dry_run:
            hb_data = {
                'ts': time.time(),
                'step': 'recovery_attempt',
                'recovered_at': datetime.now().isoformat()
            }
            heartbeat_file.write_text(json.dumps(hb_data, indent=2))

        # 3. Update progress file, preserving any previous content
        progress_file = task_dir / 'progress.md'
        actions.append("Update progress with recovery note")
        if not dry_run:
            previous = progress_file.read_text() if progress_file.exists() else '(none)'
            progress_content = f"""# Task Recovery

**Recovered at:** {datetime.now().isoformat()}
**Status:** Task was stalled, recovery attempted

## Original Progress
{previous}

## Recovery Actions
- Process killed/terminated
- Heartbeat reset
- Progress file updated

**Next step:** Monitor task progress. If still stalled, may need manual intervention.
"""
            progress_file.write_text(progress_content)

        # 4. Update meta to mark recovery attempt
        meta_file = task_dir / 'meta.json'
        actions.append("Update metadata with recovery flag")
        if not dry_run:
            try:
                meta = json.loads(meta_file.read_text())
                meta['recovery_attempts'] = meta.get('recovery_attempts', 0) + 1
                meta['last_recovery'] = datetime.now().isoformat()
                meta_file.write_text(json.dumps(meta, indent=2))
            except (OSError, json.JSONDecodeError):
                pass

        # 5. Decision: keep in active, or move to failed after too many recovery attempts
        meta = json.loads(meta_file.read_text()) if meta_file.exists() else {}
        recovery_attempts = meta.get('recovery_attempts', 0)

        if recovery_attempts >= 3:
            result_status = 'moved_to_failed'
            actions.append("Move to failed (too many recovery attempts)")
            if not dry_run:
                self._move_task_to_failed(task_dir, task_id, "Exceeded maximum recovery attempts")
        else:
            result_status = 'recovered'
            actions.append("Keep in active (monitor progress)")

        return {
            'task_id': task_id,
            'status': result_status,
            'actions': actions,
            'dry_run': dry_run,
            'timestamp': time.time()
        }
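
    # Illustrative shape of the dict returned by recover_stalled_task() for a
    # dry run (values invented for this example, not from a real task):
    #   {'task_id': 'task-001', 'status': 'recovered',
    #    'actions': ['Kill process 1234',
    #                'Update heartbeat to current time',
    #                'Update progress with recovery note',
    #                'Update metadata with recovery flag',
    #                'Keep in active (monitor progress)'],
    #    'dry_run': True, 'timestamp': 1700000000.0}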

    def recover_all_stalled_tasks(self, dry_run: bool = True) -> Dict:
        """
        Recover all stalled tasks.

        Args:
            dry_run: If True, preview without making changes

        Returns:
            Dict with batch recovery results
        """
        stalled_tasks = self.find_stalled_tasks()

        if not stalled_tasks:
            return {
                'total_stalled': 0,
                'recovered': 0,
                'moved_to_failed': 0,
                'results': [],
                'dry_run': dry_run,
                'timestamp': time.time()
            }

        results = []
        recovered_count = 0
        moved_count = 0

        for stalled in stalled_tasks:
            task_id = stalled['task_id']
            result = self.recover_stalled_task(task_id, dry_run=dry_run)
            results.append(result)

            if result['status'] == 'recovered':
                recovered_count += 1
            elif result['status'] == 'moved_to_failed':
                moved_count += 1

        return {
            'total_stalled': len(stalled_tasks),
            'recovered': recovered_count,
            'moved_to_failed': moved_count,
            'results': results,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def release_locks(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Release any locks held by a task.

        Args:
            task_id: Task ID
            dry_run: If True, preview without making changes

        Returns:
            Dict with lock release results
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        # Look for lock files
        lock_dir = task_dir / 'locks'
        released = []

        if lock_dir.exists():
            for lock_file in lock_dir.iterdir():
                released.append(str(lock_file))
                if not dry_run:
                    lock_file.unlink()

        return {
            'task_id': task_id,
            'locks_released': len(released),
            'lock_files': released,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def validate_recovery(self, task_id: str) -> Dict:
        """
        Validate that a task recovered successfully.

        Args:
            task_id: Task ID to validate

        Returns:
            Dict with validation result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'not_found', 'task_id': task_id}

        # Check heartbeat is recent
        heartbeat_file = task_dir / 'heartbeat.json'
        is_alive = False

        if heartbeat_file.exists():
            try:
                hb = json.loads(heartbeat_file.read_text())
                hb_age = time.time() - hb.get('ts', 0)
                # Consider alive if younger than the stall threshold (5 min)
                is_alive = hb_age < self.HEARTBEAT_TIMEOUT_SECS
            except (OSError, json.JSONDecodeError):
                pass

        # Check for process
        process_running = False
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                process_running = os.path.exists(f'/proc/{pid}')
            except (OSError, ValueError):
                pass

        # Overall recovery status
        recovery_status = 'recovered' if is_alive or process_running else 'stalled'

        return {
            'task_id': task_id,
            'recovery_status': recovery_status,
            'heartbeat_alive': is_alive,
            'process_running': process_running,
            'timestamp': time.time()
        }

    def _move_task_to_failed(self, task_dir: Path, task_id: str, failure_reason: str) -> bool:
        """Move a task from active to failed."""
        try:
            failed_task_dir = self.failed_dir / task_id
            failed_task_dir.mkdir(parents=True, exist_ok=True)

            # Copy all top-level files (subdirectories such as locks/ are not preserved)
            for item in task_dir.iterdir():
                if item.is_file():
                    shutil.copy2(item, failed_task_dir / item.name)

            # Update meta with failure reason
            meta_file = failed_task_dir / 'meta.json'
            if meta_file.exists():
                meta = json.loads(meta_file.read_text())
            else:
                meta = {}

            meta['failure_reason'] = failure_reason
            meta['moved_to_failed_at'] = datetime.now().isoformat()
            meta_file.write_text(json.dumps(meta, indent=2))

            # Create error.txt
            error_file = failed_task_dir / 'error.txt'
            error_file.write_text(
                f"Task stalled: {failure_reason}\n"
                f"Moved to failed: {datetime.now().isoformat()}"
            )

            # Remove from active
            shutil.rmtree(task_dir)

            return True
        except Exception as e:
            print(f"Error moving task {task_id} to failed: {e}")
            return False


if __name__ == '__main__':
    recovery = ConductorRecovery()

    print("=" * 70)
    print("FINDING STALLED TASKS")
    print("=" * 70)
    stalled = recovery.find_stalled_tasks()
    print(f"Found {len(stalled)} stalled task(s)")
    for task in stalled[:5]:
        print(f"  - {task['task_id']}: {task['stall_reason']}")

    if stalled:
        print("\n" + "=" * 70)
        print("RECOVERY DRY RUN (preview only)")
        print("=" * 70)
        result = recovery.recover_all_stalled_tasks(dry_run=True)
        print(f"Would recover: {result['recovered']}")
        print(f"Would move to failed: {result['moved_to_failed']}")
        print("\nActions:")
        for r in result['results'][:1]:
            for action in r['actions']:
                print(f"  - {action}")
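The __main__ block above only previews. For unattended use, a wrapper could apply recovery for real after a dry run finds stalled tasks; a sketch (hypothetical, not part of this commit; assumes lib/ is on PYTHONPATH so the module imports as conductor_recovery):

    #!/usr/bin/env python3
    # Hypothetical cron wrapper: preview first, then apply recovery for real
    # only when the dry run actually found stalled tasks.
    from conductor_recovery import ConductorRecovery

    recovery = ConductorRecovery()
    preview = recovery.recover_all_stalled_tasks(dry_run=True)
    if preview['total_stalled']:
        applied = recovery.recover_all_stalled_tasks(dry_run=False)
        print(f"Recovered {applied['recovered']}, "
              f"moved {applied['moved_to_failed']} to failed")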