Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:

- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
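For orientation, a minimal sketch of how a controller with the methods listed above might be driven. Only the method names (send_keys, capture_pane, wait_for_prompt, wait_for_idle, wait_for_shell_prompt) and the delay_enter parameter come from this commit message; the module path, constructor arguments, and return values are assumptions, not the committed API.

```python
# Hypothetical usage sketch: method names come from the commit message above,
# but the import path, constructor, and signatures are assumptions.
from cockpit.docker_tmux import DockerTmuxController  # assumed module path

controller = DockerTmuxController(container="agent-sandbox", session="cockpit")

# Send a command; delay_enter gives the pane time to settle before Enter is sent.
controller.send_keys("pytest -q", delay_enter=0.2)

# Prefer pattern-based completion detection; fall back to content-hash idle detection.
if not controller.wait_for_prompt(pattern=r"\d+ passed", timeout=120):
    controller.wait_for_idle(timeout=60)

output = controller.capture_pane()   # retrieve the visible pane output
controller.wait_for_shell_prompt()   # confirm the shell prompt is back
print(output)
```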
lib/task_watchdog.py (normal file, 538 lines added)
@@ -0,0 +1,538 @@
#!/usr/bin/env python3
"""
Task Watchdog - Monitor running tasks for stuck/failed states

Key responsibilities:
1. Monitor heartbeats for running tasks
2. Detect and clean up stuck/orphaned tasks
3. Release stale locks automatically
4. Track task state transitions
5. Support cockpit "awaiting_human" state

Usage:
    from task_watchdog import TaskWatchdog

    watchdog = TaskWatchdog()
    watchdog.run_check()  # Single check
    watchdog.run_loop()   # Continuous monitoring
"""

import json
import os
import time
import signal
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import fcntl


class TaskWatchdog:
    """Monitor running tasks for stuck/failed states."""

    # Configuration
    CONDUCTOR_BASE = Path.home() / "conductor"
    ACTIVE_DIR = CONDUCTOR_BASE / "active"
    COMPLETED_DIR = CONDUCTOR_BASE / "completed"
    FAILED_DIR = CONDUCTOR_BASE / "failed"

    QUEUE_BASE = Path("/var/lib/luzia/queue")
    LOCKS_BASE = Path("/var/lib/luzia/locks")
    CAPACITY_FILE = QUEUE_BASE / "capacity.json"

    JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")

    # Cockpit state directory
    COCKPIT_STATE_DIR = Path("/var/lib/luz-orchestrator/cockpits")

    # Timeouts
    HEARTBEAT_TIMEOUT_SECONDS = 300  # 5 minutes without heartbeat = stuck
    LOCK_TIMEOUT_SECONDS = 3600  # 1 hour lock timeout
    AWAITING_HUMAN_TIMEOUT = 86400  # 24 hours for human response

    # State transitions
    STATES = {
        'pending': ['claimed', 'cancelled'],
        'claimed': ['dispatched', 'failed', 'cancelled'],
        'dispatched': ['running', 'failed'],
        'running': ['completed', 'failed', 'awaiting_human', 'stuck'],
        'awaiting_human': ['running', 'failed', 'cancelled'],
        'stuck': ['failed', 'recovered'],
        'completed': [],
        'failed': ['retrying'],
        'retrying': ['running', 'failed'],
        'cancelled': []
    }

    def __init__(self):
        """Initialize watchdog."""
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Ensure required directories exist."""
        for d in [self.ACTIVE_DIR, self.COMPLETED_DIR, self.FAILED_DIR]:
            d.mkdir(parents=True, exist_ok=True)
        self.LOCKS_BASE.mkdir(parents=True, exist_ok=True)
        self.COCKPIT_STATE_DIR.mkdir(parents=True, exist_ok=True)

    def check_heartbeats(self) -> List[Dict]:
        """
        Check all active tasks for stale heartbeats.
        Returns list of stuck tasks.
        """
        stuck_tasks = []
        now = time.time()

        if not self.ACTIVE_DIR.exists():
            return stuck_tasks

        for task_dir in self.ACTIVE_DIR.iterdir():
            if not task_dir.is_dir():
                continue

            task_id = task_dir.name
            heartbeat_file = task_dir / "heartbeat.json"
            meta_file = task_dir / "meta.json"

            # Load task metadata
            meta = {}
            if meta_file.exists():
                try:
                    meta = json.loads(meta_file.read_text())
                except:
                    pass

            # Check if task is in awaiting_human state (from cockpit)
            if meta.get('status') == 'awaiting_human':
                # Check if awaiting too long
                awaiting_since = meta.get('awaiting_since', now)
                if now - awaiting_since > self.AWAITING_HUMAN_TIMEOUT:
                    stuck_tasks.append({
                        'task_id': task_id,
                        'reason': 'awaiting_human_timeout',
                        'last_heartbeat': None,
                        'meta': meta
                    })
                continue

            # Check heartbeat for running tasks
            if not heartbeat_file.exists():
                # No heartbeat file = check task age
                created = meta.get('created_at')
                if created:
                    try:
                        created_ts = datetime.fromisoformat(created).timestamp()
                        if now - created_ts > self.HEARTBEAT_TIMEOUT_SECONDS:
                            stuck_tasks.append({
                                'task_id': task_id,
                                'reason': 'no_heartbeat',
                                'last_heartbeat': None,
                                'meta': meta
                            })
                    except:
                        pass
                continue

            try:
                heartbeat = json.loads(heartbeat_file.read_text())
                last_ts = heartbeat.get('ts', 0)

                if now - last_ts > self.HEARTBEAT_TIMEOUT_SECONDS:
                    stuck_tasks.append({
                        'task_id': task_id,
                        'reason': 'stale_heartbeat',
                        'last_heartbeat': last_ts,
                        'last_step': heartbeat.get('step'),
                        'meta': meta
                    })
            except Exception as e:
                stuck_tasks.append({
                    'task_id': task_id,
                    'reason': 'heartbeat_read_error',
                    'error': str(e),
                    'meta': meta
                })

        return stuck_tasks

    def cleanup_orphaned_tasks(self) -> List[str]:
        """
        Clean up tasks where agent died.
        Moves stuck tasks to failed directory.
        Returns list of cleaned task IDs.
        """
        cleaned = []
        stuck_tasks = self.check_heartbeats()

        for task in stuck_tasks:
            task_id = task['task_id']
            task_dir = self.ACTIVE_DIR / task_id

            if not task_dir.exists():
                continue

            # Update meta with failure info
            meta_file = task_dir / "meta.json"
            meta = {}
            if meta_file.exists():
                try:
                    meta = json.loads(meta_file.read_text())
                except:
                    pass

            meta['status'] = 'failed'
            meta['failure_reason'] = task.get('reason', 'unknown')
            meta['failed_at'] = datetime.now().isoformat()
            meta['last_heartbeat'] = task.get('last_heartbeat')

            # Write updated meta
            try:
                with open(meta_file, 'w') as f:
                    json.dump(meta, f, indent=2)
            except:
                pass

            # Move to failed directory
            failed_dest = self.FAILED_DIR / task_id
            try:
                if failed_dest.exists():
                    # Remove old failed version
                    import shutil
                    shutil.rmtree(failed_dest)
                task_dir.rename(failed_dest)
                cleaned.append(task_id)
            except Exception as e:
                print(f"Error moving task {task_id} to failed: {e}")

        return cleaned

    def release_stale_locks(self) -> List[str]:
        """
        Release locks for dead tasks.
        Returns list of released lock IDs.
        """
        released = []
        now = time.time()

        if not self.LOCKS_BASE.exists():
            return released

        for lock_file in self.LOCKS_BASE.glob("user_*.lock"):
            try:
                # Read lock metadata
                meta_file = lock_file.with_suffix('.json')
                if not meta_file.exists():
                    # Old lock without metadata - remove if lock file is old
                    if now - lock_file.stat().st_mtime > self.LOCK_TIMEOUT_SECONDS:
                        lock_file.unlink()
                        released.append(lock_file.name)
                    continue

                meta = json.loads(meta_file.read_text())
                expires_at = meta.get('expires_at', 0)

                if now > expires_at:
                    # Lock expired - remove both files
                    lock_file.unlink()
                    meta_file.unlink()
                    released.append(meta.get('lock_id', lock_file.name))
            except Exception as e:
                print(f"Error checking lock {lock_file}: {e}")

        return released

    def update_capacity(self, released_slots: int = 0) -> bool:
        """
        Update capacity file to reflect released resources.
        Uses file locking for safety.
        """
        if not self.CAPACITY_FILE.exists():
            return False

        try:
            with open(self.CAPACITY_FILE, 'r+') as f:
                fcntl.flock(f, fcntl.LOCK_EX)
                try:
                    capacity = json.load(f)

                    # Update available slots
                    current = capacity.get('slots', {}).get('available', 0)
                    max_slots = capacity.get('slots', {}).get('max', 4)
                    capacity['slots']['available'] = min(current + released_slots, max_slots)

                    # Update timestamp
                    capacity['last_updated'] = datetime.now().isoformat()

                    # Write back
                    f.seek(0)
                    f.truncate()
                    json.dump(capacity, f, indent=2)
                finally:
                    fcntl.flock(f, fcntl.LOCK_UN)
            return True
        except Exception as e:
            print(f"Error updating capacity: {e}")
            return False

    def get_project_queue_status(self) -> Dict[str, Dict]:
        """
        Get queue status per project.
        Returns dict of project -> {pending, running, awaiting_human}
        """
        status = {}

        # Count pending tasks
        pending_dirs = [
            self.QUEUE_BASE / "pending" / "high",
            self.QUEUE_BASE / "pending" / "normal"
        ]

        for pending_dir in pending_dirs:
            if not pending_dir.exists():
                continue
            for task_file in pending_dir.glob("*.json"):
                try:
                    task = json.loads(task_file.read_text())
                    project = task.get('project', 'unknown')
                    if project not in status:
                        status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}
                    status[project]['pending'] += 1
                except:
                    pass

        # Count active tasks
        if self.ACTIVE_DIR.exists():
            for task_dir in self.ACTIVE_DIR.iterdir():
                if not task_dir.is_dir():
                    continue
                meta_file = task_dir / "meta.json"
                if meta_file.exists():
                    try:
                        meta = json.loads(meta_file.read_text())
                        project = meta.get('project', 'unknown')
                        if project not in status:
                            status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}

                        if meta.get('status') == 'awaiting_human':
                            status[project]['awaiting_human'] += 1
                        else:
                            status[project]['running'] += 1
                    except:
                        pass

        # Check cockpit states
        if self.COCKPIT_STATE_DIR.exists():
            for state_file in self.COCKPIT_STATE_DIR.glob("*.json"):
                try:
                    state = json.loads(state_file.read_text())
                    project = state.get('project')
                    if project and state.get('awaiting_response'):
                        if project not in status:
                            status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}
                        status[project]['awaiting_human'] += 1
                except:
                    pass

        return status

    def is_project_blocked(self, project: str) -> Tuple[bool, Optional[str]]:
        """
        Check if a project queue is blocked.
        Returns (is_blocked, reason)
        """
        # Check cockpit state
        cockpit_state = self.COCKPIT_STATE_DIR / f"{project}.json"
        if cockpit_state.exists():
            try:
                state = json.loads(cockpit_state.read_text())
                if state.get('awaiting_response'):
                    return True, "awaiting_human_cockpit"
            except:
                pass

        # Check active tasks for awaiting_human
        if self.ACTIVE_DIR.exists():
            for task_dir in self.ACTIVE_DIR.iterdir():
                if not task_dir.is_dir():
                    continue
                meta_file = task_dir / "meta.json"
                if meta_file.exists():
                    try:
                        meta = json.loads(meta_file.read_text())
                        if meta.get('project') == project and meta.get('status') == 'awaiting_human':
                            return True, f"awaiting_human_task:{task_dir.name}"
                    except:
                        pass

        return False, None

    def set_task_awaiting_human(self, task_id: str, question: str = None) -> bool:
        """
        Mark a task as awaiting human response.
        This blocks the project queue.
        """
        task_dir = self.ACTIVE_DIR / task_id
        meta_file = task_dir / "meta.json"

        if not meta_file.exists():
            return False

        try:
            meta = json.loads(meta_file.read_text())
            meta['status'] = 'awaiting_human'
            meta['awaiting_since'] = time.time()
            if question:
                meta['awaiting_question'] = question

            with open(meta_file, 'w') as f:
                json.dump(meta, f, indent=2)
            return True
        except Exception as e:
            print(f"Error setting task awaiting: {e}")
            return False

    def resume_task(self, task_id: str, answer: str = None) -> bool:
        """
        Resume a task that was awaiting human response.
        """
        task_dir = self.ACTIVE_DIR / task_id
        meta_file = task_dir / "meta.json"

        if not meta_file.exists():
            return False

        try:
            meta = json.loads(meta_file.read_text())
            if meta.get('status') != 'awaiting_human':
                return False

            meta['status'] = 'running'
            meta['resumed_at'] = datetime.now().isoformat()
            if answer:
                meta['human_response'] = answer

            # Update heartbeat
            heartbeat_file = task_dir / "heartbeat.json"
            with open(heartbeat_file, 'w') as f:
                json.dump({'ts': time.time(), 'step': 'Resumed after human response'}, f)

            with open(meta_file, 'w') as f:
                json.dump(meta, f, indent=2)
            return True
        except Exception as e:
            print(f"Error resuming task: {e}")
            return False

    def run_check(self) -> Dict:
        """
        Run a single watchdog check.
        Returns summary of actions taken.
        """
        summary = {
            'timestamp': datetime.now().isoformat(),
            'stuck_tasks': [],
            'cleaned_tasks': [],
            'released_locks': [],
            'project_status': {}
        }

        # Check for stuck tasks
        stuck = self.check_heartbeats()
        summary['stuck_tasks'] = [t['task_id'] for t in stuck]

        # Clean up orphaned tasks
        cleaned = self.cleanup_orphaned_tasks()
        summary['cleaned_tasks'] = cleaned

        # Release stale locks
        released = self.release_stale_locks()
        summary['released_locks'] = released

        # Update capacity if we cleaned anything
        if cleaned or released:
            self.update_capacity(released_slots=len(cleaned))

        # Get project queue status
        summary['project_status'] = self.get_project_queue_status()

        return summary

    def run_loop(self, interval_seconds: int = 60):
        """
        Run continuous watchdog monitoring.
        """
        print(f"Starting Task Watchdog (interval: {interval_seconds}s)")

        def handle_signal(signum, frame):
            print("\nWatchdog shutting down...")
            exit(0)

        signal.signal(signal.SIGINT, handle_signal)
        signal.signal(signal.SIGTERM, handle_signal)

        while True:
            try:
                summary = self.run_check()

                # Log if any actions were taken
                if summary['stuck_tasks'] or summary['cleaned_tasks'] or summary['released_locks']:
                    print(f"[{datetime.now().isoformat()}] Watchdog check:")
                    if summary['stuck_tasks']:
                        print(f" Stuck tasks: {summary['stuck_tasks']}")
                    if summary['cleaned_tasks']:
                        print(f" Cleaned: {summary['cleaned_tasks']}")
                    if summary['released_locks']:
                        print(f" Released locks: {summary['released_locks']}")

                time.sleep(interval_seconds)

            except Exception as e:
                print(f"Watchdog error: {e}")
                time.sleep(interval_seconds)


def main():
    """CLI entry point."""
    import argparse

    parser = argparse.ArgumentParser(description='Task Watchdog')
    parser.add_argument('command', nargs='?', default='check',
                        choices=['check', 'daemon', 'status', 'stuck', 'clean'],
                        help='Command to run')
    parser.add_argument('--interval', type=int, default=60,
                        help='Check interval for daemon mode (seconds)')

    args = parser.parse_args()

    watchdog = TaskWatchdog()

    if args.command == 'check':
        summary = watchdog.run_check()
        print(json.dumps(summary, indent=2))

    elif args.command == 'daemon':
        watchdog.run_loop(interval_seconds=args.interval)

    elif args.command == 'status':
        status = watchdog.get_project_queue_status()
        print("Project Queue Status:")
        print(json.dumps(status, indent=2))

    elif args.command == 'stuck':
        stuck = watchdog.check_heartbeats()
        if stuck:
            print(f"Found {len(stuck)} stuck tasks:")
            for t in stuck:
                print(f" - {t['task_id']}: {t['reason']}")
        else:
            print("No stuck tasks found")

    elif args.command == 'clean':
        cleaned = watchdog.cleanup_orphaned_tasks()
        released = watchdog.release_stale_locks()
        print(f"Cleaned {len(cleaned)} orphaned tasks")
        print(f"Released {len(released)} stale locks")


if __name__ == '__main__':
    main()
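To make the on-disk contract concrete, here is a hypothetical runner-side sketch that writes the meta.json and heartbeat.json files the watchdog above polls. The directory layout and field names (status, project, created_at, ts, step) are the ones task_watchdog.py reads; the runner code itself and the task ID are illustrative, not part of this commit.

```python
# Illustrative sketch (not part of this commit): emit the files task_watchdog.py
# polls. Field names match what check_heartbeats() reads; "task-123" is made up.
import json
import time
from datetime import datetime
from pathlib import Path

task_dir = Path.home() / "conductor" / "active" / "task-123"
task_dir.mkdir(parents=True, exist_ok=True)

# meta.json: status/project/created_at are read when classifying the task
(task_dir / "meta.json").write_text(json.dumps({
    "status": "running",
    "project": "cockpit",
    "created_at": datetime.now().isoformat(),
}, indent=2))

# heartbeat.json: refresh 'ts' more often than HEARTBEAT_TIMEOUT_SECONDS (300s)
(task_dir / "heartbeat.json").write_text(json.dumps({
    "ts": time.time(),
    "step": "running tests",
}))
```

A one-off check can then be run with `python3 lib/task_watchdog.py check`, or continuously with `python3 lib/task_watchdog.py daemon --interval 60`.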