Refactor cockpit to use DockerTmuxController pattern

Based on claude-code-tools TmuxCLIController, this refactor: - Added DockerTmuxController class for robust tmux session management - Implements send_keys() with configurable delay_enter - Implements capture_pane() for output retrieval - Implements wait_for_prompt() for pattern-based completion detection - Implements wait_for_idle() for content-hash-based idle detection - Implements wait_for_shell_prompt() for shell prompt detection Also includes workflow improvements: - Pre-task git snapshot before agent execution - Post-task commit protocol in agent guidelines Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00
commit ec33ac1936
265 changed files with 92011 additions and 0 deletions
--- a/lib/task_completion.py
+++ b/lib/task_completion.py
@@ -0,0 +1,458 @@
+#!/usr/bin/env python3
+"""
+Task Completion Callback - Notify queue when task completes
+
+Called by agents when they finish to:
+1. Release per-user lock
+2. Update capacity counters
+3. Move conductor files to completed/failed
+4. Unblock project queue if it was awaiting_human
+
+Usage:
+    # From agent code:
+    from task_completion import complete_task, fail_task
+
+    complete_task(task_id, result_data)
+    fail_task(task_id, error_message)
+
+    # CLI:
+    python3 task_completion.py complete <task_id> [result]
+    python3 task_completion.py fail <task_id> <error>
+"""
+
+import json
+import os
+import fcntl
+import shutil
+from pathlib import Path
+from datetime import datetime
+from typing import Dict, Optional
+
+
+class TaskCompletion:
+    """Handle task completion callbacks (complete, fail, await human, resume)."""
+
+    CONDUCTOR_BASE = Path.home() / "conductor"  # conductor root under the home dir
+    ACTIVE_DIR = CONDUCTOR_BASE / "active"  # <task_id>/meta.json for in-flight tasks
+    COMPLETED_DIR = CONDUCTOR_BASE / "completed"  # destination used by complete_task()
+    FAILED_DIR = CONDUCTOR_BASE / "failed"  # destination used by fail_task()
+
+    QUEUE_BASE = Path("/var/lib/luzia/queue")
+    LOCKS_BASE = Path("/var/lib/luzia/locks")  # holds user_<user>.lock / user_<user>.json
+    CAPACITY_FILE = QUEUE_BASE / "capacity.json"  # JSON with slots.available / slots.max
+
+    COCKPIT_STATE_DIR = Path("/var/lib/luz-orchestrator/cockpits")  # <project>.json files
+
+    def __init__(self):
+        """Initialize completion handler; creates the completed/failed dirs as a side effect."""
+        self._ensure_dirs()
+
+    def _ensure_dirs(self):
+        """Create the completed/ and failed/ directories if they are missing."""
+        self.COMPLETED_DIR.mkdir(parents=True, exist_ok=True)
+        self.FAILED_DIR.mkdir(parents=True, exist_ok=True)
+
+    def complete_task(
+        self, task_id: str, result: Optional[Dict] = None, summary: Optional[str] = None
+    ) -> Dict:
+        """
+        Mark task as completed successfully.
+
+        Updates meta.json, releases the user lock, frees a capacity slot and
+        moves the task directory from active/ to completed/.
+
+        Args:
+            task_id: The task ID; must be a single directory name
+            result: Optional result data (stored only when truthy)
+            summary: Optional summary of what was accomplished
+
+        Returns:
+            Status dict; errors are reported as {'success': False, 'error': ...}
+        """
+        # An empty id resolves to ACTIVE_DIR itself and '..' or a path escapes
+        # it; either would let the rmtree/move below hit unrelated directories.
+        if task_id in ('', '.', '..') or '/' in task_id or os.sep in task_id:
+            return {'success': False, 'error': f'Invalid task id: {task_id!r}'}
+
+        task_dir = self.ACTIVE_DIR / task_id
+        if not task_dir.exists():
+            return {'success': False, 'error': f'Task {task_id} not found in active'}
+
+        try:
+            # Load and update meta
+            meta_file = task_dir / "meta.json"
+            meta = {}
+            if meta_file.exists():
+                meta = json.loads(meta_file.read_text())
+
+            meta['status'] = 'completed'
+            meta['completed_at'] = datetime.now().isoformat()
+            if result:
+                meta['result'] = result
+            if summary:
+                meta['summary'] = summary
+
+            # Duration is best-effort; skip it when created_at is unusable
+            if 'created_at' in meta:
+                try:
+                    start = datetime.fromisoformat(meta['created_at'])
+                    meta['duration_seconds'] = (datetime.now() - start).total_seconds()
+                except (TypeError, ValueError):
+                    pass
+
+            # Write updated meta
+            with open(meta_file, 'w') as f:
+                json.dump(meta, f, indent=2)
+
+            # Release user lock
+            user = meta.get('user') or meta.get('enqueued_by')
+            lock_id = meta.get('lock_id')
+            if user and lock_id:
+                self._release_lock(user, lock_id)
+
+            # Update capacity
+            self._increment_capacity()
+
+            # Move to completed
+            dest = self.COMPLETED_DIR / task_id
+            if dest.exists():
+                shutil.rmtree(dest)
+            shutil.move(str(task_dir), str(dest))
+
+            return {'success': True, 'task_id': task_id, 'status': 'completed',
+                    'completed_at': meta['completed_at']}
+
+        except Exception as e:
+            return {'success': False, 'error': str(e)}
+
+    def fail_task(
+        self, task_id: str, error: str, exit_code: int = 1, recoverable: bool = True
+    ) -> Dict:
+        """
+        Mark task as failed.
+
+        Records the failure in meta.json, releases the user lock, frees a
+        capacity slot and moves the task directory from active/ to failed/.
+
+        Args:
+            task_id: The task ID; must be a single directory name
+            error: Error message
+            exit_code: Process exit code
+            recoverable: Whether task can be retried (recorded only; no retry
+                is scheduled here)
+
+        Returns:
+            Status dict; errors are reported as {'success': False, 'error': ...}
+        """
+        # An empty id resolves to ACTIVE_DIR itself and '..' or a path escapes
+        # it; either would let the rmtree/move below hit unrelated directories.
+        if task_id in ('', '.', '..') or '/' in task_id or os.sep in task_id:
+            return {'success': False, 'error': f'Invalid task id: {task_id!r}'}
+
+        task_dir = self.ACTIVE_DIR / task_id
+        if not task_dir.exists():
+            return {'success': False, 'error': f'Task {task_id} not found in active'}
+
+        try:
+            # Load and update meta
+            meta_file = task_dir / "meta.json"
+            meta = {}
+            if meta_file.exists():
+                meta = json.loads(meta_file.read_text())
+
+            meta['status'] = 'failed'
+            meta['failed_at'] = datetime.now().isoformat()
+            meta['error'] = error
+            meta['exit_code'] = exit_code
+            meta['recoverable'] = recoverable
+
+            # Make sure a retry counter exists. It is only initialised here,
+            # never incremented.
+            meta.setdefault('retry_count', 0)
+
+            # Write updated meta
+            with open(meta_file, 'w') as f:
+                json.dump(meta, f, indent=2)
+
+            # Release user lock
+            user = meta.get('user') or meta.get('enqueued_by')
+            lock_id = meta.get('lock_id')
+            if user and lock_id:
+                self._release_lock(user, lock_id)
+
+            # Update capacity
+            self._increment_capacity()
+
+            # Move to failed
+            dest = self.FAILED_DIR / task_id
+            if dest.exists():
+                shutil.rmtree(dest)
+            shutil.move(str(task_dir), str(dest))
+
+            return {'success': True, 'task_id': task_id, 'status': 'failed',
+                    'failed_at': meta['failed_at'], 'recoverable': recoverable}
+
+        except Exception as e:
+            return {'success': False, 'error': str(e)}
+
+    def set_awaiting_human(
+        self, task_id: str, question: str, project: Optional[str] = None
+    ) -> Dict:
+        """
+        Mark task as awaiting human response.
+
+        Sets status 'awaiting_human' in meta.json, flags the project's cockpit state
+        and, best-effort, forwards the question via telegram_bridge.ask_bruno.
+
+        Args:
+            task_id: The task ID; must be a single directory name
+            question: The question for the human
+            project: Optional project name (defaults to meta['project'])
+
+        Returns:
+            Status dict; 'telegram_request_id' stays None when the bridge
+            could not be called
+        """
+        # An empty id resolves to ACTIVE_DIR itself and '..' or a path escapes
+        # it, so meta.json would be written outside the task's own directory.
+        if task_id in ('', '.', '..') or '/' in task_id or os.sep in task_id:
+            return {'success': False, 'error': f'Invalid task id: {task_id!r}'}
+
+        task_dir = self.ACTIVE_DIR / task_id
+        if not task_dir.exists():
+            return {'success': False, 'error': f'Task {task_id} not found'}
+
+        try:
+            # Update task meta
+            meta_file = task_dir / "meta.json"
+            meta = {}
+            if meta_file.exists():
+                meta = json.loads(meta_file.read_text())
+
+            meta['status'] = 'awaiting_human'
+            meta['awaiting_since'] = datetime.now().isoformat()
+            meta['awaiting_question'] = question
+
+            with open(meta_file, 'w') as f:
+                json.dump(meta, f, indent=2)
+
+            # If project specified, also update cockpit state
+            project = project or meta.get('project')
+            if project:
+                self._update_cockpit_awaiting(project, question)
+
+            # Send question to Bruno via Telegram
+            telegram_request_id = None
+            try:
+                from telegram_bridge import ask_bruno
+                context = f"Task: {task_id}\nProject: {project or 'unknown'}"
+                telegram_request_id, sent = ask_bruno(
+                    question=question, project=project or "luzia", context=context)
+                if sent:
+                    meta['telegram_request_id'] = telegram_request_id
+                    with open(meta_file, 'w') as f:
+                        json.dump(meta, f, indent=2)
+            except Exception as e:
+                # Telegram is optional: record the problem but keep going.
+                import logging
+                logging.getLogger(__name__).warning(
+                    "Telegram notification failed for task %s: %s", task_id, e)
+
+            return {
+                'success': True, 'task_id': task_id, 'status': 'awaiting_human',
+                'question': question, 'telegram_request_id': telegram_request_id
+            }
+
+        except Exception as e:
+            return {'success': False, 'error': str(e)}
+
+    def resume_from_human(
+        self, task_id: str, answer: str, project: Optional[str] = None
+    ) -> Dict:
+        """
+        Resume task after human provides answer.
+
+        Sets status back to 'running', stores the answer in meta.json and
+        clears the awaiting flag in the project's cockpit state file.
+
+        Args:
+            task_id: The task ID; must be a single directory name
+            answer: Human's response
+            project: Optional project name (defaults to meta['project'])
+
+        Returns:
+            Status dict; errors are reported as {'success': False, 'error': ...}
+        """
+        # An empty id resolves to ACTIVE_DIR itself and '..' or a path escapes
+        # it, so meta.json would be written outside the task's own directory.
+        if task_id in ('', '.', '..') or '/' in task_id or os.sep in task_id:
+            return {'success': False, 'error': f'Invalid task id: {task_id!r}'}
+
+        task_dir = self.ACTIVE_DIR / task_id
+        if not task_dir.exists():
+            return {'success': False, 'error': f'Task {task_id} not found'}
+
+        try:
+            # Update task meta
+            meta_file = task_dir / "meta.json"
+            meta = {}
+            if meta_file.exists():
+                meta = json.loads(meta_file.read_text())
+
+            meta['status'] = 'running'
+            meta['resumed_at'] = datetime.now().isoformat()
+            meta['human_answer'] = answer
+
+            with open(meta_file, 'w') as f:
+                json.dump(meta, f, indent=2)
+
+            # Clear cockpit awaiting state
+            project = project or meta.get('project')
+            if project:
+                self._clear_cockpit_awaiting(project)
+
+            return {'success': True, 'task_id': task_id, 'status': 'running',
+                    'resumed_at': meta['resumed_at']}
+
+        except Exception as e:
+            return {'success': False, 'error': str(e)}
+
+    def _release_lock(self, user: str, lock_id: str) -> bool:
+        """Release a per-user lock; return False on id mismatch or error.
+
+        If user_<user>.json exists its 'lock_id' must equal lock_id, otherwise
+        nothing is removed. Missing files are not an error.
+        """
+        lock_file = self.LOCKS_BASE / f"user_{user}.lock"
+        meta_file = self.LOCKS_BASE / f"user_{user}.json"
+        try:
+            # Verify lock ID matches
+            if meta_file.exists():
+                meta = json.loads(meta_file.read_text())
+                if meta.get('lock_id') != lock_id:
+                    return False
+            # Remove lock files
+            for path in (lock_file, meta_file):
+                if path.exists():
+                    path.unlink()
+            return True
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit through
+            return False
+
+    def _increment_capacity(self) -> bool:
+        """Increment available slots in CAPACITY_FILE (capped at max); True on success."""
+        if not self.CAPACITY_FILE.exists():
+            return False
+
+        try:
+            with open(self.CAPACITY_FILE, 'r+') as f:
+                fcntl.flock(f, fcntl.LOCK_EX)  # exclusive lock for the read-modify-write
+                try:
+                    capacity = json.load(f)
+                    # A file without 'slots' gets one instead of failing with KeyError.
+                    slots = capacity.setdefault('slots', {})
+                    slots['available'] = min(slots.get('available', 0) + 1, slots.get('max', 4))
+                    capacity['last_updated'] = datetime.now().isoformat()
+
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(capacity, f, indent=2)
+                finally:
+                    fcntl.flock(f, fcntl.LOCK_UN)
+            return True
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit through
+            return False
+
+    def _update_cockpit_awaiting(self, project: str, question: str) -> None:
+        """Flag <project>.json in COCKPIT_STATE_DIR as awaiting a human answer.
+
+        Best-effort: failures are swallowed so the caller is never interrupted.
+        """
+        state_file = self.COCKPIT_STATE_DIR / f"{project}.json"
+        try:
+            state = {}
+            if state_file.exists():
+                state = json.loads(state_file.read_text())
+
+            state.update(awaiting_response=True, last_question=question,
+                         awaiting_since=datetime.now().isoformat())
+            with open(state_file, 'w') as f:
+                json.dump(state, f, indent=2)
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit through
+            pass
+
+    def _clear_cockpit_awaiting(self, project: str) -> None:
+        """Reset the awaiting flag and last question in <project>.json.
+
+        Does nothing when the state file is missing; failures are swallowed.
+        """
+        state_file = self.COCKPIT_STATE_DIR / f"{project}.json"
+        try:
+            if not state_file.exists():
+                return
+            state = json.loads(state_file.read_text())
+            state.update(awaiting_response=False, last_question=None)
+
+            with open(state_file, 'w') as f:
+                json.dump(state, f, indent=2)
+        except Exception:  # not bare: let KeyboardInterrupt/SystemExit through
+            pass
+
+
+# Module-level singleton backing the convenience functions for direct import
+_handler = None
+
+def _get_handler():
+    """Return the shared TaskCompletion instance, creating it on first use."""
+    global _handler
+    _handler = _handler if _handler is not None else TaskCompletion()
+    return _handler
+
+def complete_task(task_id: str, result: Dict = None, summary: str = None) -> Dict:
+    """Module-level shortcut for TaskCompletion.complete_task on the shared handler."""
+    return _get_handler().complete_task(task_id, result=result, summary=summary)
+
+def fail_task(task_id: str, error: str, exit_code: int = 1, recoverable: bool = True) -> Dict:
+    """Module-level shortcut for TaskCompletion.fail_task on the shared handler."""
+    return _get_handler().fail_task(task_id, error, exit_code=exit_code, recoverable=recoverable)
+
+def set_awaiting_human(task_id: str, question: str, project: str = None) -> Dict:
+    """Module-level shortcut for TaskCompletion.set_awaiting_human on the shared handler."""
+    return _get_handler().set_awaiting_human(task_id, question, project=project)
+
+def resume_from_human(task_id: str, answer: str, project: str = None) -> Dict:
+    """Module-level shortcut for TaskCompletion.resume_from_human on the shared handler."""
+    return _get_handler().resume_from_human(task_id, answer, project=project)
+
+
+def main():
+    """Command-line interface: run one completion command and print its result as JSON."""
+    import argparse
+
+    cli = argparse.ArgumentParser(description='Task Completion Callback')
+    cli.add_argument(
+        'command', choices=['complete', 'fail', 'await', 'resume'], help='Command to run')
+    cli.add_argument('task_id', help='Task ID')
+    cli.add_argument(
+        'message', nargs='?', default='', help='Result/error/question/answer')
+    cli.add_argument('--project', help='Project name')
+    cli.add_argument('--exit-code', type=int, default=1, help='Exit code for failures')
+    opts = cli.parse_args()
+
+    handler = TaskCompletion()
+
+    # Each command maps to one handler call; argparse's `choices` guarantees
+    # that opts.command is always one of these keys.
+    dispatch = {
+        'complete': lambda: handler.complete_task(opts.task_id, summary=opts.message),
+        'fail': lambda: handler.fail_task(opts.task_id, opts.message, opts.exit_code),
+        'await': lambda: handler.set_awaiting_human(opts.task_id, opts.message, opts.project),
+        'resume': lambda: handler.resume_from_human(opts.task_id, opts.message, opts.project),
+    }
+    outcome = dispatch[opts.command]()
+
+    print(json.dumps(outcome, indent=2))
+
+
+if __name__ == '__main__':
+    main()