Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Adds a DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:

- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
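The diff below contains only lib/responsive_dispatcher.py; the DockerTmuxController named above is not shown in this commit. For orientation, here is a minimal illustrative sketch of the interface the message describes. Only the method names and the delay_enter option come from the commit message; the constructor, signatures, and docker/tmux plumbing are assumptions.

# Illustrative sketch only: not the class from this commit. Signatures and
# implementation details are assumed; only the method names and delay_enter
# are taken from the commit message.
import hashlib
import re
import subprocess
import time


class DockerTmuxController:
    def __init__(self, container: str, session: str = "main"):
        self.container = container
        self.session = session

    def _tmux(self, *args: str) -> str:
        # Run a tmux command inside the container via docker exec.
        cmd = ["docker", "exec", self.container, "tmux", *args]
        return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

    def send_keys(self, keys: str, delay_enter: float = 0.0) -> None:
        # Type keys into the session; optionally pause before pressing Enter.
        self._tmux("send-keys", "-t", self.session, keys)
        if delay_enter:
            time.sleep(delay_enter)
        self._tmux("send-keys", "-t", self.session, "Enter")

    def capture_pane(self) -> str:
        # Return the current pane contents.
        return self._tmux("capture-pane", "-p", "-t", self.session)

    def wait_for_prompt(self, pattern: str, timeout: float = 60.0) -> bool:
        # Poll the pane until a regex pattern appears (completion detection).
        deadline = time.time() + timeout
        while time.time() < deadline:
            if re.search(pattern, self.capture_pane()):
                return True
            time.sleep(0.5)
        return False

    def wait_for_idle(self, idle_seconds: float = 3.0, timeout: float = 300.0) -> bool:
        # Consider the pane idle once its content hash stops changing.
        deadline = time.time() + timeout
        last_hash, stable_since = None, time.time()
        while time.time() < deadline:
            h = hashlib.sha256(self.capture_pane().encode()).hexdigest()
            if h != last_hash:
                last_hash, stable_since = h, time.time()
            elif time.time() - stable_since >= idle_seconds:
                return True
            time.sleep(0.5)
        return False

    def wait_for_shell_prompt(self, timeout: float = 60.0) -> bool:
        # Detect a typical shell prompt ($ or #) at the end of the pane.
        return self.wait_for_prompt(r"[$#]\s*$", timeout)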
lib/responsive_dispatcher.py (Normal file, 346 lines added)
@@ -0,0 +1,346 @@
#!/usr/bin/env python3
"""
Responsive Dispatcher - Non-blocking Task Dispatch with Live Status Updates

Implements:
- Immediate job_id return on task dispatch
- Background job status monitoring without blocking
- Live progress feedback system
- Concurrent task management
- Status caching for fast retrieval

Key Features:
1. Dispatch returns immediately with job_id
2. Background monitor updates job status files
3. CLI can poll status without blocking
4. Multiple concurrent tasks tracked independently
5. Status persisted to disk for resilience
"""

import json
import os
import subprocess
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Tuple, Any
import threading
import queue


class ResponsiveDispatcher:
    """Non-blocking task dispatcher with background monitoring"""

    def __init__(self, jobs_dir: Path = None):
        self.jobs_dir = jobs_dir or Path("/var/lib/luzia/jobs")
        self.jobs_dir.mkdir(parents=True, exist_ok=True)
        self.monitoring_queue = queue.Queue()
        self.status_cache = {}  # Local cache for fast retrieval
        self.cache_update_time = {}  # Track cache freshness

    def dispatch_task(self, project: str, task: str, priority: int = 5) -> Tuple[str, Dict]:
        """
        Dispatch a task immediately, returning job_id and initial status.

        Returns:
            (job_id, status_dict)
        """
        job_id = datetime.now().strftime("%H%M%S") + "-" + hex(hash(task) & 0xffff)[2:]
        job_dir = self.jobs_dir / job_id

        # Create job directory atomically
        job_dir.mkdir(parents=True, exist_ok=True)

        # Write initial status
        initial_status = {
            "id": job_id,
            "project": project,
            "task": task[:200],  # Truncate long tasks
            "status": "dispatched",
            "priority": priority,
            "dispatched_at": datetime.now().isoformat(),
            "progress": 0,
            "message": "Task queued for execution",
        }

        status_file = job_dir / "status.json"
        self._write_status(status_file, initial_status)

        # Queue for background monitoring
        self.monitoring_queue.put({
            "job_id": job_id,
            "project": project,
            "task": task,
            "job_dir": str(job_dir),
            "priority": priority,
        })

        # Update local cache
        self.status_cache[job_id] = initial_status
        self.cache_update_time[job_id] = time.time()

        return job_id, initial_status

    def get_status(self, job_id: str, use_cache: bool = True) -> Optional[Dict]:
        """
        Get current status of a job.

        Args:
            job_id: Job ID to query
            use_cache: Use cached status if fresh (< 1 second old)

        Returns:
            Status dict or None if job not found
        """
        # Check cache first
        if use_cache and job_id in self.status_cache:
            cache_age = time.time() - self.cache_update_time.get(job_id, 0)
            if cache_age < 1.0:  # Cache valid for 1 second
                return self.status_cache[job_id]

        # Read from disk
        status_file = self.jobs_dir / job_id / "status.json"
        if not status_file.exists():
            return None

        try:
            status = json.loads(status_file.read_text())
            self.status_cache[job_id] = status
            self.cache_update_time[job_id] = time.time()
            return status
        except (json.JSONDecodeError, IOError):
            return None

    def update_status(
        self,
        job_id: str,
        status: str,
        progress: int = None,
        message: str = None,
        exit_code: int = None,
    ) -> bool:
        """
        Update job status. Used by background monitor.

        Returns:
            True if update successful, False otherwise
        """
        status_file = self.jobs_dir / job_id / "status.json"
        if not status_file.exists():
            return False

        try:
            current = json.loads(status_file.read_text())
        except (json.JSONDecodeError, IOError):
            return False

        # Update fields
        current["status"] = status
        if progress is not None:
            current["progress"] = min(100, max(0, progress))
        if message:
            current["message"] = message
        if exit_code is not None:
            current["exit_code"] = exit_code
        current["updated_at"] = datetime.now().isoformat()

        self._write_status(status_file, current)

        # Update cache
        self.status_cache[job_id] = current
        self.cache_update_time[job_id] = time.time()

        return True

    def list_jobs(self, project: str = None, status_filter: str = None) -> list:
        """
        List jobs, optionally filtered by project and status.

        Returns:
            List of job status dicts
        """
        jobs = []
        for job_dir in sorted(self.jobs_dir.iterdir(), reverse=True):
            if not job_dir.is_dir():
                continue

            status = self.get_status(job_dir.name)
            if not status:
                continue

            # Apply filters
            if project and status.get("project") != project:
                continue
            if status_filter and status.get("status") != status_filter:
                continue

            jobs.append(status)

        return jobs[:50]  # Return last 50 jobs

    def wait_for_job(self, job_id: str, timeout: int = None, poll_interval: float = 0.5):
        """
        Wait for job to complete (blocking).
        Useful for critical operations that need synchronization.

        Args:
            job_id: Job ID to wait for
            timeout: Max seconds to wait (None = wait forever)
            poll_interval: Seconds between status checks

        Returns:
            Final status dict or None if timeout
        """
        start_time = time.time()
        while True:
            status = self.get_status(job_id, use_cache=False)
            if not status:
                return None

            if status.get("status") in ["completed", "failed", "killed"]:
                return status

            if timeout:
                elapsed = time.time() - start_time
                if elapsed > timeout:
                    return None

            time.sleep(poll_interval)

    def stream_status(self, job_id: str, interval: float = 0.5) -> None:
        """
        Stream status updates to stdout without blocking the main loop.
        Useful for long-running tasks.

        Args:
            job_id: Job ID to stream
            interval: Seconds between updates
        """
        last_msg = None
        while True:
            status = self.get_status(job_id, use_cache=False)
            if not status:
                print(f"Job {job_id} not found")
                return

            # Print new messages
            msg = status.get("message")
            if msg and msg != last_msg:
                progress = status.get("progress", 0)
                print(f"  [{progress}%] {msg}")
                last_msg = msg

            # Check if done
            if status.get("status") in ["completed", "failed", "killed"]:
                exit_code = status.get("exit_code", -1)
                print(f"  [100%] {status['status'].title()} (exit {exit_code})")
                return

            time.sleep(interval)

    def start_background_monitor(self) -> threading.Thread:
        """
        Start background monitor thread that processes queued jobs.

        Returns:
            Monitor thread (started, daemon=True)
        """
        monitor = threading.Thread(target=self._monitor_loop, daemon=True)
        monitor.start()
        return monitor

    def _monitor_loop(self):
        """Background monitor loop - processes jobs from queue"""
        while True:
            try:
                # Get next job from queue with short timeout
                job_info = self.monitoring_queue.get(timeout=1.0)
                self._monitor_job(job_info)
            except queue.Empty:
                continue
            except Exception as e:
                print(f"[Monitor error] {e}", flush=True)

    def _monitor_job(self, job_info: Dict):
        """Monitor a single job's execution"""
        job_id = job_info["job_id"]
        project = job_info["project"]
        job_dir = Path(job_info["job_dir"])

        # Update status: starting
        self.update_status(job_id, "starting", progress=5, message="Agent initialization")

        # Wait for agent to start (check for meta.json or output)
        max_wait = 30
        for _ in range(max_wait):
            output_file = job_dir / "output.log"
            meta_file = job_dir / "meta.json"

            if output_file.exists() or meta_file.exists():
                self.update_status(job_id, "running", progress=10, message="Agent running")
                break

            time.sleep(0.5)
        else:
            # Timeout waiting for agent to start
            self.update_status(job_id, "failed", progress=0, message="Agent failed to start")
            return

        # Monitor execution
        output_file = job_dir / "output.log"
        last_size = 0
        stalled_count = 0

        while True:
            # Check if completed
            if output_file.exists():
                content = output_file.read_text()
                if "exit:" in content:
                    # Parse exit code
                    lines = content.strip().split("\n")
                    for line in reversed(lines):
                        if line.startswith("exit:"):
                            exit_code = int(line.split(":")[1])
                            status = "completed" if exit_code == 0 else "failed"
                            self.update_status(
                                job_id,
                                status,
                                progress=100,
                                message=f"Agent {status}",
                                exit_code=exit_code,
                            )
                            return

                # Update progress based on output size
                current_size = len(content)
                if current_size > last_size:
                    progress = min(95, 10 + (current_size // 1000))  # Rough progress indicator
                    self.update_status(job_id, "running", progress=progress)
                    last_size = current_size
                    stalled_count = 0
                else:
                    stalled_count += 1
                    if stalled_count > 30:  # 30 * 1 second = 30 seconds with no output
                        self.update_status(
                            job_id, "stalled", progress=50, message="No output for 30 seconds"
                        )

            time.sleep(1.0)

    @staticmethod
    def _write_status(path: Path, data: Dict) -> None:
        """Write status atomically"""
        tmp_path = path.with_suffix(".json.tmp")
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
        os.rename(tmp_path, path)


# Helper function for quick dispatch
def quick_dispatch(job_id: str, project: str, task: str) -> Dict:
    """Quick dispatch helper - returns status dict with job_id"""
    dispatcher = ResponsiveDispatcher()
    _, status = dispatcher.dispatch_task(project, task)
    return status
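A minimal usage sketch of the dispatcher above. The import path is assumed from the file location (lib/ importable as a package), and the project and task strings are placeholders.

# Usage sketch: project/task values and import path are illustrative.
from lib.responsive_dispatcher import ResponsiveDispatcher

dispatcher = ResponsiveDispatcher()
dispatcher.start_background_monitor()

# Dispatch returns immediately with a job_id and the initial status record.
job_id, status = dispatcher.dispatch_task("cockpit", "refactor tmux controller", priority=3)
print(job_id, status["status"])  # e.g. "142233-1a2b dispatched"

# Either poll without blocking ...
print(dispatcher.get_status(job_id))

# ... or stream progress to stdout and block until a terminal state.
dispatcher.stream_status(job_id)
final = dispatcher.wait_for_job(job_id, timeout=600)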