Refactor cockpit to use DockerTmuxController pattern

Based on claude-code-tools TmuxCLIController, this refactor: - Added DockerTmuxController class for robust tmux session management - Implements send_keys() with configurable delay_enter - Implements capture_pane() for output retrieval - Implements wait_for_prompt() for pattern-based completion detection - Implements wait_for_idle() for content-hash-based idle detection - Implements wait_for_shell_prompt() for shell prompt detection Also includes workflow improvements: - Pre-task git snapshot before agent execution - Post-task commit protocol in agent guidelines Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00
commit ec33ac1936
265 changed files with 92011 additions and 0 deletions
--- a/lib/job_recovery.py
+++ b/lib/job_recovery.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Job recovery and restart system.
+Handles resumption of incomplete jobs with session continuation.
+"""
+
+import json
+import uuid
+import subprocess
+import os
+from pathlib import Path
+from datetime import datetime
+
+JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
+
+def get_claude_session_id(job_id):
+    """Get existing session ID or create new one."""
+    meta_file = JOBS_DIR / job_id / "meta.json"
+    
+    try:
+        with open(meta_file) as f:
+            meta = json.load(f)
+        
+        session_id = meta.get('claude_session_id')
+        if session_id:
+            return session_id, False  # Existing session
+        
+        # Create new session ID
+        session_id = f"sess_{uuid.uuid4().hex[:12]}"
+        meta['claude_session_id'] = session_id
+        meta['claude_session_created'] = datetime.now().isoformat()
+        
+        with open(meta_file, 'w') as f:
+            json.dump(meta, f, indent=2)
+        
+        return session_id, True  # New session
+    except Exception as e:
+        print(f"Error managing session ID: {e}")
+        return None, False
+
+def create_recovery_prompt(original_prompt):
+    """Add recovery prefix to original prompt."""
+    recovery_prefix = """RECOVERY MODE: Continue from where you left off
+
+IMPORTANT: Before resuming work, do ALL of the following:
+1. Check what has been implemented so far
+   - Look at git status in the project directory
+   - Check /home/<project>/ for any partial work
+   - Review any existing output or reports
+   - Check the conductor directory for progress markers
+   
+2. Verify all artifacts from previous session
+   - List files created/modified since dispatch
+   - Check timestamps to understand what succeeded
+   - Review any logs or error messages
+   
+3. Determine current state
+   - Is implementation complete?
+   - Where exactly did work stop?
+   - What's the next logical step?
+   
+4. If session was interrupted by system overload:
+   - Do NOT retry identical operations
+   - Check for partial results first
+   - Build incrementally on what exists
+   - Report progress immediately
+
+ORIGINAL TASK:
+================================================================================
+""" + original_prompt + """
+================================================================================
+
+RESUME: Begin by following steps 1-4 above, then continue the work.
+"""
+    return recovery_prefix
+
+def prepare_restart(job_id, use_session_continuation=True):
+    """Prepare a job for restart."""
+    job_dir = JOBS_DIR / job_id
+    meta_file = job_dir / "meta.json"
+    prompt_file = job_dir / "prompt.txt"
+    
+    if not meta_file.exists() or not prompt_file.exists():
+        return None, "Missing job files"
+    
+    try:
+        # Load original metadata and prompt
+        with open(meta_file) as f:
+            original_meta = json.load(f)
+        
+        with open(prompt_file) as f:
+            original_prompt = f.read()
+        
+        # Get session ID
+        session_id, is_new = get_claude_session_id(job_id)
+        
+        # Create recovery prompt
+        recovery_prompt = create_recovery_prompt(original_prompt)
+        
+        # Create recovery metadata
+        recovery_meta = original_meta.copy()
+        recovery_meta['status'] = 'running'
+        recovery_meta['recovery_attempt'] = recovery_meta.get('recovery_attempt', 0) + 1
+        recovery_meta['recovery_started'] = datetime.now().isoformat()
+        recovery_meta['claude_session_id'] = session_id
+        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')
+        
+        # Backup original output
+        output_file = job_dir / "output.log"
+        if output_file.exists() and output_file.stat().st_size > 0:
+            backup_file = job_dir / f"output.previous.attempt{recovery_meta['recovery_attempt'] - 1}.log"
+            backup_file.write_bytes(output_file.read_bytes())
+            output_file.write_text("")  # Clear for new attempt
+        
+        # Save recovery metadata
+        with open(meta_file, 'w') as f:
+            json.dump(recovery_meta, f, indent=2)
+        
+        # Save recovery prompt
+        with open(prompt_file, 'w') as f:
+            f.write(recovery_prompt)
+        
+        return {
+            'job_id': job_id,
+            'session_id': session_id,
+            'is_new_session': is_new,
+            'recovery_attempt': recovery_meta['recovery_attempt'],
+            'project': original_meta.get('project'),
+            'ready_to_restart': True,
+        }, None
+    except Exception as e:
+        return None, str(e)
+
+def restart_job(job_id, use_session_continuation=True):
+    """Restart a job with optional session continuation."""
+    job_dir = JOBS_DIR / job_id
+    meta_file = job_dir / "meta.json"
+    
+    # Prepare recovery
+    prep_result, error = prepare_restart(job_id, use_session_continuation)
+    if error:
+        return {'error': error, 'job_id': job_id}
+    
+    session_id = prep_result['session_id']
+    
+    # Build command
+    cmd_parts = [
+        'bash',
+        str(job_dir / "run.sh"),
+    ]
+    
+    # If we have a session ID and continuation is enabled, use -c flag
+    if use_session_continuation and session_id:
+        # Prepend session continuation to script
+        run_script = job_dir / "run.sh"
+        original_script = run_script.read_text()
+        
+        # Inject session ID into the claude command
+        # This would need to be handled by the CLI wrapper
+        # For now, we'll pass it as environment variable
+        os.environ['CLAUDE_SESSION_ID'] = session_id
+        os.environ['CLAUDE_RECOVERY_MODE'] = '1'
+    
+    # Launch restart
+    try:
+        proc = subprocess.Popen(
+            cmd_parts,
+            cwd=str(job_dir.parent),
+            env={**os.environ, 'CLAUDE_SESSION_ID': session_id}
+        )
+        
+        with open(meta_file) as f:
+            meta = json.load(f)
+        
+        meta['recovery_pid'] = proc.pid
+        meta['recovery_restart_timestamp'] = datetime.now().isoformat()
+        
+        with open(meta_file, 'w') as f:
+            json.dump(meta, f, indent=2)
+        
+        return {
+            'job_id': job_id,
+            'session_id': session_id,
+            'recovery_attempt': prep_result['recovery_attempt'],
+            'pid': proc.pid,
+            'status': 'restarted',
+        }
+    except Exception as e:
+        return {
+            'error': str(e),
+            'job_id': job_id,
+            'session_id': session_id,
+        }
+
+if __name__ == "__main__":
+    import sys
+    
+    if len(sys.argv) < 2:
+        print("Usage: job_recovery.py <job_id> [--restart]")
+        sys.exit(1)
+    
+    job_id = sys.argv[1]
+    do_restart = "--restart" in sys.argv
+    
+    if do_restart:
+        result = restart_job(job_id)
+    else:
+        result, error = prepare_restart(job_id)
+    
+    print(json.dumps(result, indent=2))