Refactor cockpit to use DockerTmuxController pattern

Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
admin
2026-01-14 10:42:16 -03:00
commit ec33ac1936
265 changed files with 92011 additions and 0 deletions

458
lib/task_completion.py Normal file
View File

@@ -0,0 +1,458 @@
#!/usr/bin/env python3
"""
Task Completion Callback - Notify queue when task completes
Called by agents when they finish to:
1. Release per-user lock
2. Update capacity counters
3. Move conductor files to completed/failed
4. Unblock project queue if it was awaiting_human
Usage:
# From agent code:
from task_completion import complete_task, fail_task
complete_task(task_id, result_data)
fail_task(task_id, error_message)
# CLI:
python3 task_completion.py complete <task_id> [result]
python3 task_completion.py fail <task_id> <error>
"""
import fcntl
import json
import logging
import os
import shutil
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional
class TaskCompletion:
    """Handle task completion callbacks.

    Each task is a directory under ``ACTIVE_DIR`` holding a ``meta.json``
    file. The public methods update that metadata and, for the terminal
    states (completed / failed), release the per-user lock, hand a capacity
    slot back and move the directory to ``COMPLETED_DIR`` or ``FAILED_DIR``.

    Public methods report problems through the returned status dict
    (``{'success': False, 'error': ...}``) instead of raising.
    """

    CONDUCTOR_BASE = Path.home() / "conductor"
    ACTIVE_DIR = CONDUCTOR_BASE / "active"
    COMPLETED_DIR = CONDUCTOR_BASE / "completed"
    FAILED_DIR = CONDUCTOR_BASE / "failed"
    QUEUE_BASE = Path("/var/lib/luzia/queue")
    LOCKS_BASE = Path("/var/lib/luzia/locks")
    CAPACITY_FILE = QUEUE_BASE / "capacity.json"
    COCKPIT_STATE_DIR = Path("/var/lib/luz-orchestrator/cockpits")

    # Used by the best-effort helpers so failures are visible without
    # turning them into crashes.
    _log = logging.getLogger(__name__)

    def __init__(self):
        """Initialize completion handler."""
        self._ensure_dirs()

    def _ensure_dirs(self):
        """Ensure the completed/failed archive directories exist."""
        for directory in (self.COMPLETED_DIR, self.FAILED_DIR):
            directory.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_safe_name(name) -> bool:
        """Return True if ``name`` is a non-empty relative path without ``..``.

        Names are joined onto base directories; an empty name would resolve
        to the base directory itself and ``..`` / absolute names would
        escape it, so both are rejected.
        """
        if not isinstance(name, str) or not name or "\0" in name:
            return False
        candidate = Path(name)
        parts = candidate.parts
        return bool(parts) and not candidate.is_absolute() and ".." not in parts

    def _find_active(self, task_id, not_found_error: str):
        """Locate the active directory of ``task_id``.

        Returns:
            ``(task_dir, None)`` when the directory exists, otherwise
            ``(None, error_dict)`` where ``error_dict`` is ready to be
            returned to the caller.
        """
        if not self._is_safe_name(task_id):
            return None, {'success': False, 'error': f'Invalid task id: {task_id!r}'}
        task_dir = self.ACTIVE_DIR / task_id
        if not task_dir.exists():
            return None, {'success': False, 'error': not_found_error}
        return task_dir, None

    @staticmethod
    def _load_meta(meta_file: Path) -> Dict:
        """Return the parsed ``meta.json`` content, or ``{}`` if the file is absent."""
        if meta_file.exists():
            return json.loads(meta_file.read_text())
        return {}

    @staticmethod
    def _write_json(path: Path, data: Dict) -> None:
        """Write ``data`` to ``path`` as indented JSON.

        The payload is serialized *before* the file is opened for writing so
        that a non-serializable value raises without truncating the
        existing file.
        """
        payload = json.dumps(data, indent=2)
        path.write_text(payload)

    @staticmethod
    def _elapsed_seconds(started_at) -> Optional[float]:
        """Return seconds elapsed since the ISO timestamp ``started_at``.

        Works for both naive and timezone-aware timestamps; returns ``None``
        when the value cannot be parsed.
        """
        try:
            start = datetime.fromisoformat(started_at)
            # Take "now" in the timestamp's own timezone (None -> naive local
            # time) so the subtraction never mixes naive and aware values.
            return (datetime.now(start.tzinfo) - start).total_seconds()
        except (TypeError, ValueError):
            return None

    def _release_and_archive(self, task_id: str, task_dir: Path, meta: Dict,
                             dest_root: Path) -> None:
        """Release the user lock, return a capacity slot and archive the task.

        The task directory is moved to ``dest_root / task_id``; an existing
        directory at the destination is replaced.
        """
        user = meta.get('user') or meta.get('enqueued_by')
        lock_id = meta.get('lock_id')
        if user and lock_id:
            self._release_lock(user, lock_id)

        self._increment_capacity()

        dest = dest_root / task_id
        if dest.exists():
            shutil.rmtree(dest)
        shutil.move(str(task_dir), str(dest))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def complete_task(
        self,
        task_id: str,
        result: Optional[Dict] = None,
        summary: str = None
    ) -> Dict:
        """
        Mark task as completed successfully.

        Args:
            task_id: The task ID
            result: Optional result data (must be JSON-serializable)
            summary: Optional summary of what was accomplished

        Returns:
            Status dict with success flag
        """
        task_dir, problem = self._find_active(
            task_id, f'Task {task_id} not found in active')
        if problem:
            return problem

        try:
            meta_file = task_dir / "meta.json"
            meta = self._load_meta(meta_file)

            meta['status'] = 'completed'
            meta['completed_at'] = datetime.now().isoformat()
            if result:
                meta['result'] = result
            if summary:
                meta['summary'] = summary

            # Duration is only recorded when created_at is a parseable timestamp.
            if 'created_at' in meta:
                duration = self._elapsed_seconds(meta['created_at'])
                if duration is not None:
                    meta['duration_seconds'] = duration

            self._write_json(meta_file, meta)
            self._release_and_archive(task_id, task_dir, meta, self.COMPLETED_DIR)

            return {
                'success': True,
                'task_id': task_id,
                'status': 'completed',
                'completed_at': meta['completed_at']
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def fail_task(
        self,
        task_id: str,
        error: str,
        exit_code: int = 1,
        recoverable: bool = True
    ) -> Dict:
        """
        Mark task as failed.

        Args:
            task_id: The task ID
            error: Error message
            exit_code: Process exit code
            recoverable: Whether task can be retried

        Returns:
            Status dict
        """
        task_dir, problem = self._find_active(
            task_id, f'Task {task_id} not found in active')
        if problem:
            return problem

        try:
            meta_file = task_dir / "meta.json"
            meta = self._load_meta(meta_file)

            meta['status'] = 'failed'
            meta['failed_at'] = datetime.now().isoformat()
            meta['error'] = error
            meta['exit_code'] = exit_code
            meta['recoverable'] = recoverable
            # Make sure the counter exists; it is not incremented here.
            meta['retry_count'] = meta.get('retry_count', 0)

            self._write_json(meta_file, meta)
            self._release_and_archive(task_id, task_dir, meta, self.FAILED_DIR)

            return {
                'success': True,
                'task_id': task_id,
                'status': 'failed',
                'failed_at': meta['failed_at'],
                'recoverable': recoverable
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def set_awaiting_human(
        self,
        task_id: str,
        question: str,
        project: str = None
    ) -> Dict:
        """
        Mark task as awaiting human response.

        Updates the task metadata and the project's cockpit state, then
        tries to send the question via Telegram. The Telegram step is
        optional: any failure there is logged and does not fail the call.

        Args:
            task_id: The task ID
            question: The question for the human
            project: Optional project name (for cockpit integration);
                falls back to the ``project`` value stored in the task meta

        Returns:
            Status dict
        """
        task_dir, problem = self._find_active(task_id, f'Task {task_id} not found')
        if problem:
            return problem

        try:
            meta_file = task_dir / "meta.json"
            meta = self._load_meta(meta_file)

            meta['status'] = 'awaiting_human'
            meta['awaiting_since'] = datetime.now().isoformat()
            meta['awaiting_question'] = question
            self._write_json(meta_file, meta)

            # If a project is known, also update cockpit state
            project = project or meta.get('project')
            if project:
                self._update_cockpit_awaiting(project, question)

            # Send question via Telegram (optional integration)
            telegram_request_id = None
            try:
                from telegram_bridge import ask_bruno
                context = f"Task: {task_id}\nProject: {project or 'unknown'}"
                telegram_request_id, sent = ask_bruno(
                    question=question,
                    project=project or "luzia",
                    context=context
                )
                if sent:
                    meta['telegram_request_id'] = telegram_request_id
                    self._write_json(meta_file, meta)
            except ImportError:
                self._log.debug("telegram_bridge not available; question not sent")
            except Exception:
                self._log.warning(
                    "Telegram notification failed for task %s", task_id,
                    exc_info=True)

            return {
                'success': True,
                'task_id': task_id,
                'status': 'awaiting_human',
                'question': question,
                'telegram_request_id': telegram_request_id
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    def resume_from_human(
        self,
        task_id: str,
        answer: str,
        project: str = None
    ) -> Dict:
        """
        Resume task after human provides answer.

        Args:
            task_id: The task ID
            answer: Human's response
            project: Optional project name; falls back to the ``project``
                value stored in the task meta

        Returns:
            Status dict
        """
        task_dir, problem = self._find_active(task_id, f'Task {task_id} not found')
        if problem:
            return problem

        try:
            meta_file = task_dir / "meta.json"
            meta = self._load_meta(meta_file)

            meta['status'] = 'running'
            meta['resumed_at'] = datetime.now().isoformat()
            meta['human_answer'] = answer
            self._write_json(meta_file, meta)

            # Clear cockpit awaiting state
            project = project or meta.get('project')
            if project:
                self._clear_cockpit_awaiting(project)

            return {
                'success': True,
                'task_id': task_id,
                'status': 'running',
                'resumed_at': meta['resumed_at']
            }
        except Exception as e:
            return {'success': False, 'error': str(e)}

    # ------------------------------------------------------------------
    # Best-effort internals (never raise)
    # ------------------------------------------------------------------

    def _release_lock(self, user: str, lock_id: str) -> bool:
        """Release a per-user lock.

        Returns:
            True if the lock files were removed (or were already absent),
            False if the stored lock ID does not match ``lock_id`` or an
            error occurred.
        """
        lock_file = self.LOCKS_BASE / f"user_{user}.lock"
        meta_file = self.LOCKS_BASE / f"user_{user}.json"
        try:
            # Only the holder of the lock may release it.
            if meta_file.exists():
                meta = json.loads(meta_file.read_text())
                if meta.get('lock_id') != lock_id:
                    return False

            for path in (lock_file, meta_file):
                try:
                    path.unlink()
                except FileNotFoundError:
                    pass  # already gone - nothing to release
            return True
        except Exception:
            self._log.warning("Could not release lock for user %s", user,
                              exc_info=True)
            return False

    def _increment_capacity(self) -> bool:
        """Increment available capacity slots (capped at the configured max).

        The capacity file is updated under an exclusive ``flock``.

        Returns:
            True on success, False if the file is missing or the update failed.
        """
        if not self.CAPACITY_FILE.exists():
            return False
        try:
            with open(self.CAPACITY_FILE, 'r+') as f:
                fcntl.flock(f, fcntl.LOCK_EX)
                try:
                    capacity = json.load(f)
                    slots = capacity.setdefault('slots', {})
                    current = slots.get('available', 0)
                    max_slots = slots.get('max', 4)
                    slots['available'] = min(current + 1, max_slots)
                    capacity['last_updated'] = datetime.now().isoformat()

                    # Serialize first so a failure cannot leave the file empty.
                    payload = json.dumps(capacity, indent=2)
                    f.seek(0)
                    f.truncate()
                    f.write(payload)
                    # Push the data to the OS while the lock is still held;
                    # otherwise another process could lock and read an
                    # empty file before close() flushes the buffer.
                    f.flush()
                finally:
                    fcntl.flock(f, fcntl.LOCK_UN)
            return True
        except Exception:
            self._log.warning("Could not update capacity file %s",
                              self.CAPACITY_FILE, exc_info=True)
            return False

    def _update_cockpit_awaiting(self, project: str, question: str):
        """Update cockpit state to show awaiting human (best effort)."""
        if not self._is_safe_name(project):
            self._log.warning("Ignoring unsafe project name %r", project)
            return
        state_file = self.COCKPIT_STATE_DIR / f"{project}.json"
        try:
            state = {}
            if state_file.exists():
                state = json.loads(state_file.read_text())
            state['awaiting_response'] = True
            state['last_question'] = question
            state['awaiting_since'] = datetime.now().isoformat()
            self._write_json(state_file, state)
        except Exception:
            self._log.warning("Could not update cockpit state for %s", project,
                              exc_info=True)

    def _clear_cockpit_awaiting(self, project: str):
        """Clear cockpit awaiting state (best effort; no-op if no state file)."""
        if not self._is_safe_name(project):
            self._log.warning("Ignoring unsafe project name %r", project)
            return
        state_file = self.COCKPIT_STATE_DIR / f"{project}.json"
        try:
            if not state_file.exists():
                return
            state = json.loads(state_file.read_text())
            state['awaiting_response'] = False
            state['last_question'] = None
            self._write_json(state_file, state)
        except Exception:
            self._log.warning("Could not clear cockpit state for %s", project,
                              exc_info=True)
# Convenience functions for direct import
_handler = None


def _get_handler():
    """Return the shared TaskCompletion instance, creating it on first use."""
    global _handler
    if _handler is not None:
        return _handler
    _handler = TaskCompletion()
    return _handler
def complete_task(task_id: str, result: Dict = None, summary: str = None) -> Dict:
    """Mark ``task_id`` as completed using the shared handler.

    Returns the status dict produced by ``TaskCompletion.complete_task``.
    """
    handler = _get_handler()
    return handler.complete_task(task_id, result=result, summary=summary)
def fail_task(task_id: str, error: str, exit_code: int = 1, recoverable: bool = True) -> Dict:
    """Mark ``task_id`` as failed using the shared handler.

    Returns the status dict produced by ``TaskCompletion.fail_task``.
    """
    handler = _get_handler()
    return handler.fail_task(
        task_id,
        error,
        exit_code=exit_code,
        recoverable=recoverable,
    )
def set_awaiting_human(task_id: str, question: str, project: str = None) -> Dict:
    """Flag ``task_id`` as waiting for a human answer using the shared handler.

    Returns the status dict produced by ``TaskCompletion.set_awaiting_human``.
    """
    handler = _get_handler()
    return handler.set_awaiting_human(task_id, question, project=project)
def resume_from_human(task_id: str, answer: str, project: str = None) -> Dict:
    """Resume ``task_id`` with the human's answer using the shared handler.

    Returns the status dict produced by ``TaskCompletion.resume_from_human``.
    """
    handler = _get_handler()
    return handler.resume_from_human(task_id, answer, project=project)
def main():
    """CLI entry point.

    Parses ``<command> <task_id> [message]`` plus the ``--project`` and
    ``--exit-code`` options, runs the matching TaskCompletion method on a
    fresh handler and prints the resulting status dict as indented JSON.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Task Completion Callback')
    cli.add_argument(
        'command',
        choices=['complete', 'fail', 'await', 'resume'],
        help='Command to run',
    )
    cli.add_argument('task_id', help='Task ID')
    cli.add_argument(
        'message',
        nargs='?',
        default='',
        help='Result/error/question/answer',
    )
    cli.add_argument('--project', help='Project name')
    cli.add_argument('--exit-code', type=int, default=1, help='Exit code for failures')
    opts = cli.parse_args()

    completion = TaskCompletion()

    # One zero-argument callable per sub-command; argparse's ``choices``
    # guarantees that opts.command is one of these keys.
    actions = {
        'complete': lambda: completion.complete_task(opts.task_id, summary=opts.message),
        'fail': lambda: completion.fail_task(opts.task_id, opts.message, opts.exit_code),
        'await': lambda: completion.set_awaiting_human(opts.task_id, opts.message, opts.project),
        'resume': lambda: completion.resume_from_human(opts.task_id, opts.message, opts.project),
    }
    outcome = actions[opts.command]()

    print(json.dumps(outcome, indent=2))


if __name__ == '__main__':
    main()