Refactor cockpit to use DockerTmuxController pattern

Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management (usage sketch below)
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection
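
A minimal usage sketch of the controller listed above (session name, prompt
pattern, and timeout values are illustrative; only delay_enter appears in
this commit's notes):

    ctl = DockerTmuxController(session="agent-cockpit")
    ctl.send_keys("make test", delay_enter=0.5)    # press Enter after a short delay
    if ctl.wait_for_prompt(r"\$ ", timeout=120):   # pattern-based completion detection
        output = ctl.capture_pane()                # retrieve pane output
    ctl.wait_for_idle()                            # idle = pane content hash stops changing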

Also includes workflow improvements:
- Pre-task git snapshot before agent execution (see sketch after this list)
- Post-task commit protocol in agent guidelines
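
One plausible shape for the pre-task snapshot step (not shown in this file;
repo path, helper name, and commit message are illustrative):

    import subprocess

    def pre_task_snapshot(repo: str, task_id: str) -> None:
        # Record a restore point before the agent touches the tree
        subprocess.run(["git", "-C", repo, "add", "-A"], check=True)
        subprocess.run(["git", "-C", repo, "commit", "--allow-empty",
                        "-m", f"pre-task snapshot: {task_id}"], check=True)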

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Committed by admin, 2026-01-14 10:42:16 -03:00, commit ec33ac1936
265 changed files with 92011 additions and 0 deletions

lib/task_watchdog.py (new file, 538 lines)

@@ -0,0 +1,538 @@
#!/usr/bin/env python3
"""
Task Watchdog - Monitor running tasks for stuck/failed states
Key responsibilities:
1. Monitor heartbeats for running tasks
2. Detect and clean up stuck/orphaned tasks
3. Release stale locks automatically
4. Track task state transitions
5. Support cockpit "awaiting_human" state
Usage:
from task_watchdog import TaskWatchdog
watchdog = TaskWatchdog()
watchdog.run_check() # Single check
watchdog.run_loop() # Continuous monitoring
"""
import fcntl
import json
import shutil
import signal
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
class TaskWatchdog:
"""Monitor running tasks for stuck/failed states."""
# Configuration
CONDUCTOR_BASE = Path.home() / "conductor"
ACTIVE_DIR = CONDUCTOR_BASE / "active"
COMPLETED_DIR = CONDUCTOR_BASE / "completed"
FAILED_DIR = CONDUCTOR_BASE / "failed"
QUEUE_BASE = Path("/var/lib/luzia/queue")
LOCKS_BASE = Path("/var/lib/luzia/locks")
CAPACITY_FILE = QUEUE_BASE / "capacity.json"
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
# Cockpit state directory
COCKPIT_STATE_DIR = Path("/var/lib/luz-orchestrator/cockpits")
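    # One state file per project (<project>.json) with at least the keys
    # 'project' and 'awaiting_response'.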
# Timeouts
HEARTBEAT_TIMEOUT_SECONDS = 300 # 5 minutes without heartbeat = stuck
LOCK_TIMEOUT_SECONDS = 3600 # 1 hour lock timeout
AWAITING_HUMAN_TIMEOUT = 86400 # 24 hours for human response
# State transitions
STATES = {
'pending': ['claimed', 'cancelled'],
'claimed': ['dispatched', 'failed', 'cancelled'],
'dispatched': ['running', 'failed'],
'running': ['completed', 'failed', 'awaiting_human', 'stuck'],
'awaiting_human': ['running', 'failed', 'cancelled'],
'stuck': ['failed', 'recovered'],
'completed': [],
'failed': ['retrying'],
'retrying': ['running', 'failed'],
'cancelled': []
}
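    # NOTE: STATES is a reference map of allowed transitions; nothing in this
    # file enforces it yet.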
def __init__(self):
"""Initialize watchdog."""
self._ensure_dirs()
def _ensure_dirs(self):
"""Ensure required directories exist."""
for d in [self.ACTIVE_DIR, self.COMPLETED_DIR, self.FAILED_DIR]:
d.mkdir(parents=True, exist_ok=True)
self.LOCKS_BASE.mkdir(parents=True, exist_ok=True)
self.COCKPIT_STATE_DIR.mkdir(parents=True, exist_ok=True)
def check_heartbeats(self) -> List[Dict]:
"""
Check all active tasks for stale heartbeats.
Returns list of stuck tasks.
"""
stuck_tasks = []
now = time.time()
if not self.ACTIVE_DIR.exists():
return stuck_tasks
for task_dir in self.ACTIVE_DIR.iterdir():
if not task_dir.is_dir():
continue
task_id = task_dir.name
heartbeat_file = task_dir / "heartbeat.json"
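            # Expected heartbeat.json shape: {"ts": <epoch seconds>, "step": "<description>"}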
meta_file = task_dir / "meta.json"
# Load task metadata
meta = {}
if meta_file.exists():
try:
meta = json.loads(meta_file.read_text())
                except (OSError, json.JSONDecodeError):
                    pass
# Check if task is in awaiting_human state (from cockpit)
if meta.get('status') == 'awaiting_human':
# Check if awaiting too long
awaiting_since = meta.get('awaiting_since', now)
if now - awaiting_since > self.AWAITING_HUMAN_TIMEOUT:
stuck_tasks.append({
'task_id': task_id,
'reason': 'awaiting_human_timeout',
'last_heartbeat': None,
'meta': meta
})
continue
# Check heartbeat for running tasks
if not heartbeat_file.exists():
# No heartbeat file = check task age
created = meta.get('created_at')
if created:
try:
created_ts = datetime.fromisoformat(created).timestamp()
if now - created_ts > self.HEARTBEAT_TIMEOUT_SECONDS:
stuck_tasks.append({
'task_id': task_id,
'reason': 'no_heartbeat',
'last_heartbeat': None,
'meta': meta
})
                    except (TypeError, ValueError):
                        pass
continue
try:
heartbeat = json.loads(heartbeat_file.read_text())
last_ts = heartbeat.get('ts', 0)
if now - last_ts > self.HEARTBEAT_TIMEOUT_SECONDS:
stuck_tasks.append({
'task_id': task_id,
'reason': 'stale_heartbeat',
'last_heartbeat': last_ts,
'last_step': heartbeat.get('step'),
'meta': meta
})
except Exception as e:
stuck_tasks.append({
'task_id': task_id,
'reason': 'heartbeat_read_error',
'error': str(e),
'meta': meta
})
return stuck_tasks
def cleanup_orphaned_tasks(self) -> List[str]:
"""
Clean up tasks where agent died.
Moves stuck tasks to failed directory.
Returns list of cleaned task IDs.
"""
cleaned = []
stuck_tasks = self.check_heartbeats()
for task in stuck_tasks:
task_id = task['task_id']
task_dir = self.ACTIVE_DIR / task_id
if not task_dir.exists():
continue
# Update meta with failure info
meta_file = task_dir / "meta.json"
meta = {}
if meta_file.exists():
try:
meta = json.loads(meta_file.read_text())
                except (OSError, json.JSONDecodeError):
                    pass
meta['status'] = 'failed'
meta['failure_reason'] = task.get('reason', 'unknown')
meta['failed_at'] = datetime.now().isoformat()
meta['last_heartbeat'] = task.get('last_heartbeat')
# Write updated meta
try:
with open(meta_file, 'w') as f:
json.dump(meta, f, indent=2)
            except OSError:
                pass
# Move to failed directory
failed_dest = self.FAILED_DIR / task_id
try:
if failed_dest.exists():
                    # Remove old failed version
                    shutil.rmtree(failed_dest)
task_dir.rename(failed_dest)
cleaned.append(task_id)
except Exception as e:
print(f"Error moving task {task_id} to failed: {e}")
return cleaned
def release_stale_locks(self) -> List[str]:
"""
Release locks for dead tasks.
Returns list of released lock IDs.
"""
released = []
now = time.time()
if not self.LOCKS_BASE.exists():
return released
for lock_file in self.LOCKS_BASE.glob("user_*.lock"):
try:
# Read lock metadata
meta_file = lock_file.with_suffix('.json')
if not meta_file.exists():
# Old lock without metadata - remove if lock file is old
if now - lock_file.stat().st_mtime > self.LOCK_TIMEOUT_SECONDS:
lock_file.unlink()
released.append(lock_file.name)
continue
meta = json.loads(meta_file.read_text())
expires_at = meta.get('expires_at', 0)
if now > expires_at:
# Lock expired - remove both files
lock_file.unlink()
meta_file.unlink()
released.append(meta.get('lock_id', lock_file.name))
except Exception as e:
print(f"Error checking lock {lock_file}: {e}")
return released
def update_capacity(self, released_slots: int = 0) -> bool:
"""
Update capacity file to reflect released resources.
Uses file locking for safety.
"""
if not self.CAPACITY_FILE.exists():
return False
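        # Expected capacity.json shape:
        # {"slots": {"available": int, "max": int}, "last_updated": "<ISO timestamp>"}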
try:
with open(self.CAPACITY_FILE, 'r+') as f:
fcntl.flock(f, fcntl.LOCK_EX)
try:
capacity = json.load(f)
                    # Update available slots (create the block if it is missing)
                    slots = capacity.setdefault('slots', {})
                    current = slots.get('available', 0)
                    max_slots = slots.get('max', 4)
                    slots['available'] = min(current + released_slots, max_slots)
# Update timestamp
capacity['last_updated'] = datetime.now().isoformat()
# Write back
f.seek(0)
f.truncate()
json.dump(capacity, f, indent=2)
finally:
fcntl.flock(f, fcntl.LOCK_UN)
return True
except Exception as e:
print(f"Error updating capacity: {e}")
return False
def get_project_queue_status(self) -> Dict[str, Dict]:
"""
Get queue status per project.
Returns dict of project -> {pending, running, awaiting_human}
"""
status = {}
# Count pending tasks
pending_dirs = [
self.QUEUE_BASE / "pending" / "high",
self.QUEUE_BASE / "pending" / "normal"
]
for pending_dir in pending_dirs:
if not pending_dir.exists():
continue
for task_file in pending_dir.glob("*.json"):
try:
task = json.loads(task_file.read_text())
project = task.get('project', 'unknown')
if project not in status:
status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}
status[project]['pending'] += 1
                except (OSError, json.JSONDecodeError):
                    pass
# Count active tasks
if self.ACTIVE_DIR.exists():
for task_dir in self.ACTIVE_DIR.iterdir():
if not task_dir.is_dir():
continue
meta_file = task_dir / "meta.json"
if meta_file.exists():
try:
meta = json.loads(meta_file.read_text())
project = meta.get('project', 'unknown')
if project not in status:
status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}
if meta.get('status') == 'awaiting_human':
status[project]['awaiting_human'] += 1
else:
status[project]['running'] += 1
                    except (OSError, json.JSONDecodeError):
                        pass
# Check cockpit states
if self.COCKPIT_STATE_DIR.exists():
for state_file in self.COCKPIT_STATE_DIR.glob("*.json"):
try:
state = json.loads(state_file.read_text())
project = state.get('project')
if project and state.get('awaiting_response'):
if project not in status:
status[project] = {'pending': 0, 'running': 0, 'awaiting_human': 0}
status[project]['awaiting_human'] += 1
                except (OSError, json.JSONDecodeError):
                    pass
return status
def is_project_blocked(self, project: str) -> Tuple[bool, Optional[str]]:
"""
Check if a project queue is blocked.
Returns (is_blocked, reason)
"""
# Check cockpit state
cockpit_state = self.COCKPIT_STATE_DIR / f"{project}.json"
if cockpit_state.exists():
try:
state = json.loads(cockpit_state.read_text())
if state.get('awaiting_response'):
return True, "awaiting_human_cockpit"
            except (OSError, json.JSONDecodeError):
                pass
# Check active tasks for awaiting_human
if self.ACTIVE_DIR.exists():
for task_dir in self.ACTIVE_DIR.iterdir():
if not task_dir.is_dir():
continue
meta_file = task_dir / "meta.json"
if meta_file.exists():
try:
meta = json.loads(meta_file.read_text())
if meta.get('project') == project and meta.get('status') == 'awaiting_human':
return True, f"awaiting_human_task:{task_dir.name}"
                    except (OSError, json.JSONDecodeError):
                        pass
return False, None
    def set_task_awaiting_human(self, task_id: str, question: Optional[str] = None) -> bool:
"""
Mark a task as awaiting human response.
This blocks the project queue.
"""
task_dir = self.ACTIVE_DIR / task_id
meta_file = task_dir / "meta.json"
if not meta_file.exists():
return False
try:
meta = json.loads(meta_file.read_text())
meta['status'] = 'awaiting_human'
meta['awaiting_since'] = time.time()
if question:
meta['awaiting_question'] = question
with open(meta_file, 'w') as f:
json.dump(meta, f, indent=2)
return True
except Exception as e:
print(f"Error setting task awaiting: {e}")
return False
    def resume_task(self, task_id: str, answer: Optional[str] = None) -> bool:
"""
Resume a task that was awaiting human response.
"""
task_dir = self.ACTIVE_DIR / task_id
meta_file = task_dir / "meta.json"
if not meta_file.exists():
return False
try:
meta = json.loads(meta_file.read_text())
if meta.get('status') != 'awaiting_human':
return False
meta['status'] = 'running'
meta['resumed_at'] = datetime.now().isoformat()
if answer:
meta['human_response'] = answer
# Update heartbeat
heartbeat_file = task_dir / "heartbeat.json"
with open(heartbeat_file, 'w') as f:
json.dump({'ts': time.time(), 'step': 'Resumed after human response'}, f)
with open(meta_file, 'w') as f:
json.dump(meta, f, indent=2)
return True
except Exception as e:
print(f"Error resuming task: {e}")
return False
def run_check(self) -> Dict:
"""
Run a single watchdog check.
Returns summary of actions taken.
"""
summary = {
'timestamp': datetime.now().isoformat(),
'stuck_tasks': [],
'cleaned_tasks': [],
'released_locks': [],
'project_status': {}
}
# Check for stuck tasks
stuck = self.check_heartbeats()
summary['stuck_tasks'] = [t['task_id'] for t in stuck]
# Clean up orphaned tasks
cleaned = self.cleanup_orphaned_tasks()
summary['cleaned_tasks'] = cleaned
# Release stale locks
released = self.release_stale_locks()
summary['released_locks'] = released
# Update capacity if we cleaned anything
if cleaned or released:
self.update_capacity(released_slots=len(cleaned))
# Get project queue status
summary['project_status'] = self.get_project_queue_status()
return summary
def run_loop(self, interval_seconds: int = 60):
"""
Run continuous watchdog monitoring.
"""
print(f"Starting Task Watchdog (interval: {interval_seconds}s)")
def handle_signal(signum, frame):
print("\nWatchdog shutting down...")
            sys.exit(0)
signal.signal(signal.SIGINT, handle_signal)
signal.signal(signal.SIGTERM, handle_signal)
while True:
try:
summary = self.run_check()
# Log if any actions were taken
if summary['stuck_tasks'] or summary['cleaned_tasks'] or summary['released_locks']:
print(f"[{datetime.now().isoformat()}] Watchdog check:")
if summary['stuck_tasks']:
print(f" Stuck tasks: {summary['stuck_tasks']}")
if summary['cleaned_tasks']:
print(f" Cleaned: {summary['cleaned_tasks']}")
if summary['released_locks']:
print(f" Released locks: {summary['released_locks']}")
time.sleep(interval_seconds)
except Exception as e:
print(f"Watchdog error: {e}")
time.sleep(interval_seconds)
def main():
"""CLI entry point."""
import argparse
parser = argparse.ArgumentParser(description='Task Watchdog')
parser.add_argument('command', nargs='?', default='check',
choices=['check', 'daemon', 'status', 'stuck', 'clean'],
help='Command to run')
parser.add_argument('--interval', type=int, default=60,
help='Check interval for daemon mode (seconds)')
args = parser.parse_args()
watchdog = TaskWatchdog()
if args.command == 'check':
summary = watchdog.run_check()
print(json.dumps(summary, indent=2))
elif args.command == 'daemon':
watchdog.run_loop(interval_seconds=args.interval)
elif args.command == 'status':
status = watchdog.get_project_queue_status()
print("Project Queue Status:")
print(json.dumps(status, indent=2))
elif args.command == 'stuck':
stuck = watchdog.check_heartbeats()
if stuck:
print(f"Found {len(stuck)} stuck tasks:")
for t in stuck:
print(f" - {t['task_id']}: {t['reason']}")
else:
print("No stuck tasks found")
elif args.command == 'clean':
cleaned = watchdog.cleanup_orphaned_tasks()
released = watchdog.release_stale_locks()
print(f"Cleaned {len(cleaned)} orphaned tasks")
print(f"Released {len(released)} stale locks")
if __name__ == '__main__':
main()