Files / luzia/lib/conductor_recovery.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00
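
For reference, a minimal sketch of the controller interface those bullets describe. The method names come from the commit message; the signatures, defaults, and timeout parameters are assumptions, since the actual DockerTmuxController implementation lives in a separate file:

class DockerTmuxController:
    """Illustrative interface only -- not the actual implementation."""

    def send_keys(self, keys: str, delay_enter: float = 0.0) -> None:
        """Send keys to the tmux pane, optionally delaying the trailing Enter."""

    def capture_pane(self) -> str:
        """Return the current pane contents for output retrieval."""
        return ""

    def wait_for_prompt(self, pattern: str, timeout: float = 60.0) -> bool:
        """Wait until pane output matches a completion pattern, or time out."""
        return False

    def wait_for_idle(self, timeout: float = 60.0) -> bool:
        """Wait until a hash of the pane content stops changing (idle detection)."""
        return False

    def wait_for_shell_prompt(self, timeout: float = 60.0) -> bool:
        """Wait until a shell prompt is visible in the pane."""
        return False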

384 lines · 12 KiB · Python

#!/usr/bin/env python3
"""
Conductor Task Recovery
Auto-recovery for stalled conductor tasks:
- Kill zombie processes
- Release task locks
- Update task status
- Move to failed directory if unrecoverable
"""
import json
import os
import shutil
import signal
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict
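
# Expected layout of each task directory under conductor/active/<task_id>/,
# inferred from the checks performed below:
#   heartbeat.json  - {"ts": <epoch seconds>, "step": "<current step>"}
#   pid             - PID of the worker process
#   progress.md     - human-readable progress log
#   meta.json       - task metadata, including the recovery_attempts counter
#   locks/          - lock files held by the task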


class ConductorRecovery:
    """Recover from stalled conductor tasks."""

    CONDUCTOR_ROOT = Path('/home/admin/conductor')
    HEARTBEAT_TIMEOUT_SECS = 300

    def __init__(self):
        """Initialize conductor recovery."""
        self.conductor_root = self.CONDUCTOR_ROOT
        self.active_dir = self.conductor_root / 'active'
        self.failed_dir = self.conductor_root / 'failed'

    def find_stalled_tasks(self) -> List[Dict]:
        """
        Find all stalled tasks in conductor/active.

        Returns:
            List of stalled task metadata dicts
        """
        stalled = []
        if not self.active_dir.exists():
            return stalled
        now = time.time()
        for task_dir in self.active_dir.iterdir():
            if not task_dir.is_dir():
                continue
            task_id = task_dir.name
            stall_reason = None
            stall_details = {}
            # Check heartbeat timeout
            heartbeat_file = task_dir / 'heartbeat.json'
            if heartbeat_file.exists():
                try:
                    hb = json.loads(heartbeat_file.read_text())
                    hb_age = now - hb.get('ts', 0)
                    if hb_age > self.HEARTBEAT_TIMEOUT_SECS:
                        stall_reason = 'heartbeat_timeout'
                        stall_details = {
                            'heartbeat_age_secs': int(hb_age),
                            'last_step': hb.get('step', 'unknown')
                        }
                except Exception:
                    # Unreadable heartbeat; fall through to the PID check
                    pass
            # Check if process exists
            pid_file = task_dir / 'pid'
            if pid_file.exists() and not stall_reason:
                try:
                    pid = int(pid_file.read_text().strip())
                    if not os.path.exists(f'/proc/{pid}'):
                        stall_reason = 'process_not_found'
                        stall_details = {'pid': pid}
                except Exception:
                    pass
            if stall_reason:
                stalled.append({
                    'task_id': task_id,
                    'task_dir': str(task_dir),
                    'stall_reason': stall_reason,
                    'details': stall_details,
                    'timestamp': now
                })
        return stalled

    def recover_stalled_task(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Attempt to recover a single stalled task.

        Args:
            task_id: Task ID to recover
            dry_run: If True, preview actions without making changes

        Returns:
            Dict with recovery result
        """
        task_dir = self.active_dir / task_id
        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}
        actions = []
        result_status = 'unknown'

        # 1. Kill zombie process (if exists)
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                if os.path.exists(f'/proc/{pid}'):
                    actions.append(f"Kill process {pid}")
                    if not dry_run:
                        try:
                            os.kill(pid, signal.SIGTERM)
                            time.sleep(1)
                            # Force kill if still exists
                            if os.path.exists(f'/proc/{pid}'):
                                os.kill(pid, signal.SIGKILL)
                        except Exception:
                            pass
                else:
                    actions.append(f"Process {pid} already terminated")
            except Exception:
                pass

        # 2. Update heartbeat to current time (signal recovery attempt)
        heartbeat_file = task_dir / 'heartbeat.json'
        actions.append("Update heartbeat to current time")
        if not dry_run:
            hb_data = {
                'ts': time.time(),
                'step': 'recovery_attempt',
                'recovered_at': datetime.now().isoformat()
            }
            heartbeat_file.write_text(json.dumps(hb_data, indent=2))

        # 3. Update progress file, preserving whatever was already recorded
        progress_file = task_dir / 'progress.md'
        actions.append("Update progress with recovery note")
        if not dry_run:
            previous_progress = ''
            if progress_file.exists():
                try:
                    previous_progress = progress_file.read_text()
                except Exception:
                    pass
            progress_content = f"""# Task Recovery
**Recovered at:** {datetime.now().isoformat()}
**Status:** Task was stalled, recovery attempted

## Original Progress
{previous_progress or '(no previous progress recorded)'}

## Recovery Actions
- Process killed/terminated
- Heartbeat reset
- Progress file updated

**Next step:** Monitor task progress. If still stalled, may need manual intervention.
"""
            progress_file.write_text(progress_content)

        # 4. Update meta to mark recovery attempt
        meta_file = task_dir / 'meta.json'
        actions.append("Update metadata with recovery flag")
        if not dry_run:
            try:
                meta = json.loads(meta_file.read_text())
                meta['recovery_attempts'] = meta.get('recovery_attempts', 0) + 1
                meta['last_recovery'] = datetime.now().isoformat()
                meta_file.write_text(json.dumps(meta, indent=2))
            except Exception:
                pass

        # 5. Decision: keep in active, or move to failed after too many recovery attempts
        meta = json.loads(meta_file.read_text()) if meta_file.exists() else {}
        recovery_attempts = meta.get('recovery_attempts', 0)
        if recovery_attempts >= 3:
            result_status = 'moved_to_failed'
            actions.append("Move to failed (too many recovery attempts)")
            if not dry_run:
                self._move_task_to_failed(task_dir, task_id, "Exceeded maximum recovery attempts")
        else:
            result_status = 'recovered'
            actions.append("Keep in active (monitor progress)")

        return {
            'task_id': task_id,
            'status': result_status,
            'actions': actions,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def recover_all_stalled_tasks(self, dry_run: bool = True) -> Dict:
        """
        Recover all stalled tasks.

        Args:
            dry_run: If True, preview without making changes

        Returns:
            Dict with batch recovery results
        """
        stalled_tasks = self.find_stalled_tasks()
        if not stalled_tasks:
            return {
                'total_stalled': 0,
                'recovered': 0,
                'moved_to_failed': 0,
                'results': [],
                'dry_run': dry_run,
                'timestamp': time.time()
            }
        results = []
        recovered_count = 0
        moved_count = 0
        for stalled in stalled_tasks:
            task_id = stalled['task_id']
            result = self.recover_stalled_task(task_id, dry_run=dry_run)
            results.append(result)
            if result['status'] == 'recovered':
                recovered_count += 1
            elif result['status'] == 'moved_to_failed':
                moved_count += 1
        return {
            'total_stalled': len(stalled_tasks),
            'recovered': recovered_count,
            'moved_to_failed': moved_count,
            'results': results,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def release_locks(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Release any locks held by a task.

        Args:
            task_id: Task ID
            dry_run: If True, preview without making changes

        Returns:
            Dict with lock release results
        """
        task_dir = self.active_dir / task_id
        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}
        # Look for lock files
        lock_dir = task_dir / 'locks'
        released = []
        if lock_dir.exists():
            for lock_file in lock_dir.iterdir():
                released.append(str(lock_file))
                if not dry_run:
                    lock_file.unlink()
        return {
            'task_id': task_id,
            'locks_released': len(released),
            'lock_files': released,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def validate_recovery(self, task_id: str) -> Dict:
        """
        Validate that a task recovered successfully.

        Args:
            task_id: Task ID to validate

        Returns:
            Dict with validation result
        """
        task_dir = self.active_dir / task_id
        if not task_dir.exists():
            return {'status': 'not_found', 'task_id': task_id}
        # Check heartbeat is recent
        heartbeat_file = task_dir / 'heartbeat.json'
        is_alive = False
        if heartbeat_file.exists():
            try:
                hb = json.loads(heartbeat_file.read_text())
                hb_age = time.time() - hb.get('ts', 0)
                is_alive = hb_age < self.HEARTBEAT_TIMEOUT_SECS  # Consider alive if <5min old
            except Exception:
                pass
        # Check for process
        process_running = False
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                process_running = os.path.exists(f'/proc/{pid}')
            except Exception:
                pass
        # Overall recovery status
        recovery_status = 'recovered' if is_alive or process_running else 'stalled'
        return {
            'task_id': task_id,
            'recovery_status': recovery_status,
            'heartbeat_alive': is_alive,
            'process_running': process_running,
            'timestamp': time.time()
        }

    def _move_task_to_failed(self, task_dir: Path, task_id: str, failure_reason: str) -> bool:
        """Move a task from active to failed."""
        try:
            failed_task_dir = self.failed_dir / task_id
            failed_task_dir.mkdir(parents=True, exist_ok=True)
            # Copy all top-level files (subdirectories such as locks/ are not copied)
            for item in task_dir.iterdir():
                if item.is_file():
                    shutil.copy2(item, failed_task_dir / item.name)
            # Update meta with failure reason
            meta_file = failed_task_dir / 'meta.json'
            if meta_file.exists():
                meta = json.loads(meta_file.read_text())
            else:
                meta = {}
            meta['failure_reason'] = failure_reason
            meta['moved_to_failed_at'] = datetime.now().isoformat()
            meta_file.write_text(json.dumps(meta, indent=2))
            # Create error.txt
            error_file = failed_task_dir / 'error.txt'
            error_file.write_text(
                f"Task stalled: {failure_reason}\n"
                f"Moved to failed: {datetime.now().isoformat()}"
            )
            # Remove from active
            shutil.rmtree(task_dir)
            return True
        except Exception as e:
            print(f"Error moving task {task_id} to failed: {e}")
            return False


if __name__ == '__main__':
    recovery = ConductorRecovery()
    print("=" * 70)
    print("FINDING STALLED TASKS")
    print("=" * 70)
    stalled = recovery.find_stalled_tasks()
    print(f"Found {len(stalled)} stalled task(s)")
    for task in stalled[:5]:
        print(f" - {task['task_id']}: {task['stall_reason']}")
    if stalled:
        print("\n" + "=" * 70)
        print("RECOVERY DRY RUN (preview only)")
        print("=" * 70)
        result = recovery.recover_all_stalled_tasks(dry_run=True)
        print(f"Would recover: {result['recovered']}")
        print(f"Would move to failed: {result['moved_to_failed']}")
        print("\nActions:")
        for r in result['results'][:1]:
            for action in r['actions']:
                print(f" - {action}")
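
    # To apply the recovery for real (illustrative follow-up; the block above
    # only previews), re-run with dry_run=False and validate afterwards:
    #   result = recovery.recover_all_stalled_tasks(dry_run=False)
    #   for r in result['results']:
    #       recovery.release_locks(r['task_id'], dry_run=False)
    #       print(recovery.validate_recovery(r['task_id']))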