Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:

- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
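The DockerTmuxController itself is not part of the file shown in this diff. A minimal sketch of the pattern the bullets above describe, assuming a docker-exec transport and a named tmux session (the container/session handling here is invented; only the method names come from the commit message). wait_for_shell_prompt() would be wait_for_prompt() specialized to a shell-prompt pattern such as "$ ".

    #!/usr/bin/env python3
    """Sketch of the DockerTmuxController pattern; not the committed code."""
    import hashlib
    import subprocess
    import time


    class DockerTmuxController:
        def __init__(self, container: str, session: str = 'main'):
            self.container = container
            self.session = session

        def _tmux(self, *args: str) -> str:
            """Run a tmux command inside the container via docker exec."""
            cmd = ['docker', 'exec', self.container, 'tmux', *args]
            return subprocess.run(
                cmd, capture_output=True, text=True, check=True
            ).stdout

        def send_keys(self, keys: str, delay_enter: float = 0.0) -> None:
            """Send keys to the session; optionally pause before Enter."""
            self._tmux('send-keys', '-t', self.session, keys)
            if delay_enter:
                time.sleep(delay_enter)
            self._tmux('send-keys', '-t', self.session, 'Enter')

        def capture_pane(self) -> str:
            """Return the current pane contents."""
            return self._tmux('capture-pane', '-t', self.session, '-p')

        def wait_for_prompt(self, pattern: str, timeout: float = 60.0) -> bool:
            """Poll pane output until `pattern` appears (completion detection)."""
            deadline = time.time() + timeout
            while time.time() < deadline:
                if pattern in self.capture_pane():
                    return True
                time.sleep(1)
            return False

        def wait_for_idle(self, quiet_secs: float = 5.0,
                          timeout: float = 300.0) -> bool:
            """Consider the pane idle once its content hash stops changing."""
            deadline = time.time() + timeout
            last_hash, stable_since = '', time.time()
            while time.time() < deadline:
                h = hashlib.sha256(self.capture_pane().encode()).hexdigest()
                if h != last_hash:
                    last_hash, stable_since = h, time.time()
                elif time.time() - stable_since >= quiet_secs:
                    return True
                time.sleep(1)
            return False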
lib/conductor_recovery.py (new file, 383 lines)
@@ -0,0 +1,383 @@
#!/usr/bin/env python3
"""
Conductor Task Recovery

Auto-recovery for stalled conductor tasks:
- Kill zombie processes
- Release task locks
- Update task status
- Move to failed directory if unrecoverable
"""

import json
import os
import shutil
import signal
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List


class ConductorRecovery:
    """Recover from stalled conductor tasks."""

    CONDUCTOR_ROOT = Path('/home/admin/conductor')
    HEARTBEAT_TIMEOUT_SECS = 300

    def __init__(self):
        """Initialize conductor recovery."""
        self.conductor_root = self.CONDUCTOR_ROOT
        self.active_dir = self.conductor_root / 'active'
        self.failed_dir = self.conductor_root / 'failed'

    def find_stalled_tasks(self) -> List[Dict]:
        """
        Find all stalled tasks in conductor/active.

        Returns:
            List of stalled task metadata dicts
        """
        stalled = []

        if not self.active_dir.exists():
            return stalled

        now = time.time()

        for task_dir in self.active_dir.iterdir():
            if not task_dir.is_dir():
                continue

            task_id = task_dir.name
            stall_reason = None
            stall_details = {}

            # Check heartbeat timeout
            heartbeat_file = task_dir / 'heartbeat.json'
            if heartbeat_file.exists():
                try:
                    hb = json.loads(heartbeat_file.read_text())
                    hb_age = now - hb.get('ts', 0)

                    if hb_age > self.HEARTBEAT_TIMEOUT_SECS:
                        stall_reason = 'heartbeat_timeout'
                        stall_details = {
                            'heartbeat_age_secs': int(hb_age),
                            'last_step': hb.get('step', 'unknown')
                        }
                except (OSError, json.JSONDecodeError):
                    # Unreadable heartbeat: fall through to the PID check
                    pass

            # Check if process exists
            pid_file = task_dir / 'pid'
            if pid_file.exists() and not stall_reason:
                try:
                    pid = int(pid_file.read_text().strip())
                    if not os.path.exists(f'/proc/{pid}'):
                        stall_reason = 'process_not_found'
                        stall_details = {'pid': pid}
                except (OSError, ValueError):
                    pass

            if stall_reason:
                stalled.append({
                    'task_id': task_id,
                    'task_dir': str(task_dir),
                    'stall_reason': stall_reason,
                    'details': stall_details,
                    'timestamp': now
                })

        return stalled

    def recover_stalled_task(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Attempt to recover a single stalled task.

        Args:
            task_id: Task ID to recover
            dry_run: If True, preview actions without making changes

        Returns:
            Dict with recovery result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        actions = []
        result_status = 'unknown'

        # 1. Kill zombie process (if exists)
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                if os.path.exists(f'/proc/{pid}'):
                    actions.append(f"Kill process {pid}")
                    if not dry_run:
                        try:
                            os.kill(pid, signal.SIGTERM)
                            time.sleep(1)
                            # Force kill if still exists
                            if os.path.exists(f'/proc/{pid}'):
                                os.kill(pid, signal.SIGKILL)
                        except OSError:
                            pass
                else:
                    actions.append(f"Process {pid} already terminated")
            except (OSError, ValueError):
                pass

        # 2. Update heartbeat to current time (signal recovery attempt)
        heartbeat_file = task_dir / 'heartbeat.json'
        actions.append("Update heartbeat to current time")
        if not dry_run:
            hb_data = {
                'ts': time.time(),
                'step': 'recovery_attempt',
                'recovered_at': datetime.now().isoformat()
            }
            heartbeat_file.write_text(json.dumps(hb_data, indent=2))

        # 3. Update progress file, preserving any previous content
        progress_file = task_dir / 'progress.md'
        actions.append("Update progress with recovery note")
        if not dry_run:
            previous = progress_file.read_text() if progress_file.exists() else '(none)'
            progress_content = f"""# Task Recovery

**Recovered at:** {datetime.now().isoformat()}
**Status:** Task was stalled, recovery attempted

## Original Progress
{previous}

## Recovery Actions
- Process killed/terminated
- Heartbeat reset
- Progress file updated

**Next step:** Monitor task progress. If still stalled, may need manual intervention.
"""
            progress_file.write_text(progress_content)

        # 4. Update meta to mark recovery attempt
        meta_file = task_dir / 'meta.json'
        actions.append("Update metadata with recovery flag")
        if not dry_run:
            try:
                meta = json.loads(meta_file.read_text())
                meta['recovery_attempts'] = meta.get('recovery_attempts', 0) + 1
                meta['last_recovery'] = datetime.now().isoformat()
                meta_file.write_text(json.dumps(meta, indent=2))
            except (OSError, json.JSONDecodeError):
                pass

        # 5. Decision: keep in active, or move to failed after too many recovery attempts
        meta = json.loads(meta_file.read_text()) if meta_file.exists() else {}
        recovery_attempts = meta.get('recovery_attempts', 0)

        if recovery_attempts >= 3:
            result_status = 'moved_to_failed'
            actions.append("Move to failed (too many recovery attempts)")
            if not dry_run:
                self._move_task_to_failed(task_dir, task_id, "Exceeded maximum recovery attempts")
        else:
            result_status = 'recovered'
            actions.append("Keep in active (monitor progress)")

        return {
            'task_id': task_id,
            'status': result_status,
            'actions': actions,
            'dry_run': dry_run,
            'timestamp': time.time()
        }
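
    # Illustrative shape of the dict returned by recover_stalled_task() for a
    # dry run (values invented for this example, not from a real task):
    #   {'task_id': 'task-001', 'status': 'recovered',
    #    'actions': ['Kill process 1234',
    #                'Update heartbeat to current time',
    #                'Update progress with recovery note',
    #                'Update metadata with recovery flag',
    #                'Keep in active (monitor progress)'],
    #    'dry_run': True, 'timestamp': 1700000000.0}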

    def recover_all_stalled_tasks(self, dry_run: bool = True) -> Dict:
        """
        Recover all stalled tasks.

        Args:
            dry_run: If True, preview without making changes

        Returns:
            Dict with batch recovery results
        """
        stalled_tasks = self.find_stalled_tasks()

        if not stalled_tasks:
            return {
                'total_stalled': 0,
                'recovered': 0,
                'moved_to_failed': 0,
                'results': [],
                'dry_run': dry_run,
                'timestamp': time.time()
            }

        results = []
        recovered_count = 0
        moved_count = 0

        for stalled in stalled_tasks:
            task_id = stalled['task_id']
            result = self.recover_stalled_task(task_id, dry_run=dry_run)
            results.append(result)

            if result['status'] == 'recovered':
                recovered_count += 1
            elif result['status'] == 'moved_to_failed':
                moved_count += 1

        return {
            'total_stalled': len(stalled_tasks),
            'recovered': recovered_count,
            'moved_to_failed': moved_count,
            'results': results,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def release_locks(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Release any locks held by a task.

        Args:
            task_id: Task ID
            dry_run: If True, preview without making changes

        Returns:
            Dict with lock release results
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        # Look for lock files
        lock_dir = task_dir / 'locks'
        released = []

        if lock_dir.exists():
            for lock_file in lock_dir.iterdir():
                released.append(str(lock_file))
                if not dry_run:
                    lock_file.unlink()

        return {
            'task_id': task_id,
            'locks_released': len(released),
            'lock_files': released,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def validate_recovery(self, task_id: str) -> Dict:
        """
        Validate that a task recovered successfully.

        Args:
            task_id: Task ID to validate

        Returns:
            Dict with validation result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'not_found', 'task_id': task_id}

        # Check heartbeat is recent
        heartbeat_file = task_dir / 'heartbeat.json'
        is_alive = False

        if heartbeat_file.exists():
            try:
                hb = json.loads(heartbeat_file.read_text())
                hb_age = time.time() - hb.get('ts', 0)
                # Consider alive if younger than the stall threshold (5 min)
                is_alive = hb_age < self.HEARTBEAT_TIMEOUT_SECS
            except (OSError, json.JSONDecodeError):
                pass

        # Check for process
        process_running = False
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                process_running = os.path.exists(f'/proc/{pid}')
            except (OSError, ValueError):
                pass

        # Overall recovery status
        recovery_status = 'recovered' if is_alive or process_running else 'stalled'

        return {
            'task_id': task_id,
            'recovery_status': recovery_status,
            'heartbeat_alive': is_alive,
            'process_running': process_running,
            'timestamp': time.time()
        }

    def _move_task_to_failed(self, task_dir: Path, task_id: str, failure_reason: str) -> bool:
        """Move a task from active to failed."""
        try:
            failed_task_dir = self.failed_dir / task_id
            failed_task_dir.mkdir(parents=True, exist_ok=True)

            # Copy all top-level files (subdirectories such as locks/ are not preserved)
            for item in task_dir.iterdir():
                if item.is_file():
                    shutil.copy2(item, failed_task_dir / item.name)

            # Update meta with failure reason
            meta_file = failed_task_dir / 'meta.json'
            if meta_file.exists():
                meta = json.loads(meta_file.read_text())
            else:
                meta = {}

            meta['failure_reason'] = failure_reason
            meta['moved_to_failed_at'] = datetime.now().isoformat()
            meta_file.write_text(json.dumps(meta, indent=2))

            # Create error.txt
            error_file = failed_task_dir / 'error.txt'
            error_file.write_text(
                f"Task stalled: {failure_reason}\n"
                f"Moved to failed: {datetime.now().isoformat()}"
            )

            # Remove from active
            shutil.rmtree(task_dir)

            return True
        except Exception as e:
            print(f"Error moving task {task_id} to failed: {e}")
            return False


if __name__ == '__main__':
    recovery = ConductorRecovery()

    print("=" * 70)
    print("FINDING STALLED TASKS")
    print("=" * 70)
    stalled = recovery.find_stalled_tasks()
    print(f"Found {len(stalled)} stalled task(s)")
    for task in stalled[:5]:
        print(f"  - {task['task_id']}: {task['stall_reason']}")

    if stalled:
        print("\n" + "=" * 70)
        print("RECOVERY DRY RUN (preview only)")
        print("=" * 70)
        result = recovery.recover_all_stalled_tasks(dry_run=True)
        print(f"Would recover: {result['recovered']}")
        print(f"Would move to failed: {result['moved_to_failed']}")
        print("\nActions:")
        for r in result['results'][:1]:
            for action in r['actions']:
                print(f"  - {action}")
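The __main__ block above only previews. For unattended use, a wrapper could apply recovery for real after a dry run finds stalled tasks; a sketch (hypothetical, not part of this commit; assumes lib/ is on PYTHONPATH so the module imports as conductor_recovery):

    #!/usr/bin/env python3
    # Hypothetical cron wrapper: preview first, then apply recovery for real
    # only when the dry run actually found stalled tasks.
    from conductor_recovery import ConductorRecovery

    recovery = ConductorRecovery()
    preview = recovery.recover_all_stalled_tasks(dry_run=True)
    if preview['total_stalled']:
        applied = recovery.recover_all_stalled_tasks(dry_run=False)
        print(f"Recovered {applied['recovered']}, "
              f"moved {applied['moved_to_failed']} to failed")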