Based on claude-code-tools TmuxCLIController, this refactor:
- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
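The controller itself is not part of the file below; as a minimal usage sketch of the API described in the commit message (only the method names and the delay_enter option come from that message, every argument and the constructor signature are hypothetical):

    controller = DockerTmuxController(session_name="agent")   # constructor signature assumed
    controller.send_keys("pytest -q", delay_enter=0.5)        # delay before pressing Enter (units assumed)
    controller.wait_for_prompt(r"\$ $")                       # pattern-based completion detection (argument assumed)
    output = controller.capture_pane()                        # retrieve current pane contents
    controller.wait_for_idle()                                # content-hash-based idle detection
    controller.wait_for_shell_prompt()                        # wait for the shell prompt to return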
384 lines
12 KiB
Python
#!/usr/bin/env python3
"""
Conductor Task Recovery

Auto-recovery for stalled conductor tasks:
- Kill zombie processes
- Release task locks
- Update task status
- Move to failed directory if unrecoverable
"""

import json
import os
import shutil
import signal
import time
from pathlib import Path
from datetime import datetime
from typing import List, Dict
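
# Expected on-disk layout, inferred from the reads and writes in this module
# (only the files actually referenced below are listed; the root path matches
# CONDUCTOR_ROOT):
#
#   /home/admin/conductor/
#     active/<task_id>/
#       heartbeat.json   # {"ts": <epoch seconds>, "step": "<current step>"}
#       pid              # PID of the worker process, as plain text
#       progress.md      # human-readable progress notes
#       meta.json        # task metadata, including recovery_attempts
#       locks/           # one file per lock held by the task
#     failed/<task_id>/  # tasks moved here when unrecoverable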


class ConductorRecovery:
    """Recover from stalled conductor tasks."""

    CONDUCTOR_ROOT = Path('/home/admin/conductor')
    HEARTBEAT_TIMEOUT_SECS = 300

    def __init__(self):
        """Initialize conductor recovery."""
        self.conductor_root = self.CONDUCTOR_ROOT
        self.active_dir = self.conductor_root / 'active'
        self.failed_dir = self.conductor_root / 'failed'

    def find_stalled_tasks(self) -> List[Dict]:
        """
        Find all stalled tasks in conductor/active.

        Returns:
            List of stalled task metadata dicts
        """
        stalled = []

        if not self.active_dir.exists():
            return stalled

        now = time.time()

        for task_dir in self.active_dir.iterdir():
            if not task_dir.is_dir():
                continue

            task_id = task_dir.name
            stall_reason = None
            stall_details = {}

            # Check heartbeat timeout
            heartbeat_file = task_dir / 'heartbeat.json'
            if heartbeat_file.exists():
                try:
                    hb = json.loads(heartbeat_file.read_text())
                    hb_age = now - hb.get('ts', 0)

                    if hb_age > self.HEARTBEAT_TIMEOUT_SECS:
                        stall_reason = 'heartbeat_timeout'
                        stall_details = {
                            'heartbeat_age_secs': int(hb_age),
                            'last_step': hb.get('step', 'unknown')
                        }
                except Exception:
                    pass

            # Check whether the recorded process still exists
            pid_file = task_dir / 'pid'
            if pid_file.exists() and not stall_reason:
                try:
                    pid = int(pid_file.read_text().strip())
                    if not os.path.exists(f'/proc/{pid}'):
                        stall_reason = 'process_not_found'
                        stall_details = {'pid': pid}
                except Exception:
                    pass

            if stall_reason:
                stalled.append({
                    'task_id': task_id,
                    'task_dir': str(task_dir),
                    'stall_reason': stall_reason,
                    'details': stall_details,
                    'timestamp': now
                })

        return stalled
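
    # Each entry returned by find_stalled_tasks() has this shape (the values
    # shown are illustrative, the keys match the append above):
    #   {'task_id': 'task-001',
    #    'task_dir': '/home/admin/conductor/active/task-001',
    #    'stall_reason': 'heartbeat_timeout',
    #    'details': {'heartbeat_age_secs': 640, 'last_step': 'build'},
    #    'timestamp': 1700000000.0}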

    def recover_stalled_task(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Attempt to recover a single stalled task.

        Args:
            task_id: Task ID to recover
            dry_run: If True, preview actions without making changes

        Returns:
            Dict with recovery result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        actions = []
        result_status = 'unknown'

        # 1. Kill zombie process (if it exists)
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                if os.path.exists(f'/proc/{pid}'):
                    actions.append(f"Kill process {pid}")
                    if not dry_run:
                        try:
                            os.kill(pid, signal.SIGTERM)
                            time.sleep(1)
                            # Force kill if still running
                            if os.path.exists(f'/proc/{pid}'):
                                os.kill(pid, signal.SIGKILL)
                        except Exception:
                            pass
                else:
                    actions.append(f"Process {pid} already terminated")
            except Exception:
                pass

        # 2. Update heartbeat to the current time (marks the recovery attempt)
        heartbeat_file = task_dir / 'heartbeat.json'
        actions.append("Update heartbeat to current time")
        if not dry_run:
            hb_data = {
                'ts': time.time(),
                'step': 'recovery_attempt',
                'recovered_at': datetime.now().isoformat()
            }
            heartbeat_file.write_text(json.dumps(hb_data, indent=2))

        # 3. Rewrite progress file, preserving the previous content
        progress_file = task_dir / 'progress.md'
        actions.append("Update progress with recovery note")
        if not dry_run:
            previous_progress = progress_file.read_text() if progress_file.exists() else "(none)"
            progress_content = f"""# Task Recovery

**Recovered at:** {datetime.now().isoformat()}
**Status:** Task was stalled, recovery attempted

## Original Progress
{previous_progress}

## Recovery Actions
- Process killed/terminated
- Heartbeat reset
- Progress file updated

**Next step:** Monitor task progress. If still stalled, manual intervention may be needed.
"""
            progress_file.write_text(progress_content)

        # 4. Update meta to record the recovery attempt
        meta_file = task_dir / 'meta.json'
        actions.append("Update metadata with recovery flag")
        if not dry_run:
            try:
                meta = json.loads(meta_file.read_text())
                meta['recovery_attempts'] = meta.get('recovery_attempts', 0) + 1
                meta['last_recovery'] = datetime.now().isoformat()
                meta_file.write_text(json.dumps(meta, indent=2))
            except Exception:
                pass

        # 5. Decide: keep in active, or move to failed after too many recovery attempts
        meta = json.loads(meta_file.read_text()) if meta_file.exists() else {}
        recovery_attempts = meta.get('recovery_attempts', 0)

        if recovery_attempts >= 3:
            result_status = 'moved_to_failed'
            actions.append("Move to failed (too many recovery attempts)")
            if not dry_run:
                self._move_task_to_failed(task_dir, task_id, "Exceeded maximum recovery attempts")
        else:
            result_status = 'recovered'
            actions.append("Keep in active (monitor progress)")

        return {
            'task_id': task_id,
            'status': result_status,
            'actions': actions,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def recover_all_stalled_tasks(self, dry_run: bool = True) -> Dict:
        """
        Recover all stalled tasks.

        Args:
            dry_run: If True, preview without making changes

        Returns:
            Dict with batch recovery results
        """
        stalled_tasks = self.find_stalled_tasks()

        if not stalled_tasks:
            return {
                'total_stalled': 0,
                'recovered': 0,
                'moved_to_failed': 0,
                'results': [],
                'dry_run': dry_run,
                'timestamp': time.time()
            }

        results = []
        recovered_count = 0
        moved_count = 0

        for stalled in stalled_tasks:
            task_id = stalled['task_id']
            result = self.recover_stalled_task(task_id, dry_run=dry_run)
            results.append(result)

            if result['status'] == 'recovered':
                recovered_count += 1
            elif result['status'] == 'moved_to_failed':
                moved_count += 1

        return {
            'total_stalled': len(stalled_tasks),
            'recovered': recovered_count,
            'moved_to_failed': moved_count,
            'results': results,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def release_locks(self, task_id: str, dry_run: bool = True) -> Dict:
        """
        Release any locks held by a task.

        Args:
            task_id: Task ID
            dry_run: If True, preview without making changes

        Returns:
            Dict with lock release results
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'error', 'message': f'Task {task_id} not found'}

        # Look for lock files
        lock_dir = task_dir / 'locks'
        released = []

        if lock_dir.exists():
            for lock_file in lock_dir.iterdir():
                released.append(str(lock_file))
                if not dry_run:
                    lock_file.unlink()

        return {
            'task_id': task_id,
            'locks_released': len(released),
            'lock_files': released,
            'dry_run': dry_run,
            'timestamp': time.time()
        }

    def validate_recovery(self, task_id: str) -> Dict:
        """
        Validate that a task recovered successfully.

        Args:
            task_id: Task ID to validate

        Returns:
            Dict with validation result
        """
        task_dir = self.active_dir / task_id

        if not task_dir.exists():
            return {'status': 'not_found', 'task_id': task_id}

        # Check that the heartbeat is recent
        heartbeat_file = task_dir / 'heartbeat.json'
        is_alive = False

        if heartbeat_file.exists():
            try:
                hb = json.loads(heartbeat_file.read_text())
                hb_age = time.time() - hb.get('ts', 0)
                # Consider the task alive if the heartbeat is newer than the timeout
                is_alive = hb_age < self.HEARTBEAT_TIMEOUT_SECS
            except Exception:
                pass

        # Check for a running process
        process_running = False
        pid_file = task_dir / 'pid'
        if pid_file.exists():
            try:
                pid = int(pid_file.read_text().strip())
                process_running = os.path.exists(f'/proc/{pid}')
            except Exception:
                pass

        # Overall recovery status
        recovery_status = 'recovered' if is_alive or process_running else 'stalled'

        return {
            'task_id': task_id,
            'recovery_status': recovery_status,
            'heartbeat_alive': is_alive,
            'process_running': process_running,
            'timestamp': time.time()
        }

    def _move_task_to_failed(self, task_dir: Path, task_id: str, failure_reason: str) -> bool:
        """Move a task from active to failed."""
        try:
            failed_task_dir = self.failed_dir / task_id
            failed_task_dir.mkdir(parents=True, exist_ok=True)

            # Copy all files
            for item in task_dir.iterdir():
                if item.is_file():
                    shutil.copy2(item, failed_task_dir / item.name)

            # Update meta with failure reason
            meta_file = failed_task_dir / 'meta.json'
            if meta_file.exists():
                meta = json.loads(meta_file.read_text())
            else:
                meta = {}

            meta['failure_reason'] = failure_reason
            meta['moved_to_failed_at'] = datetime.now().isoformat()
            meta_file.write_text(json.dumps(meta, indent=2))

            # Create error.txt
            error_file = failed_task_dir / 'error.txt'
            error_file.write_text(f"Task stalled: {failure_reason}\nMoved to failed: {datetime.now().isoformat()}")

            # Remove from active
            shutil.rmtree(task_dir)

            return True
        except Exception as e:
            print(f"Error moving task {task_id} to failed: {e}")
            return False


if __name__ == '__main__':
    recovery = ConductorRecovery()

    print("=" * 70)
    print("FINDING STALLED TASKS")
    print("=" * 70)
    stalled = recovery.find_stalled_tasks()
    print(f"Found {len(stalled)} stalled task(s)")
    for task in stalled[:5]:
        print(f"  - {task['task_id']}: {task['stall_reason']}")

    if stalled:
        print("\n" + "=" * 70)
        print("RECOVERY DRY RUN (preview only)")
        print("=" * 70)
        result = recovery.recover_all_stalled_tasks(dry_run=True)
        print(f"Would recover: {result['recovered']}")
        print(f"Would move to failed: {result['moved_to_failed']}")
        print("\nActions:")
        for r in result['results'][:1]:
            for action in r['actions']:
                print(f"  - {action}")
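
# After reviewing the dry-run output above, the same recovery can be applied
# for real with recovery.recover_all_stalled_tasks(dry_run=False); locks held
# by a single task can be cleared with recovery.release_locks(task_id, dry_run=False).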