Based on claude-code-tools TmuxCLIController, this refactor:
- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
211 lines
6.7 KiB
Python
Executable File
211 lines
6.7 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Job recovery and restart system.

Handles resumption of incomplete jobs with session continuation.
"""

import json
import uuid
import subprocess
import os
from pathlib import Path
from datetime import datetime

# Root directory with one sub-directory per job. The functions in this module
# read and write meta.json, prompt.txt, output.log and run.sh inside
# JOBS_DIR / <job_id>.
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
|
|
|
|
def get_claude_session_id(job_id, jobs_dir=None):
    """Return the Claude session ID stored for a job, creating one if absent.

    Reads ``<jobs_dir>/<job_id>/meta.json``. If the metadata already holds a
    truthy ``claude_session_id`` it is returned as-is. Otherwise a new ID of
    the form ``sess_<12 hex chars>`` is generated, stored in the metadata
    together with a ``claude_session_created`` ISO timestamp, and the file is
    rewritten.

    Args:
        job_id: Name of the job's directory.
        jobs_dir: Directory containing the job directories. Defaults to the
            module-level ``JOBS_DIR``.

    Returns:
        A ``(session_id, is_new)`` tuple. ``is_new`` is True only when the ID
        was generated by this call. ``(None, False)`` is returned when the
        metadata cannot be read, parsed or written.
    """
    import sys

    base_dir = JOBS_DIR if jobs_dir is None else Path(jobs_dir)
    meta_file = base_dir / job_id / "meta.json"

    try:
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)

        # Valid JSON that is not an object (e.g. a list) cannot carry the
        # session fields; treat it like any other unreadable metadata.
        if not isinstance(meta, dict):
            raise ValueError(f"{meta_file} does not contain a JSON object")

        session_id = meta.get('claude_session_id')
        if session_id:
            return session_id, False  # Existing session

        # Create new session ID
        session_id = f"sess_{uuid.uuid4().hex[:12]}"
        meta['claude_session_id'] = session_id
        meta['claude_session_created'] = datetime.now().isoformat()

        with open(meta_file, 'w', encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return session_id, True  # New session
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses. Report on stderr so the JSON that the command-line
        # entry point prints on stdout stays parseable.
        print(f"Error managing session ID: {e}", file=sys.stderr)
        return None, False
|
|
|
|
def create_recovery_prompt(original_prompt):
    """Wrap a job prompt in recovery-mode instructions.

    The result starts with a "RECOVERY MODE" checklist, followed by the
    original task between two separator lines and a closing "RESUME"
    instruction.

    The wrapping is idempotent: a prompt that already starts with the
    recovery header is returned unchanged. prepare_restart() writes the
    wrapped prompt back to prompt.txt, so without this guard every further
    recovery attempt would nest another copy of the instructions around the
    task.

    Args:
        original_prompt: Prompt text of the interrupted job.

    Returns:
        The recovery prompt as a string.
    """
    recovery_header = "RECOVERY MODE: Continue from where you left off"

    # Already wrapped by an earlier recovery attempt: do not wrap again.
    if original_prompt.startswith(recovery_header):
        return original_prompt

    recovery_prefix = recovery_header + """

IMPORTANT: Before resuming work, do ALL of the following:
1. Check what has been implemented so far
- Look at git status in the project directory
- Check /home/<project>/ for any partial work
- Review any existing output or reports
- Check the conductor directory for progress markers

2. Verify all artifacts from previous session
- List files created/modified since dispatch
- Check timestamps to understand what succeeded
- Review any logs or error messages

3. Determine current state
- Is implementation complete?
- Where exactly did work stop?
- What's the next logical step?

4. If session was interrupted by system overload:
- Do NOT retry identical operations
- Check for partial results first
- Build incrementally on what exists
- Report progress immediately

ORIGINAL TASK:
================================================================================
"""

    recovery_suffix = """
================================================================================

RESUME: Begin by following steps 1-4 above, then continue the work.
"""

    return recovery_prefix + original_prompt + recovery_suffix
|
|
|
|
def prepare_restart(job_id, use_session_continuation=True):
    """Prepare a job directory for a recovery run.

    Working inside ``JOBS_DIR / job_id`` this:

    1. resolves (or creates) the Claude session ID for the job,
    2. wraps the contents of ``prompt.txt`` in the recovery prompt,
    3. copies a non-empty ``output.log`` to
       ``output.previous.attempt<N>.log`` and truncates it,
    4. rewrites ``meta.json`` with status ``running``, an incremented
       ``recovery_attempt``, the start timestamp, the session ID and the
       previous exit code,
    5. overwrites ``prompt.txt`` with the recovery prompt.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: Accepted so the signature matches
            restart_job(); it is not consulted here, the session ID is always
            resolved.

    Returns:
        ``(info, None)`` on success, where ``info`` is a dict with the keys
        ``job_id``, ``session_id``, ``is_new_session``, ``recovery_attempt``,
        ``project`` and ``ready_to_restart``; ``(None, message)`` on failure.
    """
    import shutil

    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    prompt_file = job_dir / "prompt.txt"

    if not meta_file.exists() or not prompt_file.exists():
        return None, "Missing job files"

    try:
        # Resolve the session ID *before* loading the metadata. Resolving it
        # may add claude_session_id / claude_session_created to meta.json; a
        # snapshot taken earlier would silently drop the creation timestamp
        # when the recovery metadata is saved below.
        session_id, is_new = get_claude_session_id(job_id)

        # Load original metadata and prompt
        with open(meta_file, encoding="utf-8") as f:
            original_meta = json.load(f)

        with open(prompt_file, encoding="utf-8") as f:
            original_prompt = f.read()

        # Create recovery prompt
        recovery_prompt = create_recovery_prompt(original_prompt)

        # Create recovery metadata
        recovery_meta = original_meta.copy()
        recovery_meta['status'] = 'running'
        recovery_meta['recovery_attempt'] = recovery_meta.get('recovery_attempt', 0) + 1
        recovery_meta['recovery_started'] = datetime.now().isoformat()
        recovery_meta['claude_session_id'] = session_id
        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')

        # Backup original output. The backup is numbered after the attempt
        # that produced it (the one before the attempt being prepared).
        output_file = job_dir / "output.log"
        if output_file.exists() and output_file.stat().st_size > 0:
            backup_file = job_dir / f"output.previous.attempt{recovery_meta['recovery_attempt'] - 1}.log"
            # Stream the copy instead of loading the whole log into memory.
            shutil.copyfile(output_file, backup_file)
            output_file.write_text("")  # Clear for new attempt

        # Save recovery metadata
        with open(meta_file, 'w', encoding="utf-8") as f:
            json.dump(recovery_meta, f, indent=2)

        # Save recovery prompt
        with open(prompt_file, 'w', encoding="utf-8") as f:
            f.write(recovery_prompt)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'is_new_session': is_new,
            'recovery_attempt': recovery_meta['recovery_attempt'],
            'project': original_meta.get('project'),
            'ready_to_restart': True,
        }, None
    except Exception as e:
        # Deliberately broad: callers rely on the (None, message) contract
        # rather than on exceptions.
        return None, str(e)
|
|
|
|
def restart_job(job_id, use_session_continuation=True):
    """Restart a job with optional session continuation.

    Runs prepare_restart() for the job, launches ``bash <job_dir>/run.sh``
    with ``JOBS_DIR`` as working directory, and records the child's PID and
    the restart timestamp in ``meta.json``. The child is started and not
    waited for.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: When true and a session ID is available,
            ``CLAUDE_RECOVERY_MODE=1`` is exported to the child in addition
            to ``CLAUDE_SESSION_ID``.

    Returns:
        On success a dict with ``job_id``, ``session_id``,
        ``recovery_attempt``, ``pid`` and ``status`` (``'restarted'``).
        On failure a dict with ``error`` and ``job_id``, plus ``session_id``
        once it is known and ``pid`` if the process had already been started.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    run_script = job_dir / "run.sh"

    # Check the script first so an unrunnable job is not marked as running
    # (and its recovery attempt not counted) by prepare_restart().
    if not run_script.is_file():
        return {'error': f"Missing run script: {run_script}", 'job_id': job_id}

    # Prepare recovery
    prep_result, error = prepare_restart(job_id, use_session_continuation)
    if error:
        return {'error': error, 'job_id': job_id}

    session_id = prep_result['session_id']

    # Build command
    cmd_parts = ['bash', str(run_script)]

    # Build the child's environment on a copy. Writing to os.environ would
    # leak the session variables into this process and into every job it
    # launches afterwards.
    child_env = dict(os.environ)
    if session_id:
        # Environment values must be strings; a missing session ID (None)
        # is simply not exported instead of making Popen fail.
        # NOTE(review): the session ID is exported even when
        # use_session_continuation is False, as the original Popen call
        # did - confirm whether the flag should suppress it as well.
        child_env['CLAUDE_SESSION_ID'] = str(session_id)
        if use_session_continuation:
            child_env['CLAUDE_RECOVERY_MODE'] = '1'

    # Launch restart
    try:
        proc = subprocess.Popen(
            cmd_parts,
            cwd=str(job_dir.parent),
            env=child_env,
        )
    except Exception as e:
        # Deliberately broad: callers rely on the error-dict contract.
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
        }

    # Record the launch in the job metadata
    try:
        with open(meta_file) as f:
            meta = json.load(f)

        meta['recovery_pid'] = proc.pid
        meta['recovery_restart_timestamp'] = datetime.now().isoformat()

        with open(meta_file, 'w') as f:
            json.dump(meta, f, indent=2)
    except Exception as e:
        # The process is already running; include its PID so the caller can
        # still track or stop it.
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
            'pid': proc.pid,
        }

    return {
        'job_id': job_id,
        'session_id': session_id,
        'recovery_attempt': prep_result['recovery_attempt'],
        'pid': proc.pid,
        'status': 'restarted',
    }
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point:
    #   job_recovery.py <job_id>            prepare the job for recovery only
    #   job_recovery.py <job_id> --restart  prepare and relaunch the job
    # The result is printed as JSON; the exit status is 1 on error.
    import sys

    if len(sys.argv) < 2:
        print("Usage: job_recovery.py <job_id> [--restart]")
        sys.exit(1)

    job_id = sys.argv[1]
    do_restart = "--restart" in sys.argv

    if do_restart:
        result = restart_job(job_id)
    else:
        result, error = prepare_restart(job_id)
        if error:
            # prepare_restart() returns (None, message) on failure. Report it
            # in the same shape restart_job() uses instead of printing "null".
            result = {'error': error, 'job_id': job_id}

    print(json.dumps(result, indent=2))

    if 'error' in result:
        sys.exit(1)
|