Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:
- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
210
lib/job_recovery.py
Executable file
210
lib/job_recovery.py
Executable file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Job recovery and restart system.
|
||||
Handles resumption of incomplete jobs with session continuation.
|
||||
"""
|
||||
|
||||
import json
|
||||
import uuid
|
||||
import subprocess
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Root directory of all job state. Each job lives in JOBS_DIR/<job_id>/, from
# which this module reads meta.json, prompt.txt, output.log and run.sh.
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
|
||||
|
||||
def get_claude_session_id(job_id):
    """Look up the Claude session ID stored for a job, creating one if needed.

    Reads ``meta.json`` from the job's directory under ``JOBS_DIR``.  When the
    metadata holds a truthy ``claude_session_id`` it is returned unchanged.
    Otherwise a fresh ``sess_<12 hex chars>`` ID is generated, stored in the
    metadata together with a ``claude_session_created`` timestamp, and the
    file is rewritten.

    Args:
        job_id: Name of the job directory under ``JOBS_DIR``.

    Returns:
        ``(session_id, created)`` where ``created`` is True only when this
        call generated a new ID.  Any exception is printed and reported as
        ``(None, False)`` instead of being raised.
    """
    meta_path = JOBS_DIR / job_id / "meta.json"

    try:
        with open(meta_path) as src:
            job_meta = json.load(src)

        known_id = job_meta.get('claude_session_id')
        if not known_id:
            # No session recorded yet: mint one and persist it immediately.
            fresh_id = "sess_" + uuid.uuid4().hex[:12]
            job_meta['claude_session_id'] = fresh_id
            job_meta['claude_session_created'] = datetime.now().isoformat()

            with open(meta_path, 'w') as dst:
                json.dump(job_meta, dst, indent=2)

            return fresh_id, True

        return known_id, False
    except Exception as err:
        print(f"Error managing session ID: {err}")
        return None, False
|
||||
|
||||
def create_recovery_prompt(original_prompt):
    """Wrap a job prompt in the recovery-mode instructions.

    The result consists of the recovery checklist, the original task between
    two separator rules, and a closing RESUME line.

    The wrapping is idempotent.  ``prepare_restart`` writes the wrapped text
    back to ``prompt.txt``, so on a second recovery attempt the prompt read
    from disk is already wrapped; wrapping it again would nest the recovery
    preamble and make the prompt grow with every attempt.  A prompt that
    already carries both the recovery header and the RESUME footer is
    therefore returned unchanged.

    Args:
        original_prompt: The task prompt text (plain or already wrapped).

    Returns:
        The recovery prompt as a single string.
    """
    header = "RECOVERY MODE: Continue from where you left off\n"
    footer = "\nRESUME: Begin by following steps 1-4 above, then continue the work.\n"
    separator = "================================================================================"

    # Require both markers so an ordinary prompt that merely starts with the
    # header line is still wrapped.
    if original_prompt.startswith(header) and original_prompt.endswith(footer):
        return original_prompt

    checklist = """
IMPORTANT: Before resuming work, do ALL of the following:
1. Check what has been implemented so far
- Look at git status in the project directory
- Check /home/<project>/ for any partial work
- Review any existing output or reports
- Check the conductor directory for progress markers

2. Verify all artifacts from previous session
- List files created/modified since dispatch
- Check timestamps to understand what succeeded
- Review any logs or error messages

3. Determine current state
- Is implementation complete?
- Where exactly did work stop?
- What's the next logical step?

4. If session was interrupted by system overload:
- Do NOT retry identical operations
- Check for partial results first
- Build incrementally on what exists
- Report progress immediately

ORIGINAL TASK:
"""
    return (
        header
        + checklist
        + separator + "\n"
        + original_prompt + "\n"
        + separator + "\n"
        + footer
    )
|
||||
|
||||
def prepare_restart(job_id, use_session_continuation=True):
    """Rewrite a job's on-disk state so it can be launched again in recovery mode.

    Steps, in order:
      1. Read ``prompt.txt`` and resolve (or create) the Claude session ID.
      2. Read ``meta.json`` and derive the recovery metadata: status set to
         ``running``, ``recovery_attempt`` incremented, start timestamp,
         session ID and the previous ``exit_code`` recorded.
      3. If ``output.log`` is non-empty, copy it to
         ``output.previous.attempt<N-1>.log`` and truncate it.
      4. Overwrite ``meta.json`` and ``prompt.txt`` with the recovery versions.

    Args:
        job_id: Name of the job directory under ``JOBS_DIR``.
        use_session_continuation: Accepted so the signature matches
            ``restart_job``; this function does not read it.

    Returns:
        ``(info, None)`` on success, where ``info`` is a dict with the keys
        ``job_id``, ``session_id``, ``is_new_session``, ``recovery_attempt``,
        ``project`` and ``ready_to_restart``; ``(None, message)`` when the job
        files are missing or any step raises.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    prompt_file = job_dir / "prompt.txt"

    if not meta_file.exists() or not prompt_file.exists():
        return None, "Missing job files"

    try:
        with open(prompt_file) as f:
            original_prompt = f.read()

        # Resolve the session BEFORE loading metadata: get_claude_session_id()
        # may add claude_session_id / claude_session_created to meta.json, and
        # a copy loaded earlier would silently drop those fields when it is
        # written back below.
        session_id, is_new = get_claude_session_id(job_id)

        with open(meta_file) as f:
            original_meta = json.load(f)

        recovery_prompt = create_recovery_prompt(original_prompt)

        # Derive recovery metadata from the current on-disk metadata.
        attempt = original_meta.get('recovery_attempt', 0) + 1
        recovery_meta = original_meta.copy()
        recovery_meta['status'] = 'running'
        recovery_meta['recovery_attempt'] = attempt
        recovery_meta['recovery_started'] = datetime.now().isoformat()
        recovery_meta['claude_session_id'] = session_id
        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')

        # Keep the previous attempt's output, then start the log empty.
        output_file = job_dir / "output.log"
        if output_file.exists() and output_file.stat().st_size > 0:
            backup_file = job_dir / f"output.previous.attempt{attempt - 1}.log"
            backup_file.write_bytes(output_file.read_bytes())
            output_file.write_text("")

        with open(meta_file, 'w') as f:
            json.dump(recovery_meta, f, indent=2)

        with open(prompt_file, 'w') as f:
            f.write(recovery_prompt)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'is_new_session': is_new,
            'recovery_attempt': attempt,
            'project': original_meta.get('project'),
            'ready_to_restart': True,
        }, None
    except Exception as e:
        return None, str(e)
|
||||
|
||||
def restart_job(job_id, use_session_continuation=True):
    """Prepare a job for recovery and launch its ``run.sh`` again.

    Calls ``prepare_restart`` and then starts ``bash <job_dir>/run.sh`` with
    the parent of the job directory as working directory.  The child's PID
    and a restart timestamp are recorded in ``meta.json``.  The child is not
    waited for.

    Environment passed to the child (the parent's ``os.environ`` is copied,
    never modified, so nothing leaks into later launches from this process):
      * ``CLAUDE_SESSION_ID`` - set whenever a session ID is available.
      * ``CLAUDE_RECOVERY_MODE=1`` - additionally set when
        ``use_session_continuation`` is true.

    Args:
        job_id: Name of the job directory under ``JOBS_DIR``.
        use_session_continuation: Whether to flag the child as a recovery
            run that should continue the recorded session.

    Returns:
        On success a dict with ``job_id``, ``session_id``,
        ``recovery_attempt``, ``pid`` and ``status`` (``'restarted'``).
        On failure a dict with ``error`` and ``job_id`` (plus ``session_id``
        when the failure happened after preparation).
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    run_script = job_dir / "run.sh"

    # Check up front: prepare_restart() rewrites meta.json and prompt.txt and
    # truncates output.log, which must not happen if nothing can be launched.
    if not run_script.exists():
        return {'error': 'Missing run.sh', 'job_id': job_id}

    prep_result, error = prepare_restart(job_id, use_session_continuation)
    if prep_result is None:
        # str(exception) can be empty, so test the result rather than the message.
        return {'error': error or 'prepare_restart failed', 'job_id': job_id}

    session_id = prep_result['session_id']

    # Build the child's environment locally.  Popen requires every value to
    # be a string, so the session variable is only added when there is one.
    child_env = dict(os.environ)
    if session_id:
        child_env['CLAUDE_SESSION_ID'] = session_id
        if use_session_continuation:
            child_env['CLAUDE_RECOVERY_MODE'] = '1'

    try:
        proc = subprocess.Popen(
            ['bash', str(run_script)],
            cwd=str(job_dir.parent),
            env=child_env,
        )

        # Record the launch in the job metadata.
        with open(meta_file) as f:
            meta = json.load(f)

        meta['recovery_pid'] = proc.pid
        meta['recovery_restart_timestamp'] = datetime.now().isoformat()

        with open(meta_file, 'w') as f:
            json.dump(meta, f, indent=2)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'recovery_attempt': prep_result['recovery_attempt'],
            'pid': proc.pid,
            'status': 'restarted',
        }
    except Exception as e:
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
        }
|
||||
|
||||
if __name__ == "__main__":
    # CLI: job_recovery.py <job_id> [--restart]
    #   without --restart: only prepare the job's files for recovery
    #   with --restart:    prepare and launch the job again
    # The outcome is printed as JSON; the exit status is 1 on failure.
    import sys

    if len(sys.argv) < 2:
        print("Usage: job_recovery.py <job_id> [--restart]")
        sys.exit(1)

    job_id = sys.argv[1]
    do_restart = "--restart" in sys.argv

    if do_restart:
        result = restart_job(job_id)
    else:
        result, error = prepare_restart(job_id)
        if result is None:
            # Report the failure in the same shape restart_job uses instead
            # of printing "null" and dropping the message.
            result = {'error': error, 'job_id': job_id}

    print(json.dumps(result, indent=2))

    if 'error' in result:
        sys.exit(1)
|
||||
Reference in New Issue
Block a user