Files
luzia/lib/job_recovery.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00

211 lines
6.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Job recovery and restart system.
Handles resumption of incomplete jobs with session continuation.
"""
import json
import uuid
import subprocess
import os
from pathlib import Path
from datetime import datetime
# Root directory containing one subdirectory per job ID. The functions in this
# module read/write meta.json, prompt.txt, output.log and run.sh inside it.
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
def get_claude_session_id(job_id):
    """Return the Claude session ID recorded for a job, creating one if absent.

    Reads ``meta.json`` from the job's directory under ``JOBS_DIR``. If the
    metadata already holds a truthy ``claude_session_id`` it is returned
    unchanged. Otherwise a new ID of the form ``sess_<12 hex chars>`` is
    generated, stored in the metadata together with a
    ``claude_session_created`` ISO timestamp, and written back to the file.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.

    Returns:
        A ``(session_id, is_new)`` tuple. ``is_new`` is ``True`` only when
        the ID was generated by this call. If the metadata cannot be read,
        parsed (or is not a JSON object), or written, an error line is
        printed to stderr and ``(None, False)`` is returned.
    """
    import sys  # local import: the module only imports sys under __main__

    meta_file = JOBS_DIR / job_id / "meta.json"
    try:
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)
        if not isinstance(meta, dict):
            # Valid JSON but not an object: there is nowhere to store the ID.
            raise ValueError(f"{meta_file} does not contain a JSON object")

        session_id = meta.get('claude_session_id')
        if session_id:
            return session_id, False  # Existing session

        # No usable ID recorded yet: mint one and persist it.
        session_id = f"sess_{uuid.uuid4().hex[:12]}"
        meta['claude_session_id'] = session_id
        meta['claude_session_created'] = datetime.now().isoformat()
        with open(meta_file, 'w', encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        return session_id, True  # New session
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # Report on stderr so stdout stays clean for the CLI's JSON output.
        print(f"Error managing session ID: {e}", file=sys.stderr)
        return None, False
def create_recovery_prompt(original_prompt):
    """Wrap a task prompt in recovery-mode instructions.

    The result is a fixed recovery checklist, followed by the original task
    between two separator lines, followed by a closing RESUME instruction.

    The function is idempotent: a prompt that already starts with the
    recovery header is returned unchanged. This matters because the wrapped
    prompt is written back over the job's ``prompt.txt``, so a second
    recovery attempt would otherwise nest one recovery wrapper inside another.

    Args:
        original_prompt: The task prompt text (``str``).

    Returns:
        The recovery prompt as a single string.
    """
    # Already wrapped by an earlier recovery attempt: do not wrap again.
    if original_prompt.startswith("RECOVERY MODE:"):
        return original_prompt

    header = """RECOVERY MODE: Continue from where you left off
IMPORTANT: Before resuming work, do ALL of the following:
1. Check what has been implemented so far
- Look at git status in the project directory
- Check /home/<project>/ for any partial work
- Review any existing output or reports
- Check the conductor directory for progress markers
2. Verify all artifacts from previous session
- List files created/modified since dispatch
- Check timestamps to understand what succeeded
- Review any logs or error messages
3. Determine current state
- Is implementation complete?
- Where exactly did work stop?
- What's the next logical step?
4. If session was interrupted by system overload:
- Do NOT retry identical operations
- Check for partial results first
- Build incrementally on what exists
- Report progress immediately
ORIGINAL TASK:
================================================================================
"""
    footer = """
================================================================================
RESUME: Begin by following steps 1-4 above, then continue the work.
"""
    return header + original_prompt + footer
def prepare_restart(job_id, use_session_continuation=True):
    """Prepare a job's on-disk state for a recovery run.

    Steps, all inside the job's directory under ``JOBS_DIR``:

    1. Resolve (or create) the job's Claude session ID.
    2. Load ``meta.json`` and ``prompt.txt``.
    3. Copy a non-empty ``output.log`` to
       ``output.previous.attempt<N>.log`` (``N`` is the attempt count before
       this one) and truncate ``output.log``.
    4. Rewrite ``meta.json`` with status ``running``, an incremented
       ``recovery_attempt``, a ``recovery_started`` timestamp, the session ID
       and the previous ``exit_code`` (``'unknown'`` if absent).
    5. Overwrite ``prompt.txt`` with the recovery prompt.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        ``(info, None)`` on success, where ``info`` is a dict with keys
        ``job_id``, ``session_id``, ``is_new_session``, ``recovery_attempt``,
        ``project`` and ``ready_to_restart``; or ``(None, message)`` when
        the job files are missing or any step raises.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    prompt_file = job_dir / "prompt.txt"
    if not meta_file.exists() or not prompt_file.exists():
        return None, "Missing job files"

    try:
        # Resolve the session ID BEFORE loading the metadata:
        # get_claude_session_id() may itself write new fields
        # (claude_session_id, claude_session_created) to meta.json, and the
        # copy saved below must not be a stale snapshot that drops them.
        session_id, is_new = get_claude_session_id(job_id)

        with open(meta_file, encoding="utf-8") as f:
            original_meta = json.load(f)
        with open(prompt_file, encoding="utf-8") as f:
            original_prompt = f.read()

        recovery_prompt = create_recovery_prompt(original_prompt)

        # Build the metadata for this attempt on top of the current state.
        recovery_meta = original_meta.copy()
        attempt = recovery_meta.get('recovery_attempt', 0) + 1
        recovery_meta['status'] = 'running'
        recovery_meta['recovery_attempt'] = attempt
        recovery_meta['recovery_started'] = datetime.now().isoformat()
        recovery_meta['claude_session_id'] = session_id
        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')

        # Keep the previous attempt's output, then start with an empty log.
        output_file = job_dir / "output.log"
        if output_file.exists() and output_file.stat().st_size > 0:
            backup_file = job_dir / f"output.previous.attempt{attempt - 1}.log"
            backup_file.write_bytes(output_file.read_bytes())
            output_file.write_text("")  # Clear for new attempt

        with open(meta_file, 'w', encoding="utf-8") as f:
            json.dump(recovery_meta, f, indent=2)
        with open(prompt_file, 'w', encoding="utf-8") as f:
            f.write(recovery_prompt)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'is_new_session': is_new,
            'recovery_attempt': attempt,
            'project': original_meta.get('project'),
            'ready_to_restart': True,
        }, None
    except Exception as e:
        # Boundary for callers: report any failure as a message, never raise.
        return None, str(e)
def restart_job(job_id, use_session_continuation=True):
    """Restart a job by running its ``run.sh``, optionally continuing a session.

    The job is first prepared with ``prepare_restart()``. Then
    ``bash <job_dir>/run.sh`` is launched (not waited for) with the jobs
    directory as working directory. When ``use_session_continuation`` is true
    and a session ID is available, the child's environment additionally gets
    ``CLAUDE_SESSION_ID`` and ``CLAUDE_RECOVERY_MODE=1``. The environment of
    the calling process is never modified. After the launch, ``recovery_pid``
    and ``recovery_restart_timestamp`` are recorded in ``meta.json``.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: Whether to pass the session ID to the child.

    Returns:
        On success, a dict with ``job_id``, ``session_id``,
        ``recovery_attempt``, ``pid`` and ``status`` (``'restarted'``).
        On failure, a dict with ``error`` and ``job_id``; failures after
        preparation also carry ``session_id``, and ``pid`` if the child
        process had already been started.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    run_script = job_dir / "run.sh"

    # Refuse before touching any job state if there is nothing to run.
    if not run_script.exists():
        return {'error': 'Missing job files', 'job_id': job_id}

    prep_result, error = prepare_restart(job_id, use_session_continuation)
    if error:
        return {'error': error, 'job_id': job_id}
    session_id = prep_result['session_id']

    # Give the child its own environment copy instead of mutating os.environ,
    # and only add the session variables when there is a real ID to pass
    # (a None value in env would make Popen raise).
    child_env = dict(os.environ)
    if use_session_continuation and session_id:
        child_env['CLAUDE_SESSION_ID'] = session_id
        child_env['CLAUDE_RECOVERY_MODE'] = '1'

    proc = None
    try:
        proc = subprocess.Popen(
            ['bash', str(run_script)],
            cwd=str(job_dir.parent),
            env=child_env,
        )

        # Record how to find the restarted process.
        with open(meta_file, encoding="utf-8") as f:
            meta = json.load(f)
        meta['recovery_pid'] = proc.pid
        meta['recovery_restart_timestamp'] = datetime.now().isoformat()
        with open(meta_file, 'w', encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'recovery_attempt': prep_result['recovery_attempt'],
            'pid': proc.pid,
            'status': 'restarted',
        }
    except Exception as e:
        failure = {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
        }
        if proc is not None:
            # The child is running even though bookkeeping failed; keep its
            # pid so the caller can still track it.
            failure['pid'] = proc.pid
        return failure
if __name__ == "__main__":
    # CLI: job_recovery.py <job_id> [--restart]
    # Without --restart the job is only prepared; with it, run.sh is launched.
    # The result is printed as JSON; the exit status is 1 if it has an error.
    import sys

    cli_args = sys.argv[1:]
    do_restart = "--restart" in cli_args
    # The job ID is the first argument that is not the --restart flag, so
    # "job_recovery.py --restart" alone is a usage error, not a job name.
    positional = [arg for arg in cli_args if arg != "--restart"]
    if not positional:
        print("Usage: job_recovery.py <job_id> [--restart]")
        sys.exit(1)
    job_id = positional[0]

    if do_restart:
        result = restart_job(job_id)
    else:
        result, error = prepare_restart(job_id)
        if error:
            # Surface the failure instead of printing "null".
            result = {'error': error, 'job_id': job_id}

    print(json.dumps(result, indent=2))
    sys.exit(1 if 'error' in result else 0)