#!/usr/bin/env python3
"""
Job recovery and restart system.

Handles resumption of incomplete jobs with session continuation.

Each job lives in ``JOBS_DIR / <job_id>`` and is described by three files
used here: ``meta.json`` (job metadata), ``prompt.txt`` (the prompt handed
to the job) and ``run.sh`` (the launcher script). ``output.log`` is rotated
on each recovery attempt when it is non-empty.
"""

import json
import os
import subprocess
import sys
import uuid
from datetime import datetime
from pathlib import Path

JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")

# First line of every recovery prompt; used to detect a prompt that has
# already been wrapped so repeated recoveries do not nest the preamble.
_RECOVERY_MARKER = "RECOVERY MODE: Continue from where you left off"


def get_claude_session_id(job_id):
    """Get existing session ID or create new one.

    Reads ``meta.json`` of the job. If it already carries a truthy
    ``claude_session_id`` that value is returned; otherwise a fresh
    ``sess_<12 hex chars>`` ID is generated, stored in ``meta.json`` together
    with a ``claude_session_created`` timestamp, and returned.

    Args:
        job_id: Name of the job directory below ``JOBS_DIR``.

    Returns:
        ``(session_id, is_new)``. On any failure (missing/corrupt metadata,
        write error) the error is reported on stderr and ``(None, False)``
        is returned.
    """
    meta_file = JOBS_DIR / job_id / "meta.json"

    try:
        with open(meta_file) as f:
            meta = json.load(f)

        session_id = meta.get('claude_session_id')
        if session_id:
            return session_id, False  # Existing session

        # Create new session ID
        session_id = f"sess_{uuid.uuid4().hex[:12]}"
        meta['claude_session_id'] = session_id
        meta['claude_session_created'] = datetime.now().isoformat()

        with open(meta_file, 'w') as f:
            json.dump(meta, f, indent=2)

        return session_id, True  # New session
    except Exception as e:
        # Best-effort: callers treat a missing session ID as "no session".
        # Report on stderr so the JSON printed by the CLI stays parseable.
        print(f"Error managing session ID: {e}", file=sys.stderr)
        return None, False


def create_recovery_prompt(original_prompt):
    """Add recovery prefix to original prompt.

    The original prompt is embedded between two separator lines, preceded by
    the recovery checklist and followed by a resume instruction.

    A prompt that already starts with the recovery header is returned
    unchanged, so wrapping is idempotent across repeated recovery attempts.

    Args:
        original_prompt: Prompt text of the interrupted job.

    Returns:
        The recovery prompt as a string.
    """
    if original_prompt.startswith(_RECOVERY_MARKER):
        return original_prompt

    # NOTE(review): "/home//" below looks like a placeholder was lost
    # (e.g. a project/user name) — confirm the intended path.
    recovery_prefix = """RECOVERY MODE: Continue from where you left off

IMPORTANT: Before resuming work, do ALL of the following:

1. Check what has been implemented so far
   - Look at git status in the project directory
   - Check /home// for any partial work
   - Review any existing output or reports
   - Check the conductor directory for progress markers

2. Verify all artifacts from previous session
   - List files created/modified since dispatch
   - Check timestamps to understand what succeeded
   - Review any logs or error messages

3. Determine current state
   - Is implementation complete?
   - Where exactly did work stop?
   - What's the next logical step?

4. If session was interrupted by system overload:
   - Do NOT retry identical operations
   - Check for partial results first
   - Build incrementally on what exists
   - Report progress immediately

ORIGINAL TASK:
================================================================================
""" + original_prompt + """
================================================================================

RESUME: Begin by following steps 1-4 above, then continue the work.
"""

    return recovery_prefix


def prepare_restart(job_id, use_session_continuation=True):
    """Prepare a job for restart.

    Steps performed on the job directory:

    1. Obtain (or create) the Claude session ID.
    2. Wrap ``prompt.txt`` in the recovery prompt.
    3. Mark ``meta.json`` as ``running`` and record the recovery attempt
       number, start time, session ID and previous exit code.
    4. If ``output.log`` is non-empty, copy it to
       ``output.previous.attempt<N-1>.log`` and truncate it.

    Args:
        job_id: Name of the job directory below ``JOBS_DIR``.
        use_session_continuation: Accepted for interface symmetry with
            ``restart_job``; not consulted by this function.

    Returns:
        ``(info, None)`` on success, where ``info`` is a dict with
        ``job_id``, ``session_id``, ``is_new_session``, ``recovery_attempt``,
        ``project`` and ``ready_to_restart``; ``(None, error_message)`` on
        failure.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    prompt_file = job_dir / "prompt.txt"

    if not meta_file.exists() or not prompt_file.exists():
        return None, "Missing job files"

    try:
        # Get the session ID first: creating one writes to meta.json, and the
        # metadata must be loaded afterwards so that the new
        # 'claude_session_created' field is not overwritten with a stale copy.
        session_id, is_new = get_claude_session_id(job_id)

        # Load original metadata and prompt
        with open(meta_file) as f:
            original_meta = json.load(f)

        with open(prompt_file) as f:
            original_prompt = f.read()

        # Create recovery prompt (no-op if the prompt is already wrapped)
        recovery_prompt = create_recovery_prompt(original_prompt)

        # Create recovery metadata
        recovery_meta = original_meta.copy()
        recovery_meta['status'] = 'running'
        recovery_meta['recovery_attempt'] = recovery_meta.get('recovery_attempt', 0) + 1
        recovery_meta['recovery_started'] = datetime.now().isoformat()
        recovery_meta['claude_session_id'] = session_id
        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')

        # Backup original output
        output_file = job_dir / "output.log"
        if output_file.exists() and output_file.stat().st_size > 0:
            backup_file = job_dir / f"output.previous.attempt{recovery_meta['recovery_attempt'] - 1}.log"
            backup_file.write_bytes(output_file.read_bytes())
            output_file.write_text("")  # Clear for new attempt

        # Save recovery metadata
        with open(meta_file, 'w') as f:
            json.dump(recovery_meta, f, indent=2)

        # Save recovery prompt
        with open(prompt_file, 'w') as f:
            f.write(recovery_prompt)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'is_new_session': is_new,
            'recovery_attempt': recovery_meta['recovery_attempt'],
            'project': original_meta.get('project'),
            'ready_to_restart': True,
        }, None
    except Exception as e:
        # Never return an empty message: callers use it to detect failure.
        return None, str(e) or type(e).__name__


def restart_job(job_id, use_session_continuation=True):
    """Restart a job with optional session continuation.

    Prepares the job via ``prepare_restart`` and launches
    ``bash <job_dir>/run.sh`` with the jobs directory as working directory.
    The session ID is handed to the child through the ``CLAUDE_SESSION_ID``
    environment variable; when ``use_session_continuation`` is true and a
    session ID is available, ``CLAUDE_RECOVERY_MODE=1`` is set as well.
    The environment of the current process is left untouched.

    Args:
        job_id: Name of the job directory below ``JOBS_DIR``.
        use_session_continuation: Whether to flag the child as a recovery
            run of an existing session.

    Returns:
        On success a dict with ``job_id``, ``session_id``,
        ``recovery_attempt``, ``pid`` and ``status == 'restarted'``.
        On failure a dict with ``error`` and ``job_id`` (plus ``session_id``
        and, if the process was already started, ``pid``).
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    run_script = job_dir / "run.sh"

    # Refuse before touching any job state if there is nothing to launch.
    if job_dir.is_dir() and not run_script.is_file():
        return {'error': 'Missing run.sh', 'job_id': job_id}

    # Prepare recovery
    prep_result, error = prepare_restart(job_id, use_session_continuation)
    if prep_result is None:
        return {'error': error, 'job_id': job_id}

    session_id = prep_result['session_id']

    # Build command
    cmd_parts = ['bash', str(run_script)]

    # Build the child's environment on a copy so the settings of this job do
    # not leak into the current process (and into later jobs it starts).
    # Session injection into the claude command itself would need to be
    # handled by the CLI wrapper; for now it is passed via environment.
    child_env = dict(os.environ)
    if session_id:
        # Popen requires string values; skip when no session ID is available.
        child_env['CLAUDE_SESSION_ID'] = session_id
        if use_session_continuation:
            child_env['CLAUDE_RECOVERY_MODE'] = '1'

    # Launch restart
    try:
        proc = subprocess.Popen(
            cmd_parts,
            cwd=str(job_dir.parent),
            env=child_env,
        )
    except Exception as e:
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
        }

    # Record the new process in the job metadata.
    try:
        with open(meta_file) as f:
            meta = json.load(f)
        meta['recovery_pid'] = proc.pid
        meta['recovery_restart_timestamp'] = datetime.now().isoformat()
        with open(meta_file, 'w') as f:
            json.dump(meta, f, indent=2)
    except Exception as e:
        # The process is already running: report its PID with the error so
        # the caller can still track it.
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
            'pid': proc.pid,
        }

    return {
        'job_id': job_id,
        'session_id': session_id,
        'recovery_attempt': prep_result['recovery_attempt'],
        'pid': proc.pid,
        'status': 'restarted',
    }


def _main(argv):
    """Command-line entry point; returns the process exit code.

    Prints the result of ``prepare_restart`` (or of ``restart_job`` when
    ``--restart`` is given) as JSON. Failures are printed as a JSON object
    with an ``error`` key and yield exit code 1.
    """
    positional = [arg for arg in argv[1:] if not arg.startswith("--")]
    if not positional:
        print("Usage: job_recovery.py <job_id> [--restart]")
        return 1

    job_id = positional[0]
    do_restart = "--restart" in argv

    if do_restart:
        result = restart_job(job_id)
    else:
        result, error = prepare_restart(job_id)
        if result is None:
            result = {'error': error, 'job_id': job_id}

    print(json.dumps(result, indent=2))
    return 1 if 'error' in result else 0


if __name__ == "__main__":
    sys.exit(_main(sys.argv))