# luzia/lib/job_recovery.py

#!/usr/bin/env python3
"""
Job recovery and restart system.
Handles resumption of incomplete jobs with session continuation.
"""

import json
import uuid
import subprocess
import os
from pathlib import Path
from datetime import datetime

# Root directory for job state. The functions in this module look for each
# job's files (meta.json, prompt.txt, output.log, run.sh) in a sub-directory
# of this path named after the job ID.
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")

def get_claude_session_id(job_id):
    """Return the session ID stored for a job, minting one when absent.

    The job's ``meta.json`` (under ``JOBS_DIR / job_id``) is consulted for a
    ``claude_session_id`` entry. When the entry is missing or empty, a new
    ``sess_<12 hex chars>`` identifier is generated and written back to
    ``meta.json`` together with a ``claude_session_created`` timestamp.

    Returns:
        A ``(session_id, created)`` tuple. ``created`` is True only when a
        new ID was generated by this call. If anything goes wrong while
        reading or writing the metadata, the error is printed and
        ``(None, False)`` is returned.
    """
    meta_path = JOBS_DIR / job_id / "meta.json"

    try:
        with open(meta_path) as handle:
            job_meta = json.load(handle)

        known_id = job_meta.get('claude_session_id')
        if not known_id:
            # No usable session recorded yet: mint one and persist it.
            fresh_id = "sess_" + uuid.uuid4().hex[:12]
            job_meta['claude_session_id'] = fresh_id
            job_meta['claude_session_created'] = datetime.now().isoformat()

            with open(meta_path, 'w') as handle:
                json.dump(job_meta, handle, indent=2)

            return fresh_id, True

        return known_id, False
    except Exception as exc:
        print(f"Error managing session ID: {exc}")
        return None, False

def create_recovery_prompt(original_prompt):
    """Wrap a job prompt in recovery-mode instructions.

    The returned text consists of a fixed recovery checklist, the original
    task delimited by separator lines, and a closing RESUME instruction.

    The wrapping is idempotent: a prompt that already begins with the
    recovery header is returned unchanged. prepare_restart() writes the
    wrapped prompt back to prompt.txt, so without this guard each further
    recovery attempt would nest another copy of the checklist around the
    previous one.

    Args:
        original_prompt: The task prompt text to wrap.

    Returns:
        The recovery prompt as a string.
    """
    recovery_marker = "RECOVERY MODE: Continue from where you left off"

    # Already wrapped by an earlier recovery attempt: do not wrap again.
    if original_prompt.startswith(recovery_marker):
        return original_prompt

    recovery_prompt = recovery_marker + """

IMPORTANT: Before resuming work, do ALL of the following:
1. Check what has been implemented so far
   - Look at git status in the project directory
   - Check /home/<project>/ for any partial work
   - Review any existing output or reports
   - Check the conductor directory for progress markers

2. Verify all artifacts from previous session
   - List files created/modified since dispatch
   - Check timestamps to understand what succeeded
   - Review any logs or error messages

3. Determine current state
   - Is implementation complete?
   - Where exactly did work stop?
   - What's the next logical step?

4. If session was interrupted by system overload:
   - Do NOT retry identical operations
   - Check for partial results first
   - Build incrementally on what exists
   - Report progress immediately

ORIGINAL TASK:
================================================================================
""" + original_prompt + """
================================================================================

RESUME: Begin by following steps 1-4 above, then continue the work.
"""
    return recovery_prompt

def prepare_restart(job_id, use_session_continuation=True):
    """Prepare a job's on-disk state for a recovery restart.

    Steps performed inside the job directory (``JOBS_DIR / job_id``):

    1. Obtain (or create) the job's session ID.
    2. Bump ``recovery_attempt`` in the metadata and mark the job
       ``running``, recording the previous exit code.
    3. Copy a non-empty ``output.log`` to
       ``output.previous.attempt<N>.log`` and truncate the original.
    4. Rewrite ``meta.json`` and replace ``prompt.txt`` with the recovery
       prompt.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: Accepted for signature parity with
            restart_job(); this function does not consult it.

    Returns:
        ``(info, None)`` on success, where ``info`` is a dict with the keys
        ``job_id``, ``session_id``, ``is_new_session``, ``recovery_attempt``,
        ``project`` and ``ready_to_restart``; ``(None, message)`` when the
        job files are missing or any step raises.
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    prompt_file = job_dir / "prompt.txt"

    if not meta_file.exists() or not prompt_file.exists():
        return None, "Missing job files"

    try:
        # Resolve the session ID BEFORE loading the metadata.
        # get_claude_session_id() may write new session fields
        # (claude_session_id / claude_session_created) into meta.json; loading
        # afterwards ensures the copy saved below includes them instead of
        # overwriting them with a stale snapshot.
        session_id, is_new = get_claude_session_id(job_id)

        # Load current metadata and prompt
        with open(meta_file) as f:
            original_meta = json.load(f)

        with open(prompt_file) as f:
            original_prompt = f.read()

        # Create recovery prompt
        recovery_prompt = create_recovery_prompt(original_prompt)

        # Create recovery metadata
        recovery_meta = original_meta.copy()
        recovery_meta['status'] = 'running'
        recovery_meta['recovery_attempt'] = recovery_meta.get('recovery_attempt', 0) + 1
        recovery_meta['recovery_started'] = datetime.now().isoformat()
        recovery_meta['claude_session_id'] = session_id
        recovery_meta['recovery_previous_exit_code'] = recovery_meta.get('exit_code', 'unknown')

        # Backup the previous attempt's output; the suffix is the number of
        # the attempt that produced it (0 for the original run).
        output_file = job_dir / "output.log"
        if output_file.exists() and output_file.stat().st_size > 0:
            backup_file = job_dir / f"output.previous.attempt{recovery_meta['recovery_attempt'] - 1}.log"
            backup_file.write_bytes(output_file.read_bytes())
            output_file.write_text("")  # Clear for new attempt

        # Save recovery metadata
        with open(meta_file, 'w') as f:
            json.dump(recovery_meta, f, indent=2)

        # Save recovery prompt
        with open(prompt_file, 'w') as f:
            f.write(recovery_prompt)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'is_new_session': is_new,
            'recovery_attempt': recovery_meta['recovery_attempt'],
            'project': original_meta.get('project'),
            'ready_to_restart': True,
        }, None
    except Exception as e:
        return None, str(e)

def restart_job(job_id, use_session_continuation=True):
    """Restart a job with optional session continuation.

    Runs prepare_restart() and then launches the job's ``run.sh`` with
    ``bash`` (working directory: the parent of the job directory). The
    child's PID and a restart timestamp are recorded in ``meta.json``.

    When ``use_session_continuation`` is true and a session ID is
    available, the child process receives ``CLAUDE_SESSION_ID`` and
    ``CLAUDE_RECOVERY_MODE=1`` in its environment. The variables are set
    only on the child's environment; this process's ``os.environ`` is left
    untouched so one job's session cannot leak into later launches.

    Args:
        job_id: Name of the job's directory under ``JOBS_DIR``.
        use_session_continuation: Whether to hand the session ID to the
            restarted job.

    Returns:
        On success, a dict with ``job_id``, ``session_id``,
        ``recovery_attempt``, ``pid`` and ``status`` (``'restarted'``).
        On failure, a dict with ``error`` and ``job_id`` (plus
        ``session_id`` when the failure happened at launch time).
    """
    job_dir = JOBS_DIR / job_id
    meta_file = job_dir / "meta.json"
    run_script = job_dir / "run.sh"

    # Refuse early when there is nothing to launch, before prepare_restart()
    # bumps the attempt counter and rewrites the job's files.
    if not run_script.exists():
        return {'error': 'Missing run script', 'job_id': job_id}

    # Prepare recovery
    prep_result, error = prepare_restart(job_id, use_session_continuation)
    if error:
        return {'error': error, 'job_id': job_id}

    session_id = prep_result['session_id']

    # Build command
    cmd_parts = ['bash', str(run_script)]

    # Build the child's environment from a copy of ours. Injecting the
    # session ID into the claude command itself would need to be handled by
    # the CLI wrapper; for now it is passed as an environment variable.
    # session_id may be None (session lookup failed); environment values
    # must be strings, so it is only added when present.
    child_env = dict(os.environ)
    if use_session_continuation and session_id:
        child_env['CLAUDE_SESSION_ID'] = session_id
        child_env['CLAUDE_RECOVERY_MODE'] = '1'

    # Launch restart
    try:
        proc = subprocess.Popen(
            cmd_parts,
            cwd=str(job_dir.parent),
            env=child_env,
        )

        with open(meta_file) as f:
            meta = json.load(f)

        meta['recovery_pid'] = proc.pid
        meta['recovery_restart_timestamp'] = datetime.now().isoformat()

        with open(meta_file, 'w') as f:
            json.dump(meta, f, indent=2)

        return {
            'job_id': job_id,
            'session_id': session_id,
            'recovery_attempt': prep_result['recovery_attempt'],
            'pid': proc.pid,
            'status': 'restarted',
        }
    except Exception as e:
        return {
            'error': str(e),
            'job_id': job_id,
            'session_id': session_id,
        }

if __name__ == "__main__":
    # Command-line entry point:
    #   job_recovery.py <job_id>            prepare the job for restart only
    #   job_recovery.py <job_id> --restart  prepare and relaunch the job
    # The result is printed as JSON; the exit status is 1 when it reports
    # an error (or on bad usage) and 0 otherwise.
    import sys

    if len(sys.argv) < 2:
        print("Usage: job_recovery.py <job_id> [--restart]")
        sys.exit(1)

    job_id = sys.argv[1]
    do_restart = "--restart" in sys.argv

    if do_restart:
        result = restart_job(job_id)
    else:
        result, error = prepare_restart(job_id)
        if error:
            # Surface the failure in the same shape restart_job() uses
            # instead of discarding the message and printing "null".
            result = {'error': error, 'job_id': job_id}

    print(json.dumps(result, indent=2))

    if 'error' in result:
        sys.exit(1)