Files
luzia/lib/per_user_queue_manager.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00

361 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Per-User Queue Manager - Ensures only one task per user at a time
Implements:
- Per-user queue isolation using file-based locks
- Atomic locking mechanism to prevent concurrent task execution
- Fair scheduling across users while maintaining user isolation
- Queue monitoring and status reporting per user
Features:
1. File-based per-user locks at /var/lib/luzia/locks/user_{username}.lock
2. Task serialization per user - only one running task at a time
3. Atomic lock acquire/release with timeout handling
4. Fallback to lock file cleanup on stale locks
5. Integration with existing QueueController capacity tracking
"""
import fcntl
import json
import os
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import logging
logger = logging.getLogger(__name__)
class PerUserQueueManager:
    """Manages per-user task queues with exclusive locking.

    Each user maps to one lock file, ``LOCK_BASE/user_<name>.lock``, created
    atomically with ``O_CREAT | O_EXCL``. The lock file stores a small JSON
    record (lock id, task id, acquisition time, pid) and is the single source
    of truth for ownership and staleness. A sibling ``user_<name>.json`` file
    carries the same record plus ``lock_file`` and ``expires_at``; it exists
    only for monitoring and is never used to decide whether a lock is live.

    Known limitation: stale-lock takeover is check-then-unlink, so two
    processes that both judge the same lock stale can still race.
    """

    LOCK_BASE = Path("/var/lib/luzia/locks")
    LOCK_TIMEOUT_SECONDS = 3600  # 1 hour timeout for stale locks

    def __init__(self):
        """Initialize per-user queue manager, creating the lock directory."""
        self.LOCK_BASE.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Path helpers
    # ------------------------------------------------------------------

    def _get_lock_path(self, user: str) -> Path:
        """Get lock file path for a user.

        Only alphanumerics, ``_`` and ``-`` are kept so the name cannot
        escape ``LOCK_BASE``. Names that end up empty or longer than 32
        characters all share the ``default`` lock.
        """
        # Sanitize username to prevent path traversal
        safe_user = "".join(c for c in user if c.isalnum() or c in "_-")
        if not safe_user or len(safe_user) > 32:
            safe_user = "default"
        return self.LOCK_BASE / f"user_{safe_user}.lock"

    def _get_lock_meta_path(self, user: str) -> Path:
        """Get lock metadata (monitoring) file path for a user."""
        return self._get_lock_path(user).with_suffix(".json")

    @staticmethod
    def _remove_if_present(path: Path) -> bool:
        """Unlink *path*; return True if removed, False if it was already gone."""
        try:
            path.unlink()
        except FileNotFoundError:
            return False
        return True

    # ------------------------------------------------------------------
    # Acquire / release
    # ------------------------------------------------------------------

    def acquire_lock(self, user: str, task_id: str, timeout: int = 30) -> Tuple[bool, Optional[str]]:
        """
        Acquire exclusive lock for a user's task execution.

        Args:
            user: Username
            task_id: Task ID for this execution
            timeout: Lock acquisition timeout in seconds

        Returns:
            Tuple of (acquired: bool, lock_id: str or None)
            lock_id identifies this acquisition and must be passed to
            release_lock().
        """
        lock_path = self._get_lock_path(user)
        lock_meta_path = self._get_lock_meta_path(user)

        # Check for stale locks first
        self._cleanup_stale_locks(user)

        lock_id = f"{task_id}_{int(time.time())}"
        # Monotonic clock: the timeout must not be affected by wall-clock jumps.
        started = time.monotonic()

        while True:
            try:
                # O_EXCL | O_CREAT ensures atomicity: exactly one caller
                # succeeds in creating the file.
                fd = os.open(
                    str(lock_path),
                    os.O_CREAT | os.O_EXCL | os.O_WRONLY,
                    0o644,
                )
            except FileExistsError:
                if self._is_lock_stale(user):
                    # Remove stale lock and retry; if another process already
                    # removed it we simply retry as well.
                    self._remove_if_present(lock_path)
                    continue
                # Lock is active
                if time.monotonic() - started >= timeout:
                    logger.warning(
                        f"Failed to acquire lock for user {user} within {timeout}s. "
                        f"Another task may be running."
                    )
                    return False, None
                # Wait and retry
                time.sleep(0.5)
                continue
            except Exception as e:
                logger.error(f"Error acquiring lock for user {user}: {e}")
                return False, None

            # We own the lock file from here on; any failure must roll it
            # back so the user is not wedged until the stale timeout.
            try:
                acquired_at = datetime.now()
                meta = {
                    "lock_id": lock_id,
                    "user": user,
                    "task_id": task_id,
                    "acquired_at": acquired_at.isoformat(),
                    "acquired_by_pid": os.getpid(),
                }
                try:
                    os.write(fd, json.dumps(meta).encode())
                finally:
                    os.close(fd)

                # Also write metadata file for monitoring
                meta["lock_file"] = str(lock_path)
                meta["expires_at"] = (
                    acquired_at + timedelta(seconds=self.LOCK_TIMEOUT_SECONDS)
                ).isoformat()
                lock_meta_path.write_text(json.dumps(meta, indent=2))
            except Exception as e:
                logger.error(f"Error acquiring lock for user {user}: {e}")
                try:
                    self._remove_if_present(lock_meta_path)
                    self._remove_if_present(lock_path)
                except OSError as cleanup_error:
                    logger.error(
                        f"Error rolling back lock for user {user}: {cleanup_error}"
                    )
                return False, None

            logger.info(f"Acquired lock for user {user}, task {task_id}, lock_id {lock_id}")
            return True, lock_id

    def release_lock(self, user: str, lock_id: str) -> bool:
        """
        Release lock for a user.

        Args:
            user: Username
            lock_id: Lock ID from acquire_lock

        Returns:
            True if the lock was released or no lock file was present;
            False if the lock is held under a different lock_id or an error
            occurred.
        """
        lock_path = self._get_lock_path(user)
        lock_meta_path = self._get_lock_meta_path(user)

        try:
            # Verify lock_id matches before releasing
            try:
                held = json.loads(lock_path.read_text())
            except FileNotFoundError:
                held = None

            if held is not None and held.get("lock_id") != lock_id:
                logger.warning(
                    f"Lock ID mismatch for user {user}. "
                    f"Expected {lock_id}, got {held.get('lock_id')}"
                )
                return False

            # Metadata first, lock file last: once the lock file is gone a
            # new holder may write fresh metadata, which we must not delete.
            self._remove_if_present(lock_meta_path)
            self._remove_if_present(lock_path)

            logger.info(f"Released lock for user {user}, lock_id {lock_id}")
            return True
        except Exception as e:
            logger.error(f"Error releasing lock for user {user}: {e}")
            return False

    # ------------------------------------------------------------------
    # Inspection
    # ------------------------------------------------------------------

    def is_user_locked(self, user: str) -> bool:
        """
        Check if a user has an active lock (non-stale).

        Args:
            user: Username

        Returns:
            True if user has an active lock
        """
        # A missing lock file is reported as stale, so one check suffices.
        return not self._is_lock_stale(user)

    def get_lock_info(self, user: str) -> Optional[Dict[str, Any]]:
        """
        Get the monitoring metadata recorded for a user's lock.

        Staleness is not checked here.

        Args:
            user: Username

        Returns:
            Lock metadata dict, or None if there is no metadata file or it
            cannot be read.
        """
        lock_meta_path = self._get_lock_meta_path(user)
        try:
            return json.loads(lock_meta_path.read_text())
        except FileNotFoundError:
            return None
        except Exception as e:
            logger.error(f"Error reading lock info for user {user}: {e}")
            return None

    def get_all_locks(self) -> List[Dict[str, Any]]:
        """
        Get information about all active locks.

        Returns:
            List of lock metadata dicts whose lock file is present and not
            stale.
        """
        locks = []
        for meta_file in self.LOCK_BASE.glob("user_*.json"):
            try:
                meta = json.loads(meta_file.read_text())
            except Exception as e:
                logger.error(f"Error reading lock file {meta_file}: {e}")
                continue
            # Derive the lock path from the file name rather than from the
            # "user" field, so a damaged record cannot break the lookup.
            if not self._is_path_stale(meta_file.with_suffix(".lock")):
                locks.append(meta)
        return locks

    # ------------------------------------------------------------------
    # Staleness
    # ------------------------------------------------------------------

    def _is_path_stale(self, lock_path: Path) -> bool:
        """Decide staleness of the lock file at *lock_path*.

        Order of evidence: ``expires_at`` in the lock file, then
        ``acquired_at`` in the lock file, then the file's mtime. The mtime
        fallback keeps a lock that has just been created (and not yet
        written) from being mistaken for a stale one.
        """
        try:
            mtime = lock_path.stat().st_mtime
            raw = lock_path.read_text()
        except FileNotFoundError:
            return True
        except OSError as e:
            logger.error(f"Error checking lock staleness for {lock_path}: {e}")
            return True

        if raw.strip():
            try:
                info = json.loads(raw)
                if isinstance(info, dict):
                    expires_at = info.get("expires_at")
                    if expires_at:
                        return datetime.now() > datetime.fromisoformat(expires_at)
                    acquired_at = info.get("acquired_at")
                    if acquired_at:
                        age_seconds = (
                            datetime.now() - datetime.fromisoformat(acquired_at)
                        ).total_seconds()
                        return age_seconds > self.LOCK_TIMEOUT_SECONDS
            except (ValueError, TypeError) as e:
                # Unreadable record: fall through to the mtime check.
                logger.error(f"Error checking lock staleness for {lock_path}: {e}")

        return (time.time() - mtime) > self.LOCK_TIMEOUT_SECONDS

    def _is_lock_stale(self, user: str) -> bool:
        """
        Check if a user's lock has expired.

        Args:
            user: Username

        Returns:
            True if there is no lock file or the lock is older than
            LOCK_TIMEOUT_SECONDS.
        """
        return self._is_path_stale(self._get_lock_path(user))

    def _cleanup_stale_locks(self, user: str) -> None:
        """
        Clean up stale locks for a user.

        Does nothing when the user has no lock file or the lock is active.

        Args:
            user: Username
        """
        lock_path = self._get_lock_path(user)
        if not lock_path.exists():
            return
        if not self._is_lock_stale(user):
            return
        try:
            self._remove_if_present(lock_path)
            self._remove_if_present(self._get_lock_meta_path(user))
            logger.info(f"Cleaned up stale lock for user {user}")
        except Exception as e:
            logger.error(f"Error cleaning up stale lock for user {user}: {e}")

    def cleanup_all_stale_locks(self) -> int:
        """
        Clean up all stale locks.

        Considers every ``user_*.json`` metadata file and every
        ``user_*.lock`` file, so orphaned files of either kind are swept.

        Returns:
            Count of locks cleaned up
        """
        candidates = {
            meta_file.with_suffix(".lock")
            for meta_file in self.LOCK_BASE.glob("user_*.json")
        }
        candidates.update(self.LOCK_BASE.glob("user_*.lock"))

        count = 0
        for lock_path in sorted(candidates):
            try:
                if not self._is_path_stale(lock_path):
                    continue
                removed_lock = self._remove_if_present(lock_path)
                removed_meta = self._remove_if_present(lock_path.with_suffix(".json"))
                if removed_lock or removed_meta:
                    logger.info(f"Cleaned up stale lock {lock_path.name}")
                    count += 1
            except Exception as e:
                logger.error(f"Error processing lock file {lock_path}: {e}")
        return count

    def wait_for_lock_release(
        self, user: str, max_wait_seconds: int = 300
    ) -> bool:
        """
        Wait for a user's lock to be released.

        Polls once per second.

        Args:
            user: Username
            max_wait_seconds: Maximum time to wait (default 5 minutes)

        Returns:
            True if lock was released, False if timeout
        """
        started = time.monotonic()
        while True:
            if not self.is_user_locked(user):
                return True
            if time.monotonic() - started >= max_wait_seconds:
                logger.warning(f"Timeout waiting for lock release for user {user}")
                return False
            time.sleep(1)
# Module exports: the names bound by ``from <this module> import *``.
__all__ = [
"PerUserQueueManager",
]