#!/usr/bin/env python3
"""
Per-User Queue Manager - Ensures only one task per user at a time

Implements:
- Per-user queue isolation using file-based locks
- Atomic locking mechanism to prevent concurrent task execution
- Fair scheduling across users while maintaining user isolation
- Queue monitoring and status reporting per user

Features:
1. File-based per-user locks at /var/lib/luzia/locks/user_{username}.lock
2. Task serialization per user - only one running task at a time
3. Atomic lock acquire/release with timeout handling
4. Fallback to lock file cleanup on stale locks
5. Integration with existing QueueController capacity tracking
"""

import fcntl
import json
import os
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
import logging

logger = logging.getLogger(__name__)


def _unlink_quietly(path: Path) -> None:
    """Delete *path*, treating an already-missing file as success."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def _read_json(path: Path) -> Optional[Dict[str, Any]]:
    """Return the JSON object stored at *path*.

    Returns None when the file is missing, unreadable, not valid JSON, or
    does not contain a JSON object.
    """
    try:
        data = json.loads(path.read_text())
    except (OSError, ValueError):
        return None
    return data if isinstance(data, dict) else None


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* to *path* so that readers never observe a partial file."""
    # The temp name does not end in ".json", so the "user_*.json" globs used
    # by the manager never pick it up.
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        tmp_path.write_text(text)
        os.replace(str(tmp_path), str(path))
    finally:
        # No-op after a successful replace; removes the temp file on failure.
        _unlink_quietly(tmp_path)


class PerUserQueueManager:
    """Manages per-user task queues with exclusive locking.

    Each user is represented by two files inside ``LOCK_BASE``:

    * ``user_<name>.lock`` - created with ``O_CREAT | O_EXCL``; its existence
      is the lock.  It holds a small JSON record (``lock_id``, ``user``,
      ``task_id``, ``acquired_at``, ``acquired_by_pid``).
    * ``user_<name>.json`` - the same record plus ``lock_file`` and
      ``expires_at``, written for monitoring.

    A lock older than ``LOCK_TIMEOUT_SECONDS`` is considered stale and may be
    removed by any caller.

    NOTE: taking over a *stale* lock is check-then-delete and is therefore
    not atomic; two processes that both judge the same lock stale at the
    same moment can still race.
    """

    LOCK_BASE = Path("/var/lib/luzia/locks")
    LOCK_TIMEOUT_SECONDS = 3600  # 1 hour timeout for stale locks

    def __init__(self, lock_base: Optional[Path] = None):
        """Initialize per-user queue manager.

        Args:
            lock_base: Optional directory for lock files.  When omitted the
                class-level ``LOCK_BASE`` is used.  The directory is created
                if it does not exist.
        """
        if lock_base is not None:
            # Instance attribute shadows the class default for this manager.
            self.LOCK_BASE = Path(lock_base)
        self.LOCK_BASE.mkdir(parents=True, exist_ok=True)

    def _get_lock_path(self, user: str) -> Path:
        """Get lock file path for a user.

        Only alphanumerics, ``_`` and ``-`` are kept.  A name that becomes
        empty or is longer than 32 characters maps to ``default``, so
        distinct names that sanitize to the same string share one lock.
        """
        # Sanitize username to prevent path traversal
        safe_user = "".join(c for c in user if c.isalnum() or c in "_-")
        if not safe_user or len(safe_user) > 32:
            safe_user = "default"
        return self.LOCK_BASE / f"user_{safe_user}.lock"

    def _get_lock_meta_path(self, user: str) -> Path:
        """Get lock metadata file path for a user."""
        return self._get_lock_path(user).with_suffix(".json")

    @staticmethod
    def _user_key_from_meta_file(meta_file: Path) -> str:
        """Recover the sanitized user name from a ``user_<name>.json`` path."""
        return meta_file.stem[len("user_"):]

    def _expiry_verdict(self, meta: Dict[str, Any]) -> Optional[bool]:
        """Decide from one metadata record whether the lock has expired.

        Returns:
            True/False when the record carries a usable ``expires_at`` or
            ``acquired_at`` timestamp, None when it does not.
        """
        now = datetime.now()
        try:
            expires_at = meta.get("expires_at")
            if expires_at:
                return now > datetime.fromisoformat(expires_at)

            # Fallback: check acquired_at time
            acquired_at = meta.get("acquired_at")
            if acquired_at:
                age_seconds = (
                    now - datetime.fromisoformat(acquired_at)
                ).total_seconds()
                return age_seconds > self.LOCK_TIMEOUT_SECONDS
        except (TypeError, ValueError):
            # Malformed timestamp, or aware/naive datetime mismatch.
            return None
        return None

    def acquire_lock(
        self, user: str, task_id: str, timeout: int = 30
    ) -> Tuple[bool, Optional[str]]:
        """
        Acquire exclusive lock for a user's task execution.

        Args:
            user: Username
            task_id: Task ID for this execution
            timeout: Lock acquisition timeout in seconds.  At least one
                attempt is always made; ``0`` means "try once".

        Returns:
            Tuple of (acquired: bool, lock_id: str or None)
            lock_id is a unique identifier for this lock acquisition
        """
        lock_path = self._get_lock_path(user)
        lock_meta_path = self._get_lock_meta_path(user)

        # Check for stale locks first
        self._cleanup_stale_locks(user)

        lock_id = f"{task_id}_{int(time.time())}"
        start_time = time.time()

        # Try to acquire lock with timeout
        while True:
            try:
                # O_EXCL | O_CREAT makes creation fail if the file exists,
                # so only one caller can create the lock file.
                fd = os.open(
                    str(lock_path),
                    os.O_CREAT | os.O_EXCL | os.O_WRONLY,
                    0o644,
                )
            except FileExistsError:
                # Lock file exists, check if it's stale
                if self._is_lock_stale(user):
                    try:
                        _unlink_quietly(lock_path)
                    except OSError as e:
                        logger.error(f"Error acquiring lock for user {user}: {e}")
                        return False, None
                    continue

                # Lock is active
                elapsed = time.time() - start_time
                if elapsed >= timeout:
                    logger.warning(
                        f"Failed to acquire lock for user {user} within {timeout}s. "
                        f"Another task may be running."
                    )
                    return False, None

                # Wait and retry
                time.sleep(0.5)
                continue
            except Exception as e:
                logger.error(f"Error acquiring lock for user {user}: {e}")
                return False, None

            # We own the lock file from here on; any failure must roll it
            # back, otherwise the user would stay blocked until the lock
            # ages out.
            try:
                meta = {
                    "lock_id": lock_id,
                    "user": user,
                    "task_id": task_id,
                    "acquired_at": datetime.now().isoformat(),
                    "acquired_by_pid": os.getpid(),
                }
                try:
                    os.write(fd, json.dumps(meta).encode())
                finally:
                    os.close(fd)

                # Also write metadata file for monitoring
                meta["lock_file"] = str(lock_path)
                meta["expires_at"] = (
                    datetime.now() + timedelta(seconds=self.LOCK_TIMEOUT_SECONDS)
                ).isoformat()
                _write_text_atomic(lock_meta_path, json.dumps(meta, indent=2))
            except Exception as e:
                logger.error(f"Error acquiring lock for user {user}: {e}")
                try:
                    _unlink_quietly(lock_meta_path)
                    _unlink_quietly(lock_path)
                except OSError as cleanup_error:
                    logger.error(
                        f"Error cleaning up failed lock for user {user}: "
                        f"{cleanup_error}"
                    )
                return False, None

            logger.info(
                f"Acquired lock for user {user}, task {task_id}, lock_id {lock_id}"
            )
            return True, lock_id

    def release_lock(self, user: str, lock_id: str) -> bool:
        """
        Release lock for a user.

        Args:
            user: Username
            lock_id: Lock ID from acquire_lock

        Returns:
            True if lock was released, False if it wasn't held (no lock
            file, a different lock_id, or an error while releasing)
        """
        lock_path = self._get_lock_path(user)
        lock_meta_path = self._get_lock_meta_path(user)

        try:
            try:
                held = json.loads(lock_path.read_text())
            except FileNotFoundError:
                logger.warning(
                    f"No lock held for user {user}; nothing to release "
                    f"for lock_id {lock_id}"
                )
                return False

            # Verify lock_id matches before releasing
            if held.get("lock_id") != lock_id:
                logger.warning(
                    f"Lock ID mismatch for user {user}. "
                    f"Expected {lock_id}, got {held.get('lock_id')}"
                )
                return False

            # Metadata goes first: once the lock file is gone another
            # process may acquire immediately, and its fresh metadata must
            # not be deleted by this release.
            _unlink_quietly(lock_meta_path)
            _unlink_quietly(lock_path)

            logger.info(f"Released lock for user {user}, lock_id {lock_id}")
            return True

        except Exception as e:
            logger.error(f"Error releasing lock for user {user}: {e}")
            return False

    def is_user_locked(self, user: str) -> bool:
        """
        Check if a user has an active lock (non-stale).

        Args:
            user: Username

        Returns:
            True if user has an active lock
        """
        if not self._get_lock_path(user).exists():
            return False

        # Check if lock is stale
        return not self._is_lock_stale(user)

    def get_lock_info(self, user: str) -> Optional[Dict[str, Any]]:
        """
        Get information about a user's lock from its metadata file.

        The metadata is returned as stored; it is not checked for staleness.

        Args:
            user: Username

        Returns:
            Lock metadata dict or None if no metadata file exists or it
            cannot be read
        """
        lock_meta_path = self._get_lock_meta_path(user)

        if not lock_meta_path.exists():
            return None

        try:
            return json.loads(lock_meta_path.read_text())
        except Exception as e:
            logger.error(f"Error reading lock info for user {user}: {e}")
            return None

    def get_all_locks(self) -> List[Dict[str, Any]]:
        """
        Get information about all active locks.

        Returns:
            List of lock metadata dicts, one per non-stale lock that has a
            readable metadata file
        """
        locks = []
        for meta_file in self.LOCK_BASE.glob("user_*.json"):
            try:
                meta = json.loads(meta_file.read_text())
                # The file name identifies the lock even when the record
                # has no usable "user" field.
                user_key = self._user_key_from_meta_file(meta_file)
                if not self._is_lock_stale(user_key):
                    locks.append(meta)
            except Exception as e:
                logger.error(f"Error reading lock file {meta_file}: {e}")

        return locks

    def _is_lock_stale(self, user: str) -> bool:
        """
        Check if a user's lock has expired.

        The decision uses, in order:

        1. the metadata file, but only when its ``lock_id`` matches the one
           in the lock file (otherwise it is a leftover from an earlier
           holder);
        2. the record stored in the lock file itself;
        3. the lock file's modification time.

        A lock whose metadata has not been written yet is therefore judged
        by its age rather than being treated as stale.

        Args:
            user: Username

        Returns:
            True if the lock file is missing or the lock has expired
        """
        lock_path = self._get_lock_path(user)

        if not lock_path.exists():
            return True

        holder = _read_json(lock_path)
        if holder is not None:
            monitor = _read_json(self._get_lock_meta_path(user))
            if monitor is not None and monitor.get("lock_id") == holder.get("lock_id"):
                verdict = self._expiry_verdict(monitor)
                if verdict is not None:
                    return verdict

            verdict = self._expiry_verdict(holder)
            if verdict is not None:
                return verdict

        # No usable timestamps (e.g. the holder has created the file but
        # not written to it yet): fall back to the file's age.
        try:
            age_seconds = time.time() - lock_path.stat().st_mtime
        except FileNotFoundError:
            return True
        except OSError as e:
            logger.error(f"Error checking lock staleness for user {user}: {e}")
            return True
        return age_seconds > self.LOCK_TIMEOUT_SECONDS

    def _cleanup_stale_locks(self, user: str) -> None:
        """
        Clean up stale locks for a user.

        Args:
            user: Username
        """
        lock_path = self._get_lock_path(user)
        lock_meta_path = self._get_lock_meta_path(user)

        if not lock_path.exists():
            return
        if not self._is_lock_stale(user):
            return

        try:
            # Metadata first, so a new holder's metadata is never removed
            # after the lock file has been freed.
            _unlink_quietly(lock_meta_path)
            _unlink_quietly(lock_path)
            logger.info(f"Cleaned up stale lock for user {user}")
        except OSError as e:
            logger.error(f"Error cleaning up stale lock for user {user}: {e}")

    def cleanup_all_stale_locks(self) -> int:
        """
        Clean up all stale locks.

        Every ``user_*.json`` metadata file is inspected; when its lock is
        stale (including a missing lock file) both files are removed.  The
        lock is located from the metadata file name, so unreadable metadata
        files are cleaned up as well.

        Returns:
            Count of locks cleaned up
        """
        count = 0
        for meta_file in self.LOCK_BASE.glob("user_*.json"):
            try:
                username = self._user_key_from_meta_file(meta_file)
                if not self._is_lock_stale(username):
                    continue

                # Remove metadata, then the lock file
                _unlink_quietly(meta_file)
                _unlink_quietly(self._get_lock_path(username))
                logger.info(f"Cleaned up stale lock for user {username}")
                count += 1
            except Exception as e:
                logger.error(f"Error processing lock file {meta_file}: {e}")

        return count

    def wait_for_lock_release(
        self, user: str, max_wait_seconds: int = 300
    ) -> bool:
        """
        Wait for a user's lock to be released.

        Polls once per second.

        Args:
            user: Username
            max_wait_seconds: Maximum time to wait (default 5 minutes)

        Returns:
            True if lock was released, False if timeout
        """
        start_time = time.time()

        while True:
            if not self.is_user_locked(user):
                return True

            elapsed = time.time() - start_time
            if elapsed >= max_wait_seconds:
                logger.warning(f"Timeout waiting for lock release for user {user}")
                return False

            time.sleep(1)


# Module exports
__all__ = [
    "PerUserQueueManager",
]