#!/usr/bin/env python3
"""
Luzia QA Improvements - Preflight Validation for Task Dispatch

This module implements 5 QA improvements to reduce job failure rate from
25% to <5%:

1. TimeoutValidator - Detect operations needing >5 min
2. PrivilegeChecker - Detect sudo/privileged commands in restricted containers
3. ServiceHealthChecker - Pre-validate service dependencies
4. ContainerCapabilityChecker - Verify container requirements
5. DurationLearner - Adapt timeouts from historical data

Usage:
    from qa_improvements import run_preflight_checks

    approved, report = run_preflight_checks({
        'id': 'task-123',
        'title': 'Start DSS API',
        'description': 'Start the DSS API service on port 5000'
    })
    if not approved:
        print(f"Task blocked: {report['errors']}")
"""

import re
import os
import sys
import json
import sqlite3
import subprocess
import time
import logging
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
from datetime import datetime

# Configure logging
logger = logging.getLogger('qa_improvements')
logger.setLevel(logging.INFO)

# Default paths
TASK_QUEUE_DB = Path('/opt/server-agents/state/task_queue.db')
CONFIG_PATH = Path('/opt/server-agents/orchestrator/config.json')


class TimeoutValidator:
    """
    IMPROVEMENT 1: Timeout Validation

    Pre-analyze task description for timeout indicators to prevent tasks
    from timing out after 300s when they need more time.
    """

    def __init__(self):
        # Regex pattern -> minimum timeout in seconds. Patterns are matched
        # against the lowercased title+description; the LARGEST matching
        # timeout wins.
        self.timeout_rules = {
            r'start.*service': 600,           # 10 min
            r'restart.*service': 600,
            r'restart': 600,
            r'reload.*nginx': 300,            # 5 min (graceful)
            r'healthcheck|health.*check': 180,  # 3 min
            r'api.*create|create.*repo': 300,   # 5 min
            r'database|db|postgres': 300,
            r'wait.*for|wait.*until': 600,    # 10 min
            r'npm\s+install': 300,            # 5 min for npm install
            r'npm\s+run\s+build': 600,        # 10 min for builds
            r'docker\s+build': 900,           # 15 min for docker builds
            r'migration|migrate': 600,        # 10 min for migrations
            r'backup|restore': 600,           # 10 min for backup ops
            r'sync|synchronize': 600,         # 10 min for sync ops
            r'clone|git\s+clone': 300,        # 5 min for clones
            r'test|tests|npm\s+test': 600,    # 10 min for test runs
        }
        self.default_timeout = 300
        self.critical_threshold = 250  # warn if remaining < 50s

    def validate_timeout(self, task_title: str, task_description: str = '') -> Dict[str, Any]:
        """
        Analyze task for timeout requirements.

        Returns:
            {
                'recommended_timeout': int,
                'confidence': 'high'|'medium'|'low',
                'warning': str or None,
                'category': 'short'|'long'|'async',
                'matched_patterns': list of (pattern, timeout) tuples
            }
        """
        text = (task_title + ' ' + task_description).lower()
        max_timeout = self.default_timeout
        matched_patterns = []

        for pattern, timeout in self.timeout_rules.items():
            if re.search(pattern, text):
                matched_patterns.append((pattern, timeout))
                max_timeout = max(max_timeout, timeout)

        if max_timeout > self.default_timeout:
            # 'long' means up to 10 min; anything larger should run async.
            category = 'long' if max_timeout <= 600 else 'async'
            warning = (
                f"Task likely requires {max_timeout}s but default timeout "
                f"is {self.default_timeout}s"
            )
        else:
            category = 'short'
            warning = None

        # Confidence grows with the number of independent pattern matches.
        if len(matched_patterns) > 1:
            confidence = 'high'
        elif matched_patterns:
            confidence = 'medium'
        else:
            confidence = 'low'

        return {
            'recommended_timeout': max_timeout,
            'matched_patterns': matched_patterns,
            'category': category,
            'warning': warning,
            'confidence': confidence,
            'action': 'warn' if warning else 'allow'
        }


class PrivilegeChecker:
    """
    IMPROVEMENT 2: Privilege Checker

    Detect privileged commands (sudo, systemctl, etc.) that would fail in
    containers with no-new-privileges security option.
    """

    def __init__(self):
        # (regex, human-readable description) of privilege-requiring commands.
        self.privileged_patterns = [
            (r'\bsudo\b', 'sudo command'),
            (r'systemctl\s+(?:start|stop|restart|enable|disable)', 'systemctl control'),
            (r'apt-get\s+install', 'apt-get install'),
            (r'apt\s+install', 'apt install'),
            (r'yum\s+install', 'yum install'),
            (r'dnf\s+install', 'dnf install'),
            (r'\bchown\b', 'chown'),
            (r'chmod\s+[0-7]{3,4}', 'chmod with octal'),
            (r'setfacl', 'setfacl'),
            (r'usermod|useradd|userdel', 'user modification'),
            (r'mount\s+', 'mount command'),
            (r'iptables|ip6tables', 'iptables'),
            (r'setcap|getcap', 'capability manipulation'),
        ]
        # Suggested non-privileged alternatives, keyed by a substring of the
        # pattern description above.
        self.alternatives = {
            'sudo': 'Remove sudo - container runs with user permissions',
            'systemctl': 'Use service scripts or direct process management',
            'apt-get install': 'Install dependencies in Dockerfile or use pre-built image',
            'apt install': 'Install dependencies in Dockerfile or use pre-built image',
            'chown': 'Files are already owned by container user',
            'chmod': 'Set permissions during build or via container entrypoint',
        }

    def check_container_capabilities(self) -> Dict[str, Any]:
        """Check container security configuration.

        Inspects /proc/self/status (NoNewPrivs flag) and the Docker daemon
        config; falls back to a permissive default when neither is readable.
        """
        config = {
            'no_new_privileges': False,
            'can_sudo': True,
            'can_setuid': True,
            'detected_from': 'default'
        }

        # Check /proc/self/status for NoNewPrivs
        try:
            with open('/proc/self/status', 'r') as f:
                content = f.read()
            if 'NoNewPrivs:\t1' in content:
                config['no_new_privileges'] = True
                config['can_sudo'] = False
                config['can_setuid'] = False
                config['detected_from'] = '/proc/self/status'
        except Exception:
            pass

        # Check Docker daemon config. Read the file directly instead of
        # shelling out to grep - same result, no subprocess dependency.
        try:
            daemon_conf = Path('/etc/docker/daemon.json')
            if daemon_conf.exists() and 'no-new-privileges' in daemon_conf.read_text():
                config['no_new_privileges'] = True
                config['can_sudo'] = False
                config['detected_from'] = '/etc/docker/daemon.json'
        except Exception:
            pass

        return config

    def check_privilege_requirements(self, task_title: str,
                                     task_description: str = '') -> Dict[str, Any]:
        """
        Analyze task for privilege requirements.

        Returns:
            {
                'needs_privileges': bool,
                'problematic_commands': list of (pattern, description),
                'container_can_sudo': bool,
                'action': 'allow'|'warn'|'block',
                'suggestion': str or None
            }
        """
        text = (task_title + ' ' + task_description).lower()
        caps = self.check_container_capabilities()

        problematic = []
        for pattern, description in self.privileged_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                problematic.append((pattern, description))

        if not problematic:
            return {
                'needs_privileges': False,
                'problematic_commands': [],
                'container_can_sudo': caps['can_sudo'],
                'action': 'allow',
                'suggestion': None
            }

        if caps['can_sudo']:
            return {
                'needs_privileges': True,
                'problematic_commands': problematic,
                'container_can_sudo': True,
                'action': 'allow',  # Container allows privileges
                'suggestion': None
            }

        # Container cannot sudo - suggest alternatives for the first few
        # problematic commands.
        suggestions = []
        for pattern, desc in problematic[:3]:
            alt_key = next((k for k in self.alternatives if k in desc.lower()), None)
            if alt_key:
                suggestions.append(self.alternatives[alt_key])
            else:
                suggestions.append(f"Find non-privileged alternative for {desc}")

        return {
            'needs_privileges': True,
            'problematic_commands': problematic,
            'container_can_sudo': False,
            'action': 'block',
            'suggestion': (
                f"Container has no-new-privileges. "
                f"Alternatives: {'; '.join(suggestions)}"
            )
        }


class ServiceHealthChecker:
    """
    IMPROVEMENT 3: Service Health Check

    Pre-check if target service is running and responsive before dispatching
    tasks that depend on them.
    """

    def __init__(self):
        # Service name -> health check configuration ('type' selects the
        # probe: http endpoint, raw tcp connect, systemd unit, or command).
        self.service_checks = {
            'dss': {'port': 5000, 'health_path': '/health', 'type': 'http'},
            'musica': {'port': 3000, 'health_path': '/health', 'type': 'http'},
            'librechat': {'port': 3200, 'health_path': '/health', 'type': 'http'},
            'overbits': {'port': 3001, 'health_path': '/health', 'type': 'http'},
            'nginx': {'cmd': 'systemctl is-active nginx', 'type': 'systemd'},
            'postgres': {'port': 5432, 'type': 'tcp'},
            'postgresql': {'port': 5432, 'type': 'tcp'},
            'redis': {'port': 6379, 'type': 'tcp'},
            'mysql': {'port': 3306, 'type': 'tcp'},
            'mongodb': {'port': 27017, 'type': 'tcp'},
            'docker': {'cmd': 'docker info', 'type': 'command'},
        }

    def _check_tcp_port(self, port: int, host: str = '127.0.0.1',
                        timeout: float = 2.0) -> bool:
        """Check if a TCP port is listening."""
        import socket
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(timeout)
            result = sock.connect_ex((host, port))
            sock.close()
            return result == 0
        except Exception:
            return False

    def _check_http_health(self, port: int, path: str, host: str = '127.0.0.1',
                           timeout: float = 5.0) -> Tuple[bool, Optional[int], Optional[float]]:
        """Check HTTP health endpoint.

        Returns (responsive, status_code, latency_ms).
        """
        try:
            import urllib.request
            url = f"http://{host}:{port}{path}"
            start = time.time()
            req = urllib.request.Request(url, method='GET')
            with urllib.request.urlopen(req, timeout=timeout) as response:
                latency = (time.time() - start) * 1000
                return True, response.status, latency
        except Exception:
            return False, None, None

    def _check_command(self, cmd: str, timeout: float = 5.0) -> bool:
        """Check if a command succeeds (exit code 0)."""
        try:
            result = subprocess.run(
                cmd.split(),
                capture_output=True,
                timeout=timeout
            )
            return result.returncode == 0
        except Exception:
            return False

    def check_service_health(self, service_name: str) -> Dict[str, Any]:
        """
        Check health of a specific service.

        Returns:
            {
                'service': str,
                'running': bool or None (unknown),
                'responsive': bool,
                'http_status': int or None,
                'latency_ms': float or None,
                'issue': str or None
            }
        """
        service_lower = service_name.lower()
        if service_lower not in self.service_checks:
            return {
                'service': service_name,
                'running': None,
                'responsive': None,
                'issue': f'Unknown service: {service_name}'
            }

        config = self.service_checks[service_lower]
        check_type = config.get('type', 'tcp')

        if check_type == 'http':
            port = config['port']
            path = config.get('health_path', '/health')
            responsive, status, latency = self._check_http_health(port, path)
            return {
                'service': service_name,
                # HTTP probe failing doesn't prove the process is down; fall
                # back to a TCP probe before declaring it not running.
                'running': responsive or self._check_tcp_port(port),
                'responsive': responsive and status == 200,
                'http_status': status,
                # NOTE: 'is not None' so a genuine 0.0ms latency isn't dropped.
                'latency_ms': round(latency, 2) if latency is not None else None,
                'issue': None if (responsive and status == 200)
                         else f"HTTP {status or 'unreachable'}"
            }

        elif check_type == 'tcp':
            port = config['port']
            running = self._check_tcp_port(port)
            return {
                'service': service_name,
                'running': running,
                'responsive': running,
                'issue': None if running else f'Port {port} not listening'
            }

        elif check_type in ('systemd', 'command'):
            cmd = config['cmd']
            running = self._check_command(cmd)
            return {
                'service': service_name,
                'running': running,
                'responsive': running,
                'issue': None if running else f'Command failed: {cmd}'
            }

        return {
            'service': service_name,
            'running': None,
            'issue': 'Unknown check type'
        }

    def validate_task_services(self, task_description: str) -> Dict[str, Dict[str, Any]]:
        """
        Extract service names from task and check their health.

        Returns dict mapping service name -> health check result.
        """
        # Pattern to find service references
        service_pattern = r'\b(' + '|'.join(self.service_checks.keys()) + r')\b'
        services = re.findall(service_pattern, task_description.lower())

        checks = {}
        for service in set(services):
            checks[service] = self.check_service_health(service)
        return checks


class ContainerCapabilityChecker:
    """
    IMPROVEMENT 4: Container Capability Check

    Verify container has required capabilities for the task. Many
    capabilities are stripped for security.
    """

    def __init__(self):
        # Logical requirement -> Linux capabilities that satisfy it.
        self.required_capabilities = {
            'sudo': ['CAP_SYS_ADMIN', 'CAP_SYS_RESOURCE', 'CAP_SETUID', 'CAP_SETGID'],
            'network_config': ['CAP_NET_ADMIN', 'CAP_NET_RAW'],
            'disk_ops': ['CAP_SYS_ADMIN', 'CAP_DAC_OVERRIDE'],
            'process_management': ['CAP_SYS_RESOURCE', 'CAP_SYS_PTRACE'],
            'file_ownership': ['CAP_CHOWN', 'CAP_FOWNER'],
        }
        # Task-text regex -> logical requirement.
        self.task_to_requirement = {
            r'sudo|privilege|root': 'sudo',
            r'network|nginx|port|iptable|firewall': 'network_config',
            r'disk|mount|unmount|partition': 'disk_ops',
            r'kill|signal|ptrace|strace': 'process_management',
            r'chown|ownership': 'file_ownership',
        }

    def get_container_security_config(self) -> Dict[str, Any]:
        """Read container security options from /proc and /.dockerenv."""
        config = {
            'no_new_privileges': False,
            'capabilities': [],
            'read_only_root': False,
            'user': os.getuid(),
            'in_container': False
        }

        # Detect if we're in a container
        if Path('/.dockerenv').exists():
            config['in_container'] = True

        try:
            with open('/proc/self/status', 'r') as f:
                content = f.read()
            # Check for no_new_privileges
            if 'NoNewPrivs:\t1' in content:
                config['no_new_privileges'] = True
            # Parse CapEff (effective capabilities)
            cap_match = re.search(r'CapEff:\t([0-9a-f]+)', content)
            if cap_match:
                config['cap_effective_hex'] = cap_match.group(1)
        except Exception:
            pass

        # Check if root filesystem is read-only
        try:
            with open('/proc/mounts', 'r') as f:
                for line in f:
                    if ' / ' in line and 'ro,' in line:
                        config['read_only_root'] = True
                        break
        except Exception:
            pass

        return config

    def check_requirements(self, task_description: str) -> List[str]:
        """Analyze task for capability requirements.

        Returns the list of logical requirement names, in pattern order,
        without duplicates.
        """
        requirements = []
        text = task_description.lower()
        for pattern, req in self.task_to_requirement.items():
            if re.search(pattern, text):
                if req not in requirements:
                    requirements.append(req)
        return requirements

    def validate_capabilities(self, task_description: str) -> Dict[str, Any]:
        """
        Validate container capabilities against task requirements.

        Returns:
            {
                'task_requirements': [str],
                'container_config': dict,
                'capability_gaps': [str],
                'action': 'allow'|'warn'|'block'
            }
        """
        config = self.get_container_security_config()
        requirements = self.check_requirements(task_description)
        gaps = []

        # Check specific capability gaps
        if config['no_new_privileges']:
            if 'sudo' in requirements:
                gaps.append('no-new-privileges blocks sudo/privilege escalation')
        if config['read_only_root']:
            if 'disk_ops' in requirements:
                gaps.append('read-only root filesystem blocks disk operations')

        # Non-root user limitations
        if config['user'] != 0:
            if 'network_config' in requirements:
                gaps.append('non-root user cannot modify network config')
            if 'file_ownership' in requirements:
                gaps.append('non-root user has limited chown abilities')

        # Determine action
        if gaps:
            action = 'block'
        elif requirements:
            action = 'warn'  # Has requirements but no detected gaps
        else:
            action = 'allow'

        return {
            'task_requirements': requirements,
            'container_config': config,
            'capability_gaps': gaps,
            'action': action
        }


class DurationLearner:
    """
    IMPROVEMENT 5: Historical Duration Learning

    Learn from historical task durations to provide better timeout
    recommendations instead of using hardcoded defaults.
    """

    def __init__(self, db_path: Path = TASK_QUEUE_DB):
        self.db_path = db_path
        # Cache maps title key -> (timestamp, result). Per-entry timestamps
        # fix the original bug where one shared timestamp was reset by every
        # insert, allowing stale entries to be served indefinitely.
        self.cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
        self.cache_ttl = 300  # 5 min cache

    def _get_connection(self) -> Optional[sqlite3.Connection]:
        """Get database connection if available, else None."""
        if not self.db_path.exists():
            return None
        try:
            return sqlite3.connect(self.db_path, timeout=5)
        except Exception:
            return None

    def get_historical_duration(self, task_title: str) -> Dict[str, Any]:
        """
        Query completed tasks for average duration.

        Returns:
            {
                'avg_duration': float or None,
                'max_duration': float or None,
                'sample_count': int,
                'success_rate': float or None,
                'by_exit_code': list of (exit_code, avg_duration, count)
            }
        """
        conn = self._get_connection()
        if not conn:
            return {'avg_duration': None, 'sample_count': 0}

        try:
            cursor = conn.cursor()
            # Find similar tasks by title pattern.
            # Use LIKE with wildcards for fuzzy matching.
            search_term = f"%{task_title}%"
            cursor.execute("""
                SELECT
                    AVG(CASE WHEN exit_code = 0
                        THEN completed_at - started_at
                        ELSE NULL END) as avg_success_duration,
                    MAX(completed_at - started_at) as max_duration,
                    COUNT(*) as total_count,
                    SUM(CASE WHEN exit_code = 0 THEN 1 ELSE 0 END) as success_count,
                    exit_code
                FROM tasks
                WHERE title LIKE ?
                    AND completed_at IS NOT NULL
                    AND started_at IS NOT NULL
                    AND completed_at > started_at
                GROUP BY exit_code
            """, (search_term,))
            results = cursor.fetchall()
        except Exception as e:
            logger.warning(f"Error querying historical duration: {e}")
            return {'avg_duration': None, 'sample_count': 0, 'error': str(e)}
        finally:
            # Always release the connection, on success and failure alike.
            conn.close()

        if not results:
            return {'avg_duration': None, 'sample_count': 0}

        # Aggregate the per-exit-code rows.
        total_count = sum(r[2] for r in results)
        success_count = sum(r[3] for r in results)
        max_duration = max((r[1] for r in results if r[1]), default=None)
        # Average duration from successful runs only (exit_code == 0).
        avg_success = next((r[0] for r in results if r[4] == 0 and r[0]), None)

        return {
            'avg_duration': avg_success,
            'max_duration': max_duration,
            'sample_count': total_count,
            'success_count': success_count,
            'success_rate': success_count / total_count if total_count > 0 else None,
            'by_exit_code': [(r[4], r[0], r[2]) for r in results]
        }

    def recommend_timeout(self, task_title: str, task_description: str = '') -> Dict[str, Any]:
        """
        Recommend timeout based on historical data.

        Returns:
            {
                'recommended': int,
                'based_on_history': bool,
                'historical_avg': float or None,
                'historical_max': float or None,
                'sample_count': int,
                'confidence': 'high'|'medium'|'low'|'none'
            }
        """
        # Check cache first (per-entry expiry).
        cache_key = task_title.lower()[:50]
        now = time.time()
        cached = self.cache.get(cache_key)
        if cached is not None and (now - cached[0]) < self.cache_ttl:
            return cached[1]

        history = self.get_historical_duration(task_title)

        if history.get('avg_duration') and history.get('sample_count', 0) >= 2:
            # Use historical data with safety buffer
            avg = history['avg_duration']
            # 'or avg' (not a .get default) because the key may be present
            # with value None, which would make max_dur * 1.1 crash.
            max_dur = history.get('max_duration') or avg
            # Recommend max of avg*1.5 or max_duration*1.1
            recommended = int(max(avg * 1.5, max_dur * 1.1))
            recommended = max(60, min(recommended, 1800))  # Clamp 1min-30min

            # Confidence based on sample size
            sample_count = history['sample_count']
            if sample_count >= 10:
                confidence = 'high'
            elif sample_count >= 5:
                confidence = 'medium'
            else:
                confidence = 'low'

            result = {
                'recommended': recommended,
                'based_on_history': True,
                'historical_avg': round(avg, 1) if avg else None,
                'historical_max': round(max_dur, 1) if max_dur else None,
                'sample_count': sample_count,
                'confidence': confidence,
                'success_rate': history.get('success_rate')
            }
        else:
            # No historical data - use default
            result = {
                'recommended': 300,  # Default 5 min
                'based_on_history': False,
                'sample_count': history.get('sample_count', 0),
                'confidence': 'none'
            }

        # Update cache with this entry's own timestamp.
        self.cache[cache_key] = (now, result)
        return result


def run_preflight_checks(task: Dict[str, Any],
                         config: Optional[Dict[str, Any]] = None) -> Tuple[bool, Dict[str, Any]]:
    """
    Run all 5 QA preflight checks before dispatching a task.

    Args:
        task: Task dict with 'id', 'title', 'description' keys
        config: Optional QA config with enabled checks, timeouts, etc.

    Returns:
        (approved: bool, report: dict) tuple

        report structure:
        {
            'task_id': str,
            'checks': {
                'timeout': {...},
                'privileges': {...},
                'services': {...},
                'capabilities': {...},
                'duration': {...}
            },
            'warnings': [str],
            'errors': [str],
            'approved': bool,
            'recommended_timeout': int,
            'timestamp': str
        }
    """
    config = config or {}
    report = {
        'task_id': task.get('id', 'unknown'),
        'checks': {},
        'warnings': [],
        'errors': [],
        'approved': False,
        'recommended_timeout': 300,
        'timestamp': datetime.now().isoformat()
    }

    title = task.get('title', '')
    description = task.get('description', '')

    # Check 1: Timeout validation
    try:
        timeout_validator = TimeoutValidator()
        timeout_check = timeout_validator.validate_timeout(title, description)
        report['checks']['timeout'] = timeout_check
        if timeout_check.get('warning'):
            report['warnings'].append(f"TIMEOUT: {timeout_check['warning']}")
        # Update recommended timeout
        report['recommended_timeout'] = max(
            report['recommended_timeout'],
            timeout_check.get('recommended_timeout', 300)
        )
    except Exception as e:
        logger.error(f"Timeout check failed: {e}")
        report['checks']['timeout'] = {'error': str(e)}

    # Check 2: Privilege requirements
    try:
        priv_checker = PrivilegeChecker()
        priv_check = priv_checker.check_privilege_requirements(title, description)
        report['checks']['privileges'] = priv_check
        if priv_check['action'] == 'block':
            report['errors'].append(f"PRIVILEGE: {priv_check['suggestion']}")
        elif priv_check['action'] == 'warn' and priv_check.get('suggestion'):
            report['warnings'].append(f"PRIVILEGE: {priv_check['suggestion']}")
    except Exception as e:
        logger.error(f"Privilege check failed: {e}")
        report['checks']['privileges'] = {'error': str(e)}

    # Check 3: Service health
    try:
        service_checker = ServiceHealthChecker()
        service_checks = service_checker.validate_task_services(description)
        report['checks']['services'] = service_checks
        for service, status in service_checks.items():
            if status.get('running') is False:
                report['warnings'].append(f"SERVICE: {service} is not running")
            elif status.get('running') is True and not status.get('responsive'):
                report['errors'].append(
                    f"SERVICE: {service} is running but not responding"
                )
    except Exception as e:
        logger.error(f"Service check failed: {e}")
        report['checks']['services'] = {'error': str(e)}

    # Check 4: Container capabilities
    try:
        cap_checker = ContainerCapabilityChecker()
        cap_check = cap_checker.validate_capabilities(description)
        report['checks']['capabilities'] = cap_check
        if cap_check['action'] == 'block':
            for gap in cap_check.get('capability_gaps', []):
                report['errors'].append(f"CAPABILITY: {gap}")
    except Exception as e:
        logger.error(f"Capability check failed: {e}")
        report['checks']['capabilities'] = {'error': str(e)}

    # Check 5: Duration learning
    try:
        learner = DurationLearner()
        duration_check = learner.recommend_timeout(title, description)
        report['checks']['duration'] = duration_check
        if duration_check.get('based_on_history'):
            # Use historical recommendation only when confident enough.
            if duration_check.get('confidence') in ('high', 'medium'):
                report['recommended_timeout'] = max(
                    report['recommended_timeout'],
                    duration_check['recommended']
                )
                logger.info(
                    f"HISTORY: Similar tasks avg {duration_check.get('historical_avg')}s, "
                    f"recommending {duration_check['recommended']}s "
                    f"(confidence: {duration_check.get('confidence')})"
                )
    except Exception as e:
        logger.error(f"Duration learning failed: {e}")
        report['checks']['duration'] = {'error': str(e)}

    # Final decision: any error blocks the task.
    report['approved'] = len(report['errors']) == 0
    return report['approved'], report


def format_preflight_report(report: Dict[str, Any], verbose: bool = False) -> str:
    """Format preflight report for display."""
    lines = []
    status = "[OK]" if report['approved'] else "[BLOCKED]"
    lines.append(f"\n=== Preflight Check {status} ===")
    lines.append(f"Task: {report['task_id']}")
    lines.append(f"Recommended timeout: {report['recommended_timeout']}s")

    if report['errors']:
        lines.append("\nBlocking Issues:")
        for err in report['errors']:
            lines.append(f"  [!] {err}")

    if report['warnings']:
        lines.append("\nWarnings:")
        for warn in report['warnings']:
            lines.append(f"  [?] {warn}")

    if verbose:
        lines.append("\nDetailed Checks:")
        for check_name, check_result in report['checks'].items():
            lines.append(f"  {check_name}:")
            if isinstance(check_result, dict):
                for k, v in check_result.items():
                    if k != 'error':
                        lines.append(f"    {k}: {v}")

    return '\n'.join(lines)


# CLI interface for testing
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="QA Preflight Validator")
    parser.add_argument("--title", "-t", required=True, help="Task title")
    parser.add_argument("--description", "-d", default="", help="Task description")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    task = {
        'id': 'cli-test',
        'title': args.title,
        'description': args.description
    }
    approved, report = run_preflight_checks(task)

    if args.json:
        print(json.dumps(report, indent=2, default=str))
    else:
        print(format_preflight_report(report, verbose=args.verbose))

    sys.exit(0 if approved else 1)