#!/usr/bin/env python3
"""
System Health Orchestrator

Master health check coordinator that validates:
- System capacity (disk, memory, CPU, concurrency)
- Configuration consistency
- Integration testing across all subsystems
- Unified health scoring (0-100)
"""

import json
import os
import shutil
import subprocess
import time
from pathlib import Path
from typing import List, Dict

from kg_health_checker import KGHealthChecker
from conductor_health_checker import ConductorHealthChecker
from context_health_checker import ContextHealthChecker
from script_health_checker import ScriptHealthChecker
from routine_validator import RoutineValidator


def _read_meminfo(path: str = '/proc/meminfo') -> Dict:
    """Parse a meminfo-style file into ``{field_name: integer_value}``.

    Lines that do not have at least a name and an integer value are skipped
    so that one unexpected line cannot abort the whole memory check.
    """
    mem_info = {}
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            try:
                mem_info[fields[0].rstrip(':')] = int(fields[1])
            except ValueError:
                continue
    return mem_info


def _status_penalty(section: Dict, penalties: Dict) -> int:
    """Return the score penalty for one capacity section.

    ``section`` is a capacity sub-dict such as ``capacity['disk']``. When the
    probe failed the section is empty and has no ``'status'``; that case is
    charged the ``'warning'`` penalty because the resource could not be
    verified as healthy. Statuses not listed in ``penalties`` cost nothing.
    """
    status = section.get('status')
    if status is None:
        return penalties['warning']
    return penalties.get(status, 0)


def _integration_score(integration: Dict) -> float:
    """Return the percentage of integration tests that passed (0-100).

    Every key except ``'issues'`` is treated as one test result, so adding a
    test to the results dict does not require touching the divisor here.
    """
    results = [passed for name, passed in integration.items() if name != 'issues']
    if not results:
        return 0.0
    return (sum(1 for passed in results if passed) / len(results)) * 100


def _capacity_line(label: str, section: Dict, pct_key: str) -> str:
    """Format one capacity line for the console report.

    An empty ``section`` means the probe failed, so report that instead of
    raising ``KeyError`` on the missing metrics.
    """
    if pct_key not in section or 'status' not in section:
        return f" {label}: unavailable"
    return f" {label}: {section[pct_key]}% ({section['status']})"


class SystemHealthOrchestrator:
    """Master orchestrator for system-wide health validation."""

    # Points deducted from the capacity score (which starts at 100) per status.
    _DISK_PENALTIES = {'critical': 30, 'warning': 15}
    _MEMORY_PENALTIES = {'critical': 20, 'warning': 10}

    # Minimum number of readable databases for 'databases_accessible' to hold.
    _MIN_ACCESSIBLE_DBS = 2

    def __init__(self):
        """Create one instance of each subsystem checker."""
        self.kg_checker = KGHealthChecker()
        self.conductor_checker = ConductorHealthChecker()
        self.context_checker = ContextHealthChecker()
        self.script_checker = ScriptHealthChecker()
        self.routine_validator = RoutineValidator()

    def check_system_capacity(self) -> Dict:
        """
        Check system capacity constraints.

        Each probe (disk, memory, CPU, concurrency) runs independently; when
        a probe fails, its section is left as an empty dict and a message is
        appended to ``'issues'``.

        Returns:
            Dict with keys 'disk', 'memory', 'cpu', 'concurrency' (metric
            dicts, each with a 'status' when the probe succeeded) and
            'issues' (list of human-readable problem strings).
        """
        capacity = {
            'disk': {},
            'memory': {},
            'cpu': {},
            'concurrency': {},
            'issues': []
        }
        issues = capacity['issues']

        # Disk usage
        try:
            total, used, free = shutil.disk_usage('/')
            disk_usage_pct = (used / total) * 100
            disk_free_gb = free / (1024 ** 3)

            # NOTE(review): status turns 'warning' above 80% but the issue
            # message is only emitted above 85% - confirm this is intended.
            capacity['disk'] = {
                'usage_pct': round(disk_usage_pct, 1),
                'free_gb': round(disk_free_gb, 1),
                'status': 'critical' if disk_usage_pct > 90 else 'warning' if disk_usage_pct > 80 else 'healthy'
            }

            # Messages use one decimal place rather than the raw float.
            if disk_usage_pct > 90:
                issues.append(f"Disk critically full ({disk_usage_pct:.1f}%)")
            elif disk_usage_pct > 85:
                issues.append(f"Disk usage high ({disk_usage_pct:.1f}%)")
        except Exception as e:
            issues.append(f"Cannot check disk: {e}")

        # Memory usage
        try:
            mem_info = _read_meminfo()

            total_mem = mem_info.get('MemTotal', 0)
            if total_mem <= 0:
                # Without a total, any percentage would be meaningless.
                raise ValueError("MemTotal missing from /proc/meminfo")

            available_mem = mem_info.get('MemAvailable')
            if available_mem is None:
                # Kernels that do not report MemAvailable: approximate it
                # instead of treating the machine as 100% used.
                available_mem = sum(
                    mem_info.get(key, 0) for key in ('MemFree', 'Buffers', 'Cached')
                )

            used_mem = total_mem - available_mem
            mem_usage_pct = (used_mem / total_mem) * 100

            capacity['memory'] = {
                'usage_pct': round(mem_usage_pct, 1),
                # Values were read as kB, so kB / 1024**2 gives GB.
                'available_gb': round(available_mem / (1024 ** 2), 1),
                'status': 'critical' if mem_usage_pct > 90 else 'warning' if mem_usage_pct > 85 else 'healthy'
            }

            if mem_usage_pct > 90:
                issues.append(f"Memory usage critical ({mem_usage_pct:.1f}%)")
        except Exception as e:
            issues.append(f"Cannot check memory: {e}")

        # CPU load
        try:
            load_avg = os.getloadavg()
            # os.cpu_count() may return None; fall back to a single CPU.
            cpu_count = os.cpu_count() or 1
            load_pct = (load_avg[0] / cpu_count) * 100

            capacity['cpu'] = {
                'load_average': tuple(round(load, 2) for load in load_avg),
                'load_pct': round(load_pct, 1),
                'cpu_count': cpu_count,
                'status': 'critical' if load_pct > 100 else 'warning' if load_pct > 80 else 'healthy'
            }
        except Exception as e:
            issues.append(f"Cannot check CPU: {e}")

        # Concurrency limits
        try:
            # Every entry in the conductor's active directory counts as one agent.
            conductor_dir = Path('/home/admin/conductor/active')
            if conductor_dir.exists():
                active_tasks = sum(1 for _ in conductor_dir.iterdir())
            else:
                active_tasks = 0
            max_concurrent = 4  # Design limit

            capacity['concurrency'] = {
                'active_agents': active_tasks,
                'max_concurrent': max_concurrent,
                'available_slots': max(0, max_concurrent - active_tasks),
                'status': 'warning' if active_tasks >= max_concurrent else 'healthy'
            }

            if active_tasks >= max_concurrent:
                issues.append(f"Concurrency at limit ({active_tasks}/{max_concurrent})")
        except Exception as e:
            issues.append(f"Cannot check concurrency: {e}")

        return capacity

    def check_configuration_consistency(self) -> Dict:
        """
        Validate configuration consistency across system.

        Returns:
            Dict with boolean flags 'config_file_valid', 'permissions_valid',
            'databases_accessible', 'mcp_servers_configured' and an 'issues'
            list of human-readable problem strings.
        """
        config_status = {
            'config_file_valid': False,
            'permissions_valid': False,
            'databases_accessible': False,
            'mcp_servers_configured': False,
            'issues': []
        }
        issues = config_status['issues']

        # Check config.json
        config = None
        config_file = Path('/opt/server-agents/orchestrator/config.json')
        if config_file.exists():
            try:
                config = json.loads(config_file.read_text())
                config_status['config_file_valid'] = True
            except Exception as e:
                issues.append(f"Config parse error: {e}")
        else:
            issues.append("Config file not found")

        # Check file permissions
        try:
            orchestrator_root = Path('/opt/server-agents/orchestrator')
            unreadable = 0
            for item in orchestrator_root.rglob('*'):
                if item.is_file() and not os.access(item, os.R_OK):
                    unreadable += 1
                    issues.append(f"Not readable: {item}")

            # Only set after the walk completed, so an aborted walk stays False.
            config_status['permissions_valid'] = unreadable == 0
        except Exception as e:
            issues.append(f"Cannot check permissions: {e}")

        # Check database accessibility
        db_paths = [
            '/etc/luz-knowledge/research.db',
            '/etc/luz-knowledge/projects.db',
            '/opt/server-agents/state/task_queue.db',
        ]

        dbs_accessible = 0
        for db_path in db_paths:
            if Path(db_path).exists() and os.access(db_path, os.R_OK):
                dbs_accessible += 1
            else:
                issues.append(f"Database not accessible: {db_path}")

        config_status['databases_accessible'] = dbs_accessible >= self._MIN_ACCESSIBLE_DBS

        # Check MCP server configuration. The config is only consulted when it
        # parsed to a JSON object, so no blanket exception handler is needed.
        if isinstance(config, dict) and config.get('mcpServers'):
            config_status['mcp_servers_configured'] = True

        return config_status

    def run_integration_tests(self) -> Dict:
        """
        Run integration tests across critical system paths.

        Returns:
            Dict with one boolean per test ('kg_query', 'conductor_rw',
            'context_retrieval', 'bash_execution') and an 'issues' list
            describing each failure that raised.
        """
        tests = {
            'kg_query': False,
            'conductor_rw': False,
            'context_retrieval': False,
            'bash_execution': False,
            'issues': []
        }
        issues = tests['issues']

        # Test 1: KG query
        try:
            # Imported here so that a Python build without sqlite3 is reported
            # as a failed test instead of breaking the module import.
            import sqlite3

            # Open read-only through a URI: a plain connect() would create an
            # empty database file when the path does not exist.
            db_uri = Path('/etc/luz-knowledge/research.db').as_uri() + '?mode=ro'
            conn = sqlite3.connect(db_uri, uri=True)
            try:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM entities LIMIT 1")
                result = cursor.fetchone()
                tests['kg_query'] = result is not None
            finally:
                # "with connection" only manages the transaction; close explicitly.
                conn.close()
        except Exception as e:
            issues.append(f"KG query failed: {e}")

        # Test 2: Conductor read/write
        try:
            conductor_dir = Path('/home/admin/conductor')
            test_file = conductor_dir / '.health_check_test'
            payload = {'test': 'ok'}
            try:
                test_file.write_text(json.dumps(payload))
                content = test_file.read_text()
            finally:
                # Remove the probe file even when the write or read failed.
                try:
                    test_file.unlink()
                except FileNotFoundError:
                    # Nothing was created, so there is nothing to clean up.
                    pass
            tests['conductor_rw'] = json.loads(content) == payload
        except Exception as e:
            issues.append(f"Conductor R/W failed: {e}")

        # Test 3: Context retrieval simulation
        try:
            # Only verifies that the context module file is present on disk.
            context_file = Path('/opt/server-agents/orchestrator/lib/four_bucket_context.py')
            if context_file.exists():
                tests['context_retrieval'] = True
        except Exception as e:
            issues.append(f"Context test failed: {e}")

        # Test 4: Bash execution
        try:
            result = subprocess.run(['echo', 'test'], capture_output=True, timeout=2)
            tests['bash_execution'] = result.returncode == 0
        except Exception as e:
            issues.append(f"Bash execution failed: {e}")

        return tests

    def generate_unified_health_score(self) -> Dict:
        """
        Generate unified 0-100 health score across all subsystems.

        The overall score is a weighted sum: KG 20%, conductor 20%,
        context 15%, scripts 10%, routines 10%, capacity 15%,
        configuration 5%, integration 5%.

        Returns:
            Dict with 'overall_score', 'status' ('healthy' at >= 80,
            'degraded' at >= 60, otherwise 'critical'), 'component_scores',
            the raw 'capacity', 'configuration' and 'integration' results,
            and a 'timestamp' from time.time().
        """
        # Get all component scores
        kg_health = self.kg_checker.generate_health_score()
        conductor_health = self.conductor_checker.generate_conductor_health_score()
        context_health = self.context_checker.generate_context_health_score()
        script_health = self.script_checker.generate_script_health_report()
        routine_health = self.routine_validator.generate_routine_validation_report()

        # Capacity and integration
        capacity = self.check_system_capacity()
        config = self.check_configuration_consistency()
        integration = self.run_integration_tests()

        # Calculate capacity score. A section is empty when its probe failed,
        # so penalties are looked up defensively rather than indexing 'status'.
        capacity_score = 100
        capacity_score -= _status_penalty(capacity['disk'], self._DISK_PENALTIES)
        capacity_score -= _status_penalty(capacity['memory'], self._MEMORY_PENALTIES)
        capacity_score = max(0, capacity_score)

        # Configuration score
        config_score = 100
        config_score -= len(config['issues']) * 5
        if not config['config_file_valid']:
            config_score -= 20
        if not config['databases_accessible']:
            config_score -= 30
        config_score = max(0, config_score)

        # Integration score
        integration_score = _integration_score(integration)

        # Weighted overall score
        overall_score = (
            kg_health['overall_score'] * 0.20 +
            conductor_health['overall_score'] * 0.20 +
            context_health['overall_score'] * 0.15 +
            script_health['health_score'] * 0.10 +
            routine_health['health_score'] * 0.10 +
            capacity_score * 0.15 +
            config_score * 0.05 +
            integration_score * 0.05
        )

        return {
            'overall_score': round(overall_score, 1),
            'status': 'healthy' if overall_score >= 80 else 'degraded' if overall_score >= 60 else 'critical',
            'component_scores': {
                'kg': round(kg_health['overall_score'], 1),
                'conductor': round(conductor_health['overall_score'], 1),
                'context': round(context_health['overall_score'], 1),
                'scripts': round(script_health['health_score'], 1),
                'routines': round(routine_health['health_score'], 1),
                'capacity': round(capacity_score, 1),
                'configuration': round(config_score, 1),
                'integration': round(integration_score, 1)
            },
            'capacity': capacity,
            'configuration': config,
            'integration': integration,
            'timestamp': time.time()
        }


def main() -> None:
    """Run the unified health check and print a console report."""
    orchestrator = SystemHealthOrchestrator()

    print("=" * 70)
    print("SYSTEM HEALTH ORCHESTRATOR")
    print("=" * 70)

    print("\nRunning unified health check...")
    health = orchestrator.generate_unified_health_score()

    print(f"\nOVERALL HEALTH SCORE: {health['overall_score']}/100 ({health['status'].upper()})")

    print("\nComponent Scores:")
    for component, score in health['component_scores'].items():
        print(f" {component:20} {score:6.1f}/100")

    print("\nSystem Capacity:")
    capacity = health['capacity']
    print(_capacity_line('Disk', capacity['disk'], 'usage_pct'))
    print(_capacity_line('Memory', capacity['memory'], 'usage_pct'))
    print(_capacity_line('CPU', capacity['cpu'], 'load_pct'))

    if capacity['issues']:
        print(f"\nCapacity Issues ({len(capacity['issues'])}):")
        for issue in capacity['issues']:
            print(f" - {issue}")

    config_issues = health['configuration']['issues']
    if config_issues:
        print(f"\nConfiguration Issues ({len(config_issues)}):")
        # Only the first five are shown to keep the report short.
        for issue in config_issues[:5]:
            print(f" - {issue}")


if __name__ == '__main__':
    main()