# luzia/lib/system_health_orchestrator.py

#!/usr/bin/env python3
"""
System Health Orchestrator

Master health check coordinator that validates:
- System capacity (disk, memory, CPU, concurrency)
- Configuration consistency
- Integration testing across all subsystems
- Unified health scoring (0-100)
"""

import json
import os
import shutil
import subprocess
import time
from pathlib import Path
from typing import List, Dict

from kg_health_checker import KGHealthChecker
from conductor_health_checker import ConductorHealthChecker
from context_health_checker import ContextHealthChecker
from script_health_checker import ScriptHealthChecker
from routine_validator import RoutineValidator


class SystemHealthOrchestrator:
    """Master orchestrator for system-wide health validation.

    Combines the per-subsystem checkers with local capacity, configuration
    and integration probes, and folds everything into one weighted 0-100
    score.

    The class-level constants below name the paths and limits the probes
    use. They default to the production layout but can be overridden on a
    subclass or an instance (e.g. to point the checks at another root).
    """

    # Root of the orchestrator installation (config.json, lib/, ...).
    ORCHESTRATOR_ROOT = Path('/opt/server-agents/orchestrator')
    # Conductor working directory; active tasks live in its 'active' subdir.
    CONDUCTOR_DIR = Path('/home/admin/conductor')
    # Knowledge-graph database queried by the integration test.
    KG_DB_PATH = Path('/etc/luz-knowledge/research.db')
    # Databases whose readability is verified by the configuration check.
    DB_PATHS = (
        '/etc/luz-knowledge/research.db',
        '/etc/luz-knowledge/projects.db',
        '/opt/server-agents/state/task_queue.db',
    )
    # Minimum number of readable databases for 'databases_accessible'.
    MIN_ACCESSIBLE_DBS = 2
    # Design limit for concurrently running agents.
    MAX_CONCURRENT_AGENTS = 4

    def __init__(self):
        """Initialize system health orchestrator with one checker per subsystem."""
        self.kg_checker = KGHealthChecker()
        self.conductor_checker = ConductorHealthChecker()
        self.context_checker = ContextHealthChecker()
        self.script_checker = ScriptHealthChecker()
        self.routine_validator = RoutineValidator()

    # ------------------------------------------------------------------
    # Capacity
    # ------------------------------------------------------------------

    def check_system_capacity(self) -> Dict:
        """
        Check system capacity constraints.

        Each probe (disk, memory, CPU, concurrency) runs independently: when
        one fails, its entry stays an empty dict and a "Cannot check ..."
        message is appended to 'issues', while the other probes still run.

        Returns:
            Dict with keys 'disk', 'memory', 'cpu', 'concurrency' (metric
            dicts, empty when the probe failed) and 'issues' (list of str).
        """
        capacity = {
            'disk': {},
            'memory': {},
            'cpu': {},
            'concurrency': {},
            'issues': []
        }

        probes = (
            ('disk', 'disk', self._probe_disk),
            ('memory', 'memory', self._probe_memory),
            ('cpu', 'CPU', self._probe_cpu),
            ('concurrency', 'concurrency', self._probe_concurrency),
        )
        for key, label, probe in probes:
            # Broad catch is deliberate: a health check must report a broken
            # probe, not die because of it.
            try:
                metrics, issues = probe()
            except Exception as e:
                capacity['issues'].append(f"Cannot check {label}: {e}")
            else:
                capacity[key] = metrics
                capacity['issues'].extend(issues)

        return capacity

    def _probe_disk(self) -> tuple:
        """Return (metrics, issues) for usage of the root filesystem."""
        total, used, free = shutil.disk_usage('/')
        usage_pct = (used / total) * 100
        shown_pct = round(usage_pct, 1)

        if usage_pct > 90:
            status = 'critical'
        elif usage_pct > 80:
            status = 'warning'
        else:
            status = 'healthy'

        # NOTE(review): status turns 'warning' above 80% but an issue is only
        # recorded above 85%; thresholds kept as-is to preserve scoring.
        issues = []
        if usage_pct > 90:
            issues.append(f"Disk critically full ({shown_pct}%)")
        elif usage_pct > 85:
            issues.append(f"Disk usage high ({shown_pct}%)")

        metrics = {
            'usage_pct': shown_pct,
            'free_gb': round(free / (1024 ** 3), 1),
            'status': status
        }
        return metrics, issues

    def _probe_memory(self) -> tuple:
        """Return (metrics, issues) for memory usage read from /proc/meminfo."""
        mem_info = {}
        with open('/proc/meminfo', 'r') as f:
            for line in f:
                fields = line.split()
                # Skip blank/malformed lines instead of aborting the probe.
                if len(fields) >= 2 and fields[1].isdigit():
                    mem_info[fields[0].rstrip(':')] = int(fields[1])

        total_kb = mem_info.get('MemTotal', 0)
        if total_kb <= 0:
            # Without a total, any percentage would be fiction.
            raise ValueError("MemTotal missing from /proc/meminfo")

        available_kb = mem_info.get('MemAvailable')
        if available_kb is None:
            # Older kernels do not export MemAvailable; approximate it so the
            # probe does not report a bogus 100% usage.
            available_kb = sum(
                mem_info.get(name, 0) for name in ('MemFree', 'Buffers', 'Cached')
            )

        usage_pct = ((total_kb - available_kb) / total_kb) * 100
        shown_pct = round(usage_pct, 1)

        if usage_pct > 90:
            status = 'critical'
        elif usage_pct > 85:
            status = 'warning'
        else:
            status = 'healthy'

        issues = []
        if usage_pct > 90:
            issues.append(f"Memory usage critical ({shown_pct}%)")

        metrics = {
            'usage_pct': shown_pct,
            # /proc/meminfo values are in kB, so kB / 1024**2 gives GB.
            'available_gb': round(available_kb / (1024 ** 2), 1),
            'status': status
        }
        return metrics, issues

    def _probe_cpu(self) -> tuple:
        """Return (metrics, issues) for the 1-minute load relative to CPU count."""
        load_avg = os.getloadavg()
        # os.cpu_count() may return None when the count is undeterminable.
        cpu_count = os.cpu_count() or 1
        load_pct = (load_avg[0] / cpu_count) * 100

        if load_pct > 100:
            status = 'critical'
        elif load_pct > 80:
            status = 'warning'
        else:
            status = 'healthy'

        metrics = {
            'load_average': tuple(round(value, 2) for value in load_avg),
            'load_pct': round(load_pct, 1),
            'cpu_count': cpu_count,
            'status': status
        }
        return metrics, []

    def _probe_concurrency(self) -> tuple:
        """Return (metrics, issues) for active agents vs. the design limit."""
        active_dir = Path(self.CONDUCTOR_DIR) / 'active'
        # Every entry in the 'active' directory counts as one running task.
        active_tasks = sum(1 for _ in active_dir.iterdir()) if active_dir.exists() else 0
        max_concurrent = self.MAX_CONCURRENT_AGENTS
        at_limit = active_tasks >= max_concurrent

        issues = []
        if at_limit:
            issues.append(f"Concurrency at limit ({active_tasks}/{max_concurrent})")

        metrics = {
            'active_agents': active_tasks,
            'max_concurrent': max_concurrent,
            'available_slots': max(0, max_concurrent - active_tasks),
            'status': 'warning' if at_limit else 'healthy'
        }
        return metrics, issues

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------

    def check_configuration_consistency(self) -> Dict:
        """
        Validate configuration consistency across system.

        Checks that config.json parses, that every file under the
        orchestrator root is readable, that enough databases are readable,
        and that the config declares at least one MCP server.

        Returns:
            Dict with boolean flags 'config_file_valid', 'permissions_valid',
            'databases_accessible', 'mcp_servers_configured' and an 'issues'
            list of human-readable problems.
        """
        config_status = {
            'config_file_valid': False,
            'permissions_valid': False,
            'databases_accessible': False,
            'mcp_servers_configured': False,
            'issues': []
        }
        issues = config_status['issues']
        root = Path(self.ORCHESTRATOR_ROOT)

        # Check config.json
        config = None
        config_file = root / 'config.json'
        if config_file.exists():
            try:
                config = json.loads(config_file.read_text(encoding='utf-8'))
            except (OSError, ValueError) as e:
                # ValueError covers both JSON and unicode decoding errors.
                issues.append(f"Config parse error: {e}")
            else:
                config_status['config_file_valid'] = True
        else:
            issues.append("Config file not found")

        # Check file permissions
        try:
            unreadable = 0
            for item in root.rglob('*'):
                if item.is_file() and not os.access(item, os.R_OK):
                    unreadable += 1
                    issues.append(f"Not readable: {item}")
            # Count directly rather than substring-matching issue strings, so
            # unrelated messages cannot influence the flag.
            config_status['permissions_valid'] = unreadable == 0
        except Exception as e:
            issues.append(f"Cannot check permissions: {e}")

        # Check database accessibility
        dbs_accessible = 0
        for db_path in self.DB_PATHS:
            # os.access() is False for missing paths as well as unreadable ones.
            if os.access(db_path, os.R_OK):
                dbs_accessible += 1
            else:
                issues.append(f"Database not accessible: {db_path}")

        config_status['databases_accessible'] = dbs_accessible >= self.MIN_ACCESSIBLE_DBS

        # Check MCP server configuration (only meaningful for a JSON object).
        if isinstance(config, dict) and config.get('mcpServers'):
            config_status['mcp_servers_configured'] = True

        return config_status

    # ------------------------------------------------------------------
    # Integration
    # ------------------------------------------------------------------

    def run_integration_tests(self) -> Dict:
        """
        Run integration tests across critical system paths.

        Returns:
            Dict with boolean results 'kg_query', 'conductor_rw',
            'context_retrieval', 'bash_execution' and an 'issues' list
            describing every failed test.
        """
        tests = {
            'kg_query': False,
            'conductor_rw': False,
            'context_retrieval': False,
            'bash_execution': False,
            'issues': []
        }

        # Test 1: KG query
        try:
            import sqlite3
            # mode=rw opens an existing database but never creates one, so a
            # missing KG file is reported instead of silently materialised.
            db_uri = f"{Path(self.KG_DB_PATH).absolute().as_uri()}?mode=rw"
            conn = sqlite3.connect(db_uri, uri=True)
            try:
                row = conn.execute("SELECT COUNT(*) FROM entities LIMIT 1").fetchone()
            finally:
                # The sqlite3 connection context manager only commits or rolls
                # back; closing has to be explicit.
                conn.close()
            tests['kg_query'] = row is not None
        except Exception as e:
            tests['issues'].append(f"KG query failed: {e}")

        # Test 2: Conductor read/write
        try:
            test_file = Path(self.CONDUCTOR_DIR) / '.health_check_test'
            payload = {'test': 'ok'}
            try:
                test_file.write_text(json.dumps(payload))
                round_trip_ok = json.loads(test_file.read_text()) == payload
            finally:
                # Never leave the probe file behind, even when the read fails.
                if test_file.exists():
                    test_file.unlink()
            tests['conductor_rw'] = round_trip_ok
        except Exception as e:
            tests['issues'].append(f"Conductor R/W failed: {e}")

        # Test 3: Context retrieval simulation (presence of the context module)
        try:
            context_file = Path(self.ORCHESTRATOR_ROOT) / 'lib' / 'four_bucket_context.py'
            if context_file.exists():
                tests['context_retrieval'] = True
            else:
                tests['issues'].append(f"Context module not found: {context_file}")
        except Exception as e:
            tests['issues'].append(f"Context test failed: {e}")

        # Test 4: Bash execution
        try:
            result = subprocess.run(['echo', 'test'], capture_output=True, timeout=2)
            tests['bash_execution'] = result.returncode == 0
        except Exception as e:
            tests['issues'].append(f"Bash execution failed: {e}")

        return tests

    # ------------------------------------------------------------------
    # Unified score
    # ------------------------------------------------------------------

    def generate_unified_health_score(self) -> Dict:
        """
        Generate unified 0-100 health score across all subsystems.

        Weights: KG 20%, conductor 20%, context 15%, scripts 10%,
        routines 10%, capacity 15%, configuration 5%, integration 5%.
        The status is 'healthy' at >= 80, 'degraded' at >= 60, otherwise
        'critical'.

        Only disk and memory status feed the capacity score. A probe that
        could not run is not penalised here; its failure is reported in
        capacity['issues'].

        Returns:
            Dict with 'overall_score', 'status', 'component_scores', the raw
            'capacity', 'configuration' and 'integration' results, and a
            'timestamp' (seconds since the epoch).
        """
        # Get all component scores
        kg_health = self.kg_checker.generate_health_score()
        conductor_health = self.conductor_checker.generate_conductor_health_score()
        context_health = self.context_checker.generate_context_health_score()
        script_health = self.script_checker.generate_script_health_report()
        routine_health = self.routine_validator.generate_routine_validation_report()

        # Capacity and integration
        capacity = self.check_system_capacity()
        config = self.check_configuration_consistency()
        integration = self.run_integration_tests()

        # Calculate capacity score. .get() is required because a failed probe
        # leaves its metrics dict empty.
        capacity_score = 100
        disk_status = capacity['disk'].get('status')
        if disk_status == 'critical':
            capacity_score -= 30
        elif disk_status == 'warning':
            capacity_score -= 15

        memory_status = capacity['memory'].get('status')
        if memory_status == 'critical':
            capacity_score -= 20
        elif memory_status == 'warning':
            capacity_score -= 10

        # Configuration score
        config_score = 100
        config_score -= len(config['issues']) * 5
        if not config['config_file_valid']:
            config_score -= 20
        if not config['databases_accessible']:
            config_score -= 30

        # Integration score: share of passing tests, whatever their number.
        outcomes = [passed for name, passed in integration.items() if name != 'issues']
        if outcomes:
            integration_score = (sum(1 for passed in outcomes if passed) / len(outcomes)) * 100
        else:
            integration_score = 0

        # Weighted overall score
        overall_score = (
            kg_health['overall_score'] * 0.20 +
            conductor_health['overall_score'] * 0.20 +
            context_health['overall_score'] * 0.15 +
            script_health['health_score'] * 0.10 +
            routine_health['health_score'] * 0.10 +
            max(0, capacity_score) * 0.15 +
            max(0, config_score) * 0.05 +
            integration_score * 0.05
        )

        if overall_score >= 80:
            status = 'healthy'
        elif overall_score >= 60:
            status = 'degraded'
        else:
            status = 'critical'

        return {
            'overall_score': round(overall_score, 1),
            'status': status,
            'component_scores': {
                'kg': round(kg_health['overall_score'], 1),
                'conductor': round(conductor_health['overall_score'], 1),
                'context': round(context_health['overall_score'], 1),
                'scripts': round(script_health['health_score'], 1),
                'routines': round(routine_health['health_score'], 1),
                'capacity': round(max(0, capacity_score), 1),
                'configuration': round(max(0, config_score), 1),
                'integration': round(integration_score, 1)
            },
            'capacity': capacity,
            'configuration': config,
            'integration': integration,
            'timestamp': time.time()
        }


if __name__ == '__main__':
    # CLI entry point: run the unified health check and print a plain-text
    # report (overall score, per-component scores, capacity, issues).
    orchestrator = SystemHealthOrchestrator()

    print("=" * 70)
    print("SYSTEM HEALTH ORCHESTRATOR")
    print("=" * 70)

    print("\nRunning unified health check...")
    health = orchestrator.generate_unified_health_score()

    print(f"\nOVERALL HEALTH SCORE: {health['overall_score']}/100 ({health['status'].upper()})")

    print("\nComponent Scores:")
    for component, score in health['component_scores'].items():
        print(f"  {component:20} {score:6.1f}/100")

    print("\nSystem Capacity:")
    capacity = health['capacity']
    capacity_rows = (
        ('Disk:', 'disk', 'usage_pct'),
        ('Memory:', 'memory', 'usage_pct'),
        ('CPU:', 'cpu', 'load_pct'),
    )
    for label, section, metric in capacity_rows:
        info = capacity.get(section) or {}
        # A probe that failed leaves its section empty; report that instead
        # of crashing with a KeyError (the reason is listed under issues).
        if metric in info and 'status' in info:
            print(f"  {label:<10}{info[metric]}% ({info['status']})")
        else:
            print(f"  {label:<10}unavailable")

    if capacity['issues']:
        print(f"\nCapacity Issues ({len(capacity['issues'])}):")
        for issue in capacity['issues']:
            print(f"  - {issue}")

    config_issues = health['configuration']['issues']
    if config_issues:
        print(f"\nConfiguration Issues ({len(config_issues)}):")
        # Only the first five are listed to keep the report short; the header
        # carries the full count.
        for issue in config_issues[:5]:
            print(f"  - {issue}")