Files
luzia/lib/system_health_orchestrator.py
admin ec33ac1936 Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 10:42:16 -03:00

361 lines
13 KiB
Python

#!/usr/bin/env python3
"""
System Health Orchestrator
Master health check coordinator that validates:
- System capacity (disk, memory, CPU, concurrency)
- Configuration consistency
- Integration testing across all subsystems
- Unified health scoring (0-100)
"""
import json
import os
import shutil
import subprocess
import time
from pathlib import Path
from typing import List, Dict
from kg_health_checker import KGHealthChecker
from conductor_health_checker import ConductorHealthChecker
from context_health_checker import ContextHealthChecker
from script_health_checker import ScriptHealthChecker
from routine_validator import RoutineValidator
class SystemHealthOrchestrator:
    """Master orchestrator for system-wide health validation.

    Combines the per-subsystem checkers created in ``__init__`` with
    host-level capacity probes, configuration checks and small integration
    probes, and folds everything into one weighted 0-100 score.

    The ``check_*`` / ``run_*`` methods are best-effort: an expected failure
    (missing file, permission error, unparsable data) is recorded in the
    result's ``issues`` list instead of being raised.
    """

    # Locations and limits live on the class so a deployment (or a test) can
    # override them per instance without touching the probe logic.
    ROOT_FS = '/'
    MEMINFO_PATH = '/proc/meminfo'
    CONDUCTOR_DIR = Path('/home/admin/conductor')
    ORCHESTRATOR_ROOT = Path('/opt/server-agents/orchestrator')
    KG_DB_PATH = '/etc/luz-knowledge/research.db'
    DB_PATHS = (
        '/etc/luz-knowledge/research.db',
        '/etc/luz-knowledge/projects.db',
        '/opt/server-agents/state/task_queue.db',
    )
    MIN_ACCESSIBLE_DBS = 2
    MAX_CONCURRENT_AGENTS = 4  # Design limit

    def __init__(self):
        """Initialize system health orchestrator."""
        self.kg_checker = KGHealthChecker()
        self.conductor_checker = ConductorHealthChecker()
        self.context_checker = ContextHealthChecker()
        self.script_checker = ScriptHealthChecker()
        self.routine_validator = RoutineValidator()

    # ------------------------------------------------------------------
    # Capacity
    # ------------------------------------------------------------------

    def check_system_capacity(self) -> Dict:
        """
        Check system capacity constraints.

        Returns:
            Dict with keys 'disk', 'memory', 'cpu', 'concurrency' (each a
            dict of metrics, or an empty dict when that probe failed) and
            'issues' (list of human-readable problem descriptions).
        """
        issues: List[str] = []
        # Dict literals evaluate in order, so 'issues' keeps the same
        # disk -> memory -> cpu -> concurrency ordering as the probes.
        return {
            'disk': self._probe_disk(issues),
            'memory': self._probe_memory(issues),
            'cpu': self._probe_cpu(issues),
            'concurrency': self._probe_concurrency(issues),
            'issues': issues,
        }

    def _probe_disk(self, issues: List[str]) -> Dict:
        """Measure usage of ROOT_FS; return {} (and log an issue) on failure."""
        try:
            usage = shutil.disk_usage(self.ROOT_FS)
            usage_pct = (usage.used / usage.total) * 100
        except (OSError, ZeroDivisionError) as e:
            issues.append(f"Cannot check disk: {e}")
            return {}

        if usage_pct > 90:
            status = 'critical'
            issues.append(f"Disk critically full ({usage_pct:.1f}%)")
        elif usage_pct > 80:
            status = 'warning'
            # An issue is only reported above 85%; 80-85% is a warning
            # status without an accompanying issue entry.
            if usage_pct > 85:
                issues.append(f"Disk usage high ({usage_pct:.1f}%)")
        else:
            status = 'healthy'

        return {
            'usage_pct': round(usage_pct, 1),
            'free_gb': round(usage.free / (1024 ** 3), 1),
            'status': status,
        }

    def _probe_memory(self, issues: List[str]) -> Dict:
        """Derive memory usage from MEMINFO_PATH; return {} on failure."""
        try:
            meminfo = {}
            with open(self.MEMINFO_PATH, 'r') as f:
                for line in f:
                    key, sep, rest = line.partition(':')
                    fields = rest.split()
                    # Skip blank or malformed lines instead of aborting.
                    if sep and fields and fields[0].isdigit():
                        meminfo[key.strip()] = int(fields[0])

            total_kb = meminfo.get('MemTotal', 0)
            if total_kb <= 0:
                raise ValueError("MemTotal missing from meminfo")

            if 'MemAvailable' in meminfo:
                available_kb = meminfo['MemAvailable']
            else:
                # Without MemAvailable, treating it as 0 would report 100%
                # usage; approximate it from the classic counters instead.
                available_kb = (
                    meminfo.get('MemFree', 0)
                    + meminfo.get('Buffers', 0)
                    + meminfo.get('Cached', 0)
                )
        except (OSError, ValueError) as e:
            issues.append(f"Cannot check memory: {e}")
            return {}

        usage_pct = ((total_kb - available_kb) / total_kb) * 100
        if usage_pct > 90:
            status = 'critical'
            issues.append(f"Memory usage critical ({usage_pct:.1f}%)")
        elif usage_pct > 85:
            status = 'warning'
        else:
            status = 'healthy'

        return {
            'usage_pct': round(usage_pct, 1),
            # Values are divided by 1024**2, i.e. treated as kB -> GB.
            'available_gb': round(available_kb / (1024 ** 2), 1),
            'status': status,
        }

    def _probe_cpu(self, issues: List[str]) -> Dict:
        """Compare the 1-minute load average to the CPU count; {} on failure."""
        try:
            load_avg = os.getloadavg()
        except (OSError, AttributeError) as e:
            # AttributeError: os.getloadavg does not exist on every platform.
            issues.append(f"Cannot check CPU: {e}")
            return {}

        # os.cpu_count() returns None when the count is undeterminable.
        cpu_count = os.cpu_count() or 1
        load_pct = (load_avg[0] / cpu_count) * 100
        if load_pct > 100:
            status = 'critical'
        elif load_pct > 80:
            status = 'warning'
        else:
            status = 'healthy'

        return {
            'load_average': tuple(round(avg, 2) for avg in load_avg),
            'load_pct': round(load_pct, 1),
            'cpu_count': cpu_count,
            'status': status,
        }

    def _probe_concurrency(self, issues: List[str]) -> Dict:
        """Count entries in CONDUCTOR_DIR/active against the agent limit."""
        active_dir = self.CONDUCTOR_DIR / 'active'
        try:
            if active_dir.exists():
                active_tasks = sum(1 for _ in active_dir.iterdir())
            else:
                active_tasks = 0
        except OSError as e:
            issues.append(f"Cannot check concurrency: {e}")
            return {}

        max_concurrent = self.MAX_CONCURRENT_AGENTS
        at_limit = active_tasks >= max_concurrent
        if at_limit:
            issues.append(f"Concurrency at limit ({active_tasks}/{max_concurrent})")

        return {
            'active_agents': active_tasks,
            'max_concurrent': max_concurrent,
            'available_slots': max(0, max_concurrent - active_tasks),
            'status': 'warning' if at_limit else 'healthy',
        }

    # ------------------------------------------------------------------
    # Configuration
    # ------------------------------------------------------------------

    def check_configuration_consistency(self) -> Dict:
        """
        Validate configuration consistency across system.

        Checks, in order: that ORCHESTRATOR_ROOT/config.json parses as JSON,
        that every file under ORCHESTRATOR_ROOT is readable, that at least
        MIN_ACCESSIBLE_DBS of DB_PATHS exist and are readable, and that the
        parsed config has a non-empty 'mcpServers' entry.

        Returns:
            Dict with one boolean per check plus an 'issues' list.
        """
        config_status = {
            'config_file_valid': False,
            'permissions_valid': False,
            'databases_accessible': False,
            'mcp_servers_configured': False,
            'issues': []
        }
        issues = config_status['issues']

        # Check config.json
        config = None
        config_file = self.ORCHESTRATOR_ROOT / 'config.json'
        if config_file.exists():
            try:
                config = json.loads(config_file.read_text())
                config_status['config_file_valid'] = True
            except (OSError, ValueError) as e:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                issues.append(f"Config parse error: {e}")
        else:
            issues.append("Config file not found")

        # Check file permissions. Unreadable files are counted directly
        # rather than recovered later by pattern-matching issue messages.
        unreadable = 0
        try:
            for item in self.ORCHESTRATOR_ROOT.rglob('*'):
                if item.is_file() and not os.access(item, os.R_OK):
                    issues.append(f"Not readable: {item}")
                    unreadable += 1
            config_status['permissions_valid'] = unreadable == 0
        except OSError as e:
            issues.append(f"Cannot check permissions: {e}")

        # Check database accessibility
        dbs_accessible = 0
        for db_path in self.DB_PATHS:
            if Path(db_path).exists() and os.access(db_path, os.R_OK):
                dbs_accessible += 1
            else:
                issues.append(f"Database not accessible: {db_path}")
        config_status['databases_accessible'] = dbs_accessible >= self.MIN_ACCESSIBLE_DBS

        # Check MCP server configuration. A config whose top level is not a
        # JSON object simply has no servers configured.
        if isinstance(config, dict) and config.get('mcpServers'):
            config_status['mcp_servers_configured'] = True

        return config_status

    # ------------------------------------------------------------------
    # Integration probes
    # ------------------------------------------------------------------

    def run_integration_tests(self) -> Dict:
        """
        Run integration tests across critical system paths.

        Returns:
            Dict with one boolean per probe ('kg_query', 'conductor_rw',
            'context_retrieval', 'bash_execution') plus an 'issues' list
            explaining each probe that did not pass.
        """
        issues: List[str] = []
        return {
            'kg_query': self._test_kg_query(issues),
            'conductor_rw': self._test_conductor_rw(issues),
            'context_retrieval': self._test_context_retrieval(issues),
            'bash_execution': self._test_command_execution(issues),
            'issues': issues,
        }

    def _test_kg_query(self, issues: List[str]) -> bool:
        """Run a trivial COUNT query against KG_DB_PATH."""
        import sqlite3
        from contextlib import closing

        try:
            # mode=ro: a health probe must never create the database file,
            # which a default-mode connect does when the path is missing.
            db_uri = Path(self.KG_DB_PATH).as_uri() + '?mode=ro'
            # closing(): sqlite3's own context manager only commits or
            # rolls back; it does not close the connection.
            with closing(sqlite3.connect(db_uri, uri=True)) as conn:
                row = conn.execute("SELECT COUNT(*) FROM entities LIMIT 1").fetchone()
            return row is not None
        except (sqlite3.Error, ValueError) as e:
            # ValueError: Path.as_uri() rejects relative paths.
            issues.append(f"KG query failed: {e}")
            return False

    def _test_conductor_rw(self, issues: List[str]) -> bool:
        """Write, read back and delete a scratch file in CONDUCTOR_DIR."""
        probe_file = self.CONDUCTOR_DIR / '.health_check_test'
        payload = json.dumps({'test': 'ok'})
        try:
            try:
                probe_file.write_text(payload)
                content = probe_file.read_text()
            finally:
                # Never leave the scratch file behind, even if the read failed.
                if probe_file.exists():
                    probe_file.unlink()
            return content == payload
        except OSError as e:
            issues.append(f"Conductor R/W failed: {e}")
            return False

    def _test_context_retrieval(self, issues: List[str]) -> bool:
        """Check that the context module file exists under ORCHESTRATOR_ROOT."""
        context_file = self.ORCHESTRATOR_ROOT / 'lib' / 'four_bucket_context.py'
        try:
            if context_file.exists():
                return True
            issues.append(f"Context test failed: {context_file} not found")
        except OSError as e:
            issues.append(f"Context test failed: {e}")
        return False

    def _test_command_execution(self, issues: List[str]) -> bool:
        """Spawn ``echo test`` and check that it exits with status 0."""
        try:
            result = subprocess.run(['echo', 'test'], capture_output=True, timeout=2)
        except (OSError, subprocess.SubprocessError) as e:
            issues.append(f"Bash execution failed: {e}")
            return False
        return result.returncode == 0

    # ------------------------------------------------------------------
    # Scoring
    # ------------------------------------------------------------------

    @staticmethod
    def _status_penalty(metrics: Dict, critical: int, warning: int) -> int:
        """Map a capacity section's status to the points it costs."""
        status = (metrics or {}).get('status')
        if status == 'critical':
            return critical
        if status == 'healthy':
            return 0
        # 'warning', or no status at all because the probe failed: a
        # resource that could not be measured is not assumed healthy.
        return warning

    def generate_unified_health_score(self) -> Dict:
        """
        Generate unified 0-100 health score across all subsystems.

        Weights: kg 0.20, conductor 0.20, context 0.15, scripts 0.10,
        routines 0.10, capacity 0.15, configuration 0.05, integration 0.05.
        Only disk and memory status feed the capacity score; a failed disk
        or memory probe is charged the warning-level penalty.

        Returns:
            Dict with 'overall_score', 'status' ('healthy' >= 80,
            'degraded' >= 60, otherwise 'critical'), 'component_scores',
            the raw 'capacity' / 'configuration' / 'integration' results
            and a 'timestamp' from time.time().
        """
        # Get all component scores
        kg_health = self.kg_checker.generate_health_score()
        conductor_health = self.conductor_checker.generate_conductor_health_score()
        context_health = self.context_checker.generate_context_health_score()
        script_health = self.script_checker.generate_script_health_report()
        routine_health = self.routine_validator.generate_routine_validation_report()

        # Capacity and integration
        capacity = self.check_system_capacity()
        config = self.check_configuration_consistency()
        integration = self.run_integration_tests()

        # Calculate capacity score
        capacity_score = 100
        capacity_score -= self._status_penalty(capacity.get('disk'), critical=30, warning=15)
        capacity_score -= self._status_penalty(capacity.get('memory'), critical=20, warning=10)
        capacity_score = max(0, capacity_score)

        # Configuration score
        config_score = 100
        config_score -= len(config['issues']) * 5
        if not config['config_file_valid']:
            config_score -= 20
        if not config['databases_accessible']:
            config_score -= 30
        config_score = max(0, config_score)

        # Integration score: share of probes that passed, so the divisor
        # tracks the number of probes instead of a hard-coded count.
        outcomes = [passed for name, passed in integration.items() if name != 'issues']
        passed_count = sum(1 for passed in outcomes if passed)
        integration_score = (passed_count / len(outcomes)) * 100 if outcomes else 0

        # Weighted overall score
        overall_score = (
            kg_health['overall_score'] * 0.20 +
            conductor_health['overall_score'] * 0.20 +
            context_health['overall_score'] * 0.15 +
            script_health['health_score'] * 0.10 +
            routine_health['health_score'] * 0.10 +
            capacity_score * 0.15 +
            config_score * 0.05 +
            integration_score * 0.05
        )

        if overall_score >= 80:
            status = 'healthy'
        elif overall_score >= 60:
            status = 'degraded'
        else:
            status = 'critical'

        return {
            'overall_score': round(overall_score, 1),
            'status': status,
            'component_scores': {
                'kg': round(kg_health['overall_score'], 1),
                'conductor': round(conductor_health['overall_score'], 1),
                'context': round(context_health['overall_score'], 1),
                'scripts': round(script_health['health_score'], 1),
                'routines': round(routine_health['health_score'], 1),
                'capacity': round(capacity_score, 1),
                'configuration': round(config_score, 1),
                'integration': round(integration_score, 1)
            },
            'capacity': capacity,
            'configuration': config,
            'integration': integration,
            'timestamp': time.time()
        }
if __name__ == '__main__':
    # Command-line entry point: run every check once and print a plain-text
    # report to stdout.
    orchestrator = SystemHealthOrchestrator()

    rule = "=" * 70
    print(rule)
    print("SYSTEM HEALTH ORCHESTRATOR")
    print(rule)

    print("\nRunning unified health check...")
    health = orchestrator.generate_unified_health_score()

    print(f"\nOVERALL HEALTH SCORE: {health['overall_score']}/100 ({health['status'].upper()})")

    print("\nComponent Scores:")
    for component, score in health['component_scores'].items():
        print(f" {component:20} {score:6.1f}/100")

    print("\nSystem Capacity:")
    capacity = health['capacity']
    for label, section, pct_key in (
        ('Disk', 'disk', 'usage_pct'),
        ('Memory', 'memory', 'usage_pct'),
        ('CPU', 'cpu', 'load_pct'),
    ):
        # A probe that failed leaves its section as an empty dict, so the
        # metric keys cannot be indexed unconditionally; the reason for the
        # failure is listed under capacity['issues'] below.
        metrics = capacity.get(section) or {}
        if pct_key in metrics:
            print(f" {label}: {metrics[pct_key]}% ({metrics.get('status', 'unknown')})")
        else:
            print(f" {label}: unavailable")

    if capacity['issues']:
        print(f"\nCapacity Issues ({len(capacity['issues'])}):")
        for issue in capacity['issues']:
            print(f" - {issue}")

    config_issues = health['configuration']['issues']
    if config_issues:
        print(f"\nConfiguration Issues ({len(config_issues)}):")
        shown = config_issues[:5]
        for issue in shown:
            print(f" - {issue}")
        # Say so when the list was truncated instead of silently dropping entries.
        hidden = len(config_issues) - len(shown)
        if hidden > 0:
            print(f" ... and {hidden} more")