Based on claude-code-tools TmuxCLIController, this refactor:
- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
361 lines
13 KiB
Python
361 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
System Health Orchestrator
|
|
|
|
Master health check coordinator that validates:
|
|
- System capacity (disk, memory, CPU, concurrency)
|
|
- Configuration consistency
|
|
- Integration testing across all subsystems
|
|
- Unified health scoring (0-100)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import List, Dict
|
|
|
|
from kg_health_checker import KGHealthChecker
|
|
from conductor_health_checker import ConductorHealthChecker
|
|
from context_health_checker import ContextHealthChecker
|
|
from script_health_checker import ScriptHealthChecker
|
|
from routine_validator import RoutineValidator
|
|
|
|
|
|
class SystemHealthOrchestrator:
    """Master orchestrator for system-wide health validation.

    Combines the scores reported by the subsystem checkers with locally
    computed capacity, configuration and integration results into a single
    weighted 0-100 score.
    """

    def __init__(self):
        """Instantiate one checker per subsystem."""
        self.kg_checker = KGHealthChecker()
        self.conductor_checker = ConductorHealthChecker()
        self.context_checker = ContextHealthChecker()
        self.script_checker = ScriptHealthChecker()
        self.routine_validator = RoutineValidator()

    def check_system_capacity(self) -> Dict:
        """
        Check system capacity constraints.

        Each probe (disk, memory, CPU, concurrency) runs independently. When
        a probe raises, its entry stays an empty dict and a message is added
        to ``issues``, so callers must not assume ``status`` is present.

        Returns:
            Dict with keys 'disk', 'memory', 'cpu', 'concurrency' (metric
            dicts) and 'issues' (list of human-readable strings).
        """
        capacity = {
            'disk': {},
            'memory': {},
            'cpu': {},
            'concurrency': {},
            'issues': []
        }

        # Disk usage of the root filesystem
        try:
            total, used, free = shutil.disk_usage('/')
            disk_usage_pct = (used / total) * 100
            disk_free_gb = free / (1024 ** 3)

            capacity['disk'] = {
                'usage_pct': round(disk_usage_pct, 1),
                'free_gb': round(disk_free_gb, 1),
                'status': 'critical' if disk_usage_pct > 90 else 'warning' if disk_usage_pct > 80 else 'healthy'
            }

            # NOTE(review): status turns 'warning' above 80% but an issue is
            # only recorded above 85% - confirm which threshold is intended.
            if disk_usage_pct > 90:
                capacity['issues'].append(f"Disk critically full ({disk_usage_pct:.1f}%)")
            elif disk_usage_pct > 85:
                capacity['issues'].append(f"Disk usage high ({disk_usage_pct:.1f}%)")
        except Exception as e:
            capacity['issues'].append(f"Cannot check disk: {e}")

        # Memory usage, parsed from /proc/meminfo (values are in kB)
        try:
            mem_info = {}
            with open('/proc/meminfo', 'r') as f:
                for line in f:
                    parts = line.split()
                    # Skip blank or key-only lines instead of failing the probe.
                    if len(parts) >= 2:
                        mem_info[parts[0].rstrip(':')] = int(parts[1])

            total_mem = mem_info.get('MemTotal', 0)
            available_mem = mem_info.get('MemAvailable', 0)
            used_mem = total_mem - available_mem
            mem_usage_pct = (used_mem / max(total_mem, 1)) * 100

            capacity['memory'] = {
                'usage_pct': round(mem_usage_pct, 1),
                'available_gb': round(available_mem / (1024 ** 2), 1),
                'status': 'critical' if mem_usage_pct > 90 else 'warning' if mem_usage_pct > 85 else 'healthy'
            }

            if mem_usage_pct > 90:
                capacity['issues'].append(f"Memory usage critical ({mem_usage_pct:.1f}%)")
        except Exception as e:
            capacity['issues'].append(f"Cannot check memory: {e}")

        # CPU load: 1-minute load average relative to the CPU count
        try:
            load_avg = os.getloadavg()
            # os.cpu_count() may return None; fall back to a single CPU.
            cpu_count = os.cpu_count() or 1

            load_pct = (load_avg[0] / cpu_count) * 100

            capacity['cpu'] = {
                'load_average': tuple(round(value, 2) for value in load_avg),
                'load_pct': round(load_pct, 1),
                'cpu_count': cpu_count,
                'status': 'critical' if load_pct > 100 else 'warning' if load_pct > 80 else 'healthy'
            }
        except Exception as e:
            capacity['issues'].append(f"Cannot check CPU: {e}")

        # Concurrency limits: one directory entry per active task
        try:
            conductor_dir = Path('/home/admin/conductor/active')
            active_tasks = len(list(conductor_dir.iterdir())) if conductor_dir.exists() else 0

            max_concurrent = 4  # Design limit
            capacity['concurrency'] = {
                'active_agents': active_tasks,
                'max_concurrent': max_concurrent,
                'available_slots': max(0, max_concurrent - active_tasks),
                'status': 'warning' if active_tasks >= max_concurrent else 'healthy'
            }

            if active_tasks >= max_concurrent:
                capacity['issues'].append(f"Concurrency at limit ({active_tasks}/{max_concurrent})")
        except Exception as e:
            capacity['issues'].append(f"Cannot check concurrency: {e}")

        return capacity

    def check_configuration_consistency(self) -> Dict:
        """
        Validate configuration consistency across system.

        Returns:
            Dict with boolean flags 'config_file_valid', 'permissions_valid',
            'databases_accessible', 'mcp_servers_configured' and an 'issues'
            list describing every problem found.
        """
        config_status = {
            'config_file_valid': False,
            'permissions_valid': False,
            'databases_accessible': False,
            'mcp_servers_configured': False,
            'issues': []
        }

        # Check config.json
        config = None
        config_file = Path('/opt/server-agents/orchestrator/config.json')
        if config_file.exists():
            try:
                config = json.loads(config_file.read_text())
                config_status['config_file_valid'] = True
            except Exception as e:
                config_status['issues'].append(f"Config parse error: {e}")
        else:
            config_status['issues'].append("Config file not found")

        # Check file permissions: every file under the orchestrator root must
        # be readable. Unreadable files are counted directly rather than by
        # substring-matching issue text, which could pick up unrelated issues.
        # NOTE(review): the original carried a no-op branch commented "Python
        # files should be executable"; nothing is enforced for that here.
        try:
            orchestrator_root = Path('/opt/server-agents/orchestrator')
            unreadable = 0
            for item in orchestrator_root.rglob('*'):
                if item.is_file() and not os.access(item, os.R_OK):
                    config_status['issues'].append(f"Not readable: {item}")
                    unreadable += 1

            config_status['permissions_valid'] = unreadable == 0
        except Exception as e:
            config_status['issues'].append(f"Cannot check permissions: {e}")

        # Check database accessibility
        db_paths = [
            '/etc/luz-knowledge/research.db',
            '/opt/server-agents/state/task_queue.db',
        ]
        db_paths.insert(1, '/etc/luz-knowledge/projects.db')

        dbs_accessible = 0
        for db_path in db_paths:
            if Path(db_path).exists() and os.access(db_path, os.R_OK):
                dbs_accessible += 1
            else:
                config_status['issues'].append(f"Database not accessible: {db_path}")

        # Two readable databases out of three are enough to pass.
        config_status['databases_accessible'] = dbs_accessible >= 2

        # Check MCP server configuration. A config whose top level is not a
        # JSON object simply counts as "not configured".
        if isinstance(config, dict) and config.get('mcpServers', {}):
            config_status['mcp_servers_configured'] = True

        return config_status

    def run_integration_tests(self) -> Dict:
        """
        Run integration tests across critical system paths.

        Returns:
            Dict with boolean results 'kg_query', 'conductor_rw',
            'context_retrieval', 'bash_execution' and an 'issues' list with
            one message per test that raised.
        """
        tests = {
            'kg_query': False,
            'conductor_rw': False,
            'context_retrieval': False,
            'bash_execution': False,
            'issues': []
        }

        # Test 1: KG query
        try:
            import sqlite3
            from contextlib import closing

            # The sqlite3 connection context manager only commits or rolls
            # back; closing() is what actually releases the connection.
            with closing(sqlite3.connect('/etc/luz-knowledge/research.db')) as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) FROM entities LIMIT 1")
                result = cursor.fetchone()
                tests['kg_query'] = result is not None
        except Exception as e:
            tests['issues'].append(f"KG query failed: {e}")

        # Test 2: Conductor read/write round trip through a scratch file
        try:
            conductor_dir = Path('/home/admin/conductor')
            test_file = conductor_dir / '.health_check_test'
            try:
                test_file.write_text(json.dumps({'test': 'ok'}))
                content = test_file.read_text()
            finally:
                # Never leave the probe file behind, even if the read failed.
                if test_file.exists():
                    test_file.unlink()
            tests['conductor_rw'] = 'test' in content
        except Exception as e:
            tests['issues'].append(f"Conductor R/W failed: {e}")

        # Test 3: Context retrieval simulation (presence of the module file).
        # Path is deliberately NOT re-imported here: a function-level import
        # makes the name local to the whole function and broke Test 2 with
        # UnboundLocalError.
        try:
            context_file = Path('/opt/server-agents/orchestrator/lib/four_bucket_context.py')
            if context_file.exists():
                tests['context_retrieval'] = True
        except Exception as e:
            tests['issues'].append(f"Context test failed: {e}")

        # Test 4: Bash execution
        try:
            result = subprocess.run(['echo', 'test'], capture_output=True, timeout=2)
            tests['bash_execution'] = result.returncode == 0
        except Exception as e:
            tests['issues'].append(f"Bash execution failed: {e}")

        return tests

    def generate_unified_health_score(self) -> Dict:
        """
        Generate unified 0-100 health score across all subsystems.

        Weights: KG 20%, conductor 20%, context 15%, scripts 10%, routines
        10%, capacity 15%, configuration 5%, integration 5%.

        A capacity probe that failed (empty metrics dict) contributes no
        deduction to the capacity score; the failure remains visible in
        ``capacity['issues']``.

        Returns:
            Dict with 'overall_score', 'status', 'component_scores', the raw
            'capacity' / 'configuration' / 'integration' results and a
            'timestamp' (seconds since the epoch).
        """
        # Get all component scores
        kg_health = self.kg_checker.generate_health_score()
        conductor_health = self.conductor_checker.generate_conductor_health_score()
        context_health = self.context_checker.generate_context_health_score()
        script_health = self.script_checker.generate_script_health_report()
        routine_health = self.routine_validator.generate_routine_validation_report()

        # Capacity and integration
        capacity = self.check_system_capacity()
        config = self.check_configuration_consistency()
        integration = self.run_integration_tests()

        # Calculate capacity score. .get() is required because a failed probe
        # leaves its metrics dict empty (no 'status' key).
        capacity_score = 100
        disk_status = capacity['disk'].get('status')
        if disk_status == 'critical':
            capacity_score -= 30
        elif disk_status == 'warning':
            capacity_score -= 15

        memory_status = capacity['memory'].get('status')
        if memory_status == 'critical':
            capacity_score -= 20
        elif memory_status == 'warning':
            capacity_score -= 10

        # Configuration score
        config_score = 100
        config_score -= len(config['issues']) * 5
        if not config['config_file_valid']:
            config_score -= 20
        if not config['databases_accessible']:
            config_score -= 30

        # Integration score: share of passing tests ('issues' is not a test)
        test_names = [name for name in integration if name != 'issues']
        passed = sum(1 for name in test_names if integration[name])
        integration_score = (passed / max(len(test_names), 1)) * 100

        # Weighted overall score
        overall_score = (
            kg_health['overall_score'] * 0.20 +
            conductor_health['overall_score'] * 0.20 +
            context_health['overall_score'] * 0.15 +
            script_health['health_score'] * 0.10 +
            routine_health['health_score'] * 0.10 +
            max(0, capacity_score) * 0.15 +
            max(0, config_score) * 0.05 +
            integration_score * 0.05
        )

        return {
            'overall_score': round(overall_score, 1),
            'status': 'healthy' if overall_score >= 80 else 'degraded' if overall_score >= 60 else 'critical',
            'component_scores': {
                'kg': round(kg_health['overall_score'], 1),
                'conductor': round(conductor_health['overall_score'], 1),
                'context': round(context_health['overall_score'], 1),
                'scripts': round(script_health['health_score'], 1),
                'routines': round(routine_health['health_score'], 1),
                'capacity': round(max(0, capacity_score), 1),
                'configuration': round(max(0, config_score), 1),
                'integration': round(integration_score, 1)
            },
            'capacity': capacity,
            'configuration': config,
            'integration': integration,
            'timestamp': time.time()
        }
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: run the unified check and print a report.
    orchestrator = SystemHealthOrchestrator()

    print("=" * 70)
    print("SYSTEM HEALTH ORCHESTRATOR")
    print("=" * 70)

    print("\nRunning unified health check...")
    health = orchestrator.generate_unified_health_score()

    print(f"\nOVERALL HEALTH SCORE: {health['overall_score']}/100 ({health['status'].upper()})")

    print("\nComponent Scores:")
    for component, score in health['component_scores'].items():
        print(f" {component:20} {score:6.1f}/100")

    print("\nSystem Capacity:")
    capacity = health['capacity']
    # A capacity probe that failed leaves an empty metrics dict, so read the
    # values with .get() and print placeholders instead of raising KeyError
    # before the issue list below can be shown.
    for label, section, pct_key in (
        ('Disk', 'disk', 'usage_pct'),
        ('Memory', 'memory', 'usage_pct'),
        ('CPU', 'cpu', 'load_pct'),
    ):
        metrics = capacity.get(section) or {}
        print(f" {label}: {metrics.get(pct_key, 'n/a')}% ({metrics.get('status', 'unknown')})")

    if capacity['issues']:
        print(f"\nCapacity Issues ({len(capacity['issues'])}):")
        for issue in capacity['issues']:
            print(f" - {issue}")

    if health['configuration']['issues']:
        print(f"\nConfiguration Issues ({len(health['configuration']['issues'])}):")
        # Only the first five configuration issues are listed.
        for issue in health['configuration']['issues'][:5]:
            print(f" - {issue}")
|