#!/usr/bin/env python3
"""
Error Pattern Analyzer

Analyzes system issues to identify systemic patterns:
- Groups issues by root cause
- Calculates frequency and impact
- Recommends systemic fixes
- Identifies precursors and prevention strategies
"""

import time
from typing import List, Dict, Tuple
from collections import defaultdict


class ErrorPatternAnalyzer:
    """Analyze error patterns to identify systemic issues."""

    # Known systemic patterns. Each entry documents the failure mode,
    # suspected root causes, the indicator strings that evidence it,
    # the evidence count required before the pattern is reported
    # (frequency_threshold), its impact, and the recommended prevention.
    PATTERNS = {
        'incomplete_research_blocking': {
            'description': 'Research sessions ask user question, never resume',
            'root_causes': ['Research agent ends without follow-up',
                            'User question not resumed'],
            'indicators': ['unresolved_question', 'claude_no_conclusion'],
            'frequency_threshold': 5,  # Per 30 days
            'impact': 'KG quality degradation, user confusion',
            'prevention': 'Block session completion if unresolved questions exist'
        },
        'task_stalling_under_load': {
            'description': 'Long-running tasks timeout heartbeat updates',
            'root_causes': ['Heartbeat updates blocked', 'Task exceeds timeout',
                            'Process hangs'],
            'indicators': ['heartbeat_timeout', 'process_not_found'],
            'frequency_threshold': 3,  # Per 30 days
            'impact': 'Tasks marked running indefinitely, resources held',
            'prevention': 'Increase heartbeat timeout or add intermediate progress signals'
        },
        'disk_pressure_growth': {
            'description': 'Old conductor tasks accumulating, not archived',
            'root_causes': ['No automatic archival', 'Task cleanup not running',
                            'Large task logs'],
            'indicators': ['disk_usage_high', 'old_tasks_accumulating'],
            'frequency_threshold': 5,  # %/month growth
            'impact': 'Approaching critical capacity, performance degradation',
            'prevention': 'Implement automatic archival of >30 day tasks'
        },
        'missing_documentation': {
            'description': 'Research findings incomplete or not documented',
            'root_causes': ['No mandatory documentation', 'Findings not extracted',
                            'Synthesis missing'],
            'indicators': ['incomplete_duration', 'missing_findings'],
            'frequency_threshold': 8,  # Per 30 days
            'impact': 'Knowledge loss, difficult to track progress',
            'prevention': 'Require structured findings section before completion'
        },
        'script_quality_drift': {
            'description': 'Script quality degrades over time',
            'root_causes': ['No validation on commit', 'Dependencies change',
                            'Type hints missing'],
            'indicators': ['syntax_error', 'unused_import', 'low_type_coverage'],
            'frequency_threshold': 3,  # Issues per week
            'impact': 'Fragility, hard to maintain, bugs increase',
            'prevention': 'Enforce validation in pre-commit hooks'
        }
    }

    def __init__(self):
        """Initialize error pattern analyzer."""
        # Raw issue records collected over time (currently unused by the
        # analysis methods; retained for future accumulation).
        self.issues_log: List[Dict] = []
        # Pattern name -> list of matching issue records.
        self.pattern_matches: Dict[str, List[Dict]] = defaultdict(list)

    def analyze_kg_issues(self, kg_findings: List[Dict]) -> Dict:
        """
        Analyze KG findings for error patterns.

        Args:
            kg_findings: List of findings from KGHealthChecker

        Returns:
            Dict with pattern analysis (pattern name -> match details)
        """
        patterns = {}

        # Pattern 1: Incomplete Research Blocking
        unresolved = [f for f in kg_findings
                      if f.get('pattern') == 'unresolved_question']
        if len(unresolved) >= self.PATTERNS['incomplete_research_blocking']['frequency_threshold']:
            patterns['incomplete_research_blocking'] = {
                'matched': True,
                'evidence_count': len(unresolved),
                'examples': unresolved[:3],
                'severity': 'high' if len(unresolved) > 10 else 'medium',
                'frequency_30d': len(unresolved),
                'root_cause_analysis': self._analyze_incomplete_research(unresolved),
                'recommended_fix': self.PATTERNS['incomplete_research_blocking']['prevention']
            }

        # Pattern 2: Missing Documentation
        no_conclusion = [f for f in kg_findings
                         if f.get('pattern') == 'claude_no_conclusion']
        if len(no_conclusion) >= self.PATTERNS['missing_documentation']['frequency_threshold']:
            patterns['missing_documentation'] = {
                'matched': True,
                'evidence_count': len(no_conclusion),
                'examples': no_conclusion[:3],
                'severity': 'medium',
                'root_cause_analysis': 'Claude responses present but missing synthesis/conclusions',
                'recommended_fix': 'Add validation requiring "Conclusion:" or "Summary:" section'
            }

        return patterns

    def analyze_conductor_issues(self, conductor_stalled: List[Dict],
                                 disk_usage_pct: float) -> Dict:
        """
        Analyze conductor issues for error patterns.

        Args:
            conductor_stalled: List of stalled tasks
            disk_usage_pct: Disk usage percentage (0-100)

        Returns:
            Dict with pattern analysis
        """
        patterns = {}

        # Pattern 1: Task Stalling Under Load
        if len(conductor_stalled) >= self.PATTERNS['task_stalling_under_load']['frequency_threshold']:
            patterns['task_stalling_under_load'] = {
                'matched': True,
                'evidence_count': len(conductor_stalled),
                'examples': conductor_stalled[:3],
                'severity': 'high' if len(conductor_stalled) > 5 else 'medium',
                'root_cause_analysis': self._analyze_stalled_tasks(conductor_stalled),
                'recommended_fix': self.PATTERNS['task_stalling_under_load']['prevention']
            }

        # Pattern 2: Disk Pressure Growth (only reported above 80% usage)
        if disk_usage_pct > 80:
            patterns['disk_pressure_growth'] = {
                'matched': True,
                'current_usage_pct': disk_usage_pct,
                'severity': ('critical' if disk_usage_pct > 90
                             else 'high' if disk_usage_pct > 85
                             else 'medium'),
                'estimated_growth_pct_month': 5,  # Historical average
                # Days remaining until 95% usage, assuming 5%/month growth.
                'days_until_critical': max(0, int((95 - disk_usage_pct) / 5 * 30)),
                'root_cause_analysis': 'Old conductor tasks accumulating without archival',
                'recommended_fix': self.PATTERNS['disk_pressure_growth']['prevention']
            }

        return patterns

    def analyze_script_issues(self, script_health: Dict) -> Dict:
        """
        Analyze script quality for error patterns.

        Args:
            script_health: Script health report data; expected to contain a
                'scripts' list of {'script': name, 'status': str} entries.

        Returns:
            Dict with pattern analysis
        """
        patterns = {}

        # Pattern 1: Script Quality Drift
        # Use .get() so a malformed entry without 'status' does not raise
        # KeyError and abort the whole analysis.
        problematic_scripts = [s for s in script_health.get('scripts', [])
                               if s.get('status') in ['syntax_error', 'issues']]
        if len(problematic_scripts) >= self.PATTERNS['script_quality_drift']['frequency_threshold']:
            patterns['script_quality_drift'] = {
                'matched': True,
                'problematic_count': len(problematic_scripts),
                'examples': [{'script': s.get('script'), 'status': s.get('status')}
                             for s in problematic_scripts[:3]],
                'severity': 'high' if len(problematic_scripts) > 5 else 'medium',
                'root_cause_analysis': 'No pre-commit validation enforcing script quality',
                'recommended_fix': self.PATTERNS['script_quality_drift']['prevention']
            }

        return patterns

    def run_full_pattern_analysis(self, all_health_data: Dict) -> Dict:
        """
        Run comprehensive pattern analysis across all systems.

        Args:
            all_health_data: Complete health data from orchestrator

        Returns:
            Dict with all identified patterns, a summary, systemic
            recommendations, and a timestamp.
        """
        all_patterns = {}

        # Analyze KG issues
        kg_issues = self._extract_kg_issues(all_health_data)
        kg_patterns = self.analyze_kg_issues(kg_issues)
        all_patterns.update(kg_patterns)

        # Analyze conductor issues
        conductor_stalled = self._extract_conductor_stalled(all_health_data)
        disk_usage = all_health_data.get('capacity', {}).get('disk', {}).get('usage_pct', 0)
        conductor_patterns = self.analyze_conductor_issues(conductor_stalled, disk_usage)
        all_patterns.update(conductor_patterns)

        # Analyze script issues.
        # NOTE(review): the whole health dict is passed here, so script data
        # is expected under a top-level 'scripts' key — confirm against the
        # orchestrator's report schema.
        script_patterns = self.analyze_script_issues(all_health_data)
        all_patterns.update(script_patterns)

        return {
            'total_patterns': len(all_patterns),
            'patterns': all_patterns,
            'summary': self._generate_pattern_summary(all_patterns),
            'systemic_recommendations': self._generate_systemic_recommendations(all_patterns),
            'timestamp': time.time()
        }

    def _analyze_incomplete_research(self, unresolved_findings: List[Dict]) -> str:
        """Generate detailed root cause analysis for incomplete research."""
        if not unresolved_findings:
            return "No data available"

        # Analyze pattern (guarded above, so division is safe)
        avg_duration = (sum(f.get('duration_secs', 0) for f in unresolved_findings)
                        / len(unresolved_findings))

        analysis = f"""
Root Cause: Research agent creates initial analysis but asks user question.
User answer is expected but session is marked complete anyway.

Evidence:
- {len(unresolved_findings)} sessions ended with unresolved questions
- Average session duration: {int(avg_duration)}s
- Pattern: Initial research → Claude analysis → "What do you think?" → END

Impact:
- User confusion (unclear next steps)
- Knowledge incomplete (user input never captured)
- KG quality degraded (research marked done but unresolved)

Systemic Issue: Research workflow doesn't enforce follow-up on user questions.
Sessions can complete even with pending decisions.
"""
        return analysis.strip()

    def _analyze_stalled_tasks(self, stalled_tasks: List[Dict]) -> str:
        """Generate detailed root cause analysis for stalled tasks."""
        if not stalled_tasks:
            return "No data available"

        heartbeat_timeouts = [t for t in stalled_tasks
                              if t.get('stall_reason') == 'heartbeat_timeout']
        process_missing = [t for t in stalled_tasks
                           if t.get('stall_reason') == 'process_not_found']

        analysis = f"""
Root Cause: Long-running tasks exceed heartbeat timeout window.
No intermediate progress updates during execution.

Evidence:
- {len(heartbeat_timeouts)} tasks with heartbeat timeout
- {len(process_missing)} tasks with missing process
- Pattern: Task starts → no heartbeat update → marked stalled after 300s

Impact:
- Resources held indefinitely
- Tasks can't recover automatically
- System capacity wasted

Systemic Issue: Heartbeat mechanism assumes short tasks (< 5 min).
Long-running tasks (> 10 min) always timeout regardless of progress.
No intermediate signal for slow but progressing tasks.
"""
        return analysis.strip()

    def _generate_pattern_summary(self, patterns: Dict) -> Dict:
        """Generate summary statistics for all patterns.

        Counts matched patterns per severity bucket and totals evidence
        items. 'critical' severities (emitted by disk-pressure analysis)
        are tracked in their own counter.
        """
        summary = {
            'total_patterns_detected': len(patterns),
            'critical_severity': 0,  # new counter; previously criticals were dropped
            'high_severity': 0,
            'medium_severity': 0,
            'total_evidence_items': 0
        }

        for pattern_name, pattern_data in patterns.items():
            if pattern_data.get('matched'):
                severity = pattern_data.get('severity', 'medium')
                # BUG FIX: 'critical' severity (disk > 90%) was previously
                # counted in neither bucket.
                if severity == 'critical':
                    summary['critical_severity'] += 1
                elif severity == 'high':
                    summary['high_severity'] += 1
                elif severity == 'medium':
                    summary['medium_severity'] += 1

                summary['total_evidence_items'] += pattern_data.get('evidence_count', 1)

        return summary

    def _generate_systemic_recommendations(self, patterns: Dict) -> List[str]:
        """Generate systemic recommendations from identified patterns."""
        recommendations = []

        for pattern_name, pattern_data in patterns.items():
            if pattern_data.get('matched'):
                severity = pattern_data.get('severity', 'medium')
                # BUG FIX: 'critical' patterns previously fell through to
                # "[WARNING]" because only 'high' was checked.
                prefix = "[URGENT]" if severity in ('critical', 'high') else "[WARNING]"
                recommendations.append(
                    f"{prefix} {pattern_data.get('recommended_fix', 'Fix this issue')}"
                )

        # Add forward-looking recommendations
        if len(recommendations) > 0:
            recommendations.append("\nLong-term Systemic Fixes:")
            recommendations.append(" 1. Implement pre-commit validation for script quality")
            recommendations.append(" 2. Add mandatory documentation sections for research")
            recommendations.append(" 3. Increase heartbeat timeout or add intermediate signals")
            recommendations.append(" 4. Implement automatic archival for old tasks")

        return recommendations

    def _extract_kg_issues(self, health_data: Dict) -> List[Dict]:
        """Extract KG issues from health data."""
        # This would be populated from actual KG checker results
        return []

    def _extract_conductor_stalled(self, health_data: Dict) -> List[Dict]:
        """Extract stalled conductor tasks from health data."""
        # This would be populated from actual conductor checker results
        return []


if __name__ == '__main__':
    analyzer = ErrorPatternAnalyzer()

    # Example: Run pattern analysis with sample data
    sample_data = {
        'capacity': {'disk': {'usage_pct': 82}},
        'integration': {}
    }

    result = analyzer.run_full_pattern_analysis(sample_data)

    print("=" * 70)
    print("ERROR PATTERN ANALYSIS")
    print("=" * 70)
    print(f"\nPatterns detected: {result['total_patterns']}")
    print(f"High severity: {result['summary']['high_severity']}")
    print(f"Medium severity: {result['summary']['medium_severity']}")
    print(f"\nSystemic Recommendations:")
    for rec in result['systemic_recommendations']:
        print(f"  {rec}")