#!/usr/bin/env python3
"""
Skill & Documentation Usage Analyzer for Luzia

Provides comprehensive analysis of:
1. Which skills are being used during task dispatch
2. Documentation file access patterns
3. Usage trends and statistics
4. Skill-to-documentation relationships
5. Project-specific skill usage

This tool reads from:
- Queue entries: /var/lib/luzia/queue/pending/
- Job metadata: /var/log/luz-orchestrator/jobs/
- Knowledge graph databases: /etc/luz-knowledge/
"""

import json
import sqlite3
import os
from pathlib import Path
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple, Any
from collections import defaultdict, Counter
import re


class SkillUsageAnalyzer:
    """Analyze skill and documentation usage patterns."""

    QUEUE_BASE = Path("/var/lib/luzia/queue")
    JOB_LOG_BASE = Path("/var/log/luz-orchestrator/jobs")
    KG_BASE = Path("/etc/luz-knowledge")

    CLAUDE_DEV_KEYWORDS = {
        'skill': 'claude_dev',
        'plugin': 'claude_dev',
        'command': 'claude_dev',
        'mcp': 'claude_dev',
        'hook': 'claude_dev',
        'slash': 'claude_dev',
        'claude code': 'claude_dev',
        'agent': 'agent_framework',
        'tool': 'tool_framework',
        'integration': 'integration',
        'custom command': 'claude_dev',
        '.claude': 'claude_config',
        'slash command': 'claude_dev',
        'skill file': 'claude_dev',
        'skill library': 'claude_dev',
        'tool specification': 'tool_spec',
        'mcp server': 'mcp',
        'mcp config': 'mcp',
        'anthropic': 'anthropic_api',
        'claude-code': 'claude_dev',
    }

    def __init__(self):
        self.skills_detected = defaultdict(int)
        self.doc_references = defaultdict(int)
        self.project_skill_distribution = defaultdict(lambda: defaultdict(int))
        self.job_metadata = []
        self.queue_entries = []

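    # Illustrative queue-entry shape, inferred from the fields this method reads;
    # the real schema may carry additional keys and different values:
    #   {
    #     "id": "task-0001",
    #     "project": "orchestrator",
    #     "prompt": "Add an mcp server hook ...",
    #     "skill_match": "claude_dev",
    #     "priority": "high",
    #     "enqueued_at": "2025-01-01T12:00:00"
    #   }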
    def analyze_queue_entries(self) -> Dict[str, Any]:
        """Analyze pending queue entries for skill_match fields."""
        result = {
            "total_tasks": 0,
            "tasks_with_skill": 0,
            "skills_found": {},
            "by_project": {},
            "by_priority": {"high": 0, "normal": 0},
            "entries": [],
        }

        for tier_dir in [self.QUEUE_BASE / "pending" / "high",
                         self.QUEUE_BASE / "pending" / "normal"]:
            if not tier_dir.exists():
                continue

            tier_name = tier_dir.name
            for entry_file in tier_dir.glob("*.json"):
                try:
                    entry = json.loads(entry_file.read_text())
                    result["total_tasks"] += 1
                    result["by_priority"][tier_name] += 1

                    project = entry.get("project", "unknown")
                    if project not in result["by_project"]:
                        result["by_project"][project] = {"total": 0, "with_skill": 0}
                    result["by_project"][project]["total"] += 1

                    skill = entry.get("skill_match")
                    if skill:
                        result["tasks_with_skill"] += 1
                        result["by_project"][project]["with_skill"] += 1
                        result["skills_found"][skill] = result["skills_found"].get(skill, 0) + 1
                        self.skills_detected[skill] += 1

                    result["entries"].append({
                        "id": entry.get("id"),
                        "project": project,
                        "skill": skill,
                        "priority": entry.get("priority"),
                        "enqueued_at": entry.get("enqueued_at"),
                    })

                except (json.JSONDecodeError, IOError):
                    pass

        return result

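    # Illustrative meta.json shape, inferred from the fields this method reads;
    # actual job metadata may include more keys:
    #   {
    #     "id": "job-0001",
    #     "project": "orchestrator",
    #     "task": "Refactor queue handling",
    #     "skill": "claude_dev",
    #     "started": "2025-01-01T12:00:00",
    #     "status": "done",
    #     "debug": false
    #   }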
    def analyze_job_metadata(self, hours: int = 24) -> Dict[str, Any]:
        """Analyze job metadata for skill usage patterns."""
        result = {
            "time_window": f"Last {hours} hours",
            "total_jobs": 0,
            "jobs_with_skill": 0,
            "skills_used": {},
            "debug_mode_tasks": 0,
            "by_project": {},
            "jobs": [],
        }

        since = datetime.now() - timedelta(hours=hours)

        if not self.JOB_LOG_BASE.exists():
            return result

        for job_dir in self.JOB_LOG_BASE.glob("*/meta.json"):
            try:
                meta = json.loads(job_dir.read_text())
                started = datetime.fromisoformat(meta.get("started", ""))

                if started < since:
                    continue

                result["total_jobs"] += 1
                project = meta.get("project", "unknown")

                if project not in result["by_project"]:
                    result["by_project"][project] = {
                        "total": 0,
                        "with_skill": 0,
                        "debug_mode": 0,
                    }
                result["by_project"][project]["total"] += 1

                skill = meta.get("skill")
                if skill:
                    result["jobs_with_skill"] += 1
                    result["by_project"][project]["with_skill"] += 1
                    result["skills_used"][skill] = result["skills_used"].get(skill, 0) + 1
                    self.skills_detected[skill] += 1

                # Check for debug mode (indicates Claude dev task)
                if meta.get("debug"):
                    result["debug_mode_tasks"] += 1
                    result["by_project"][project]["debug_mode"] += 1

                result["jobs"].append({
                    "id": meta.get("id"),
                    "project": project,
                    "task": meta.get("task", "")[:100],
                    "skill": skill,
                    "started": meta.get("started"),
                    "status": meta.get("status"),
                    "debug": meta.get("debug", False),
                })

            except (json.JSONDecodeError, IOError, ValueError):
                pass

        return result

    def detect_skills_in_tasks(self) -> Dict[str, List[Dict]]:
        """Detect skills from task prompts using keyword analysis."""
        result = defaultdict(list)

        # Analyze queue entries: pending/<tier>/*.json, matching the layout
        # used by analyze_queue_entries (the original "*/*/*.json" glob was one
        # directory level too deep and would never match).
        if self.QUEUE_BASE.exists():
            for entry_file in (self.QUEUE_BASE / "pending").glob("*/*.json"):
                try:
                    entry = json.loads(entry_file.read_text())
                    prompt = entry.get("prompt", "").lower()
                    task_id = entry.get("id", "unknown")
                    project = entry.get("project", "unknown")

                    detected = self._detect_keywords(prompt)
                    if detected:
                        for skill_type in set(detected.values()):
                            result[skill_type].append({
                                "task_id": task_id,
                                "project": project,
                                "prompt": entry.get("prompt", "")[:100],
                            })

                except (json.JSONDecodeError, IOError):
                    pass

        return result

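    # Plain substring matching against CLAUDE_DEV_KEYWORDS; multi-word keys only
    # hit when the full phrase appears. Illustrative example, traced from the
    # mapping above:
    #   _detect_keywords("add an mcp server hook")
    #   -> {'mcp': 'claude_dev', 'hook': 'claude_dev', 'mcp server': 'mcp'}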
    def _detect_keywords(self, text: str) -> Dict[str, str]:
        """Detect skill keywords in text."""
        detected = {}
        for keyword, skill_type in self.CLAUDE_DEV_KEYWORDS.items():
            if keyword in text:
                detected[keyword] = skill_type
        return detected

    def analyze_documentation_usage(self) -> Dict[str, Any]:
        """Analyze documentation file usage patterns."""
        result = {
            "doc_files": {},
            "doc_references": {},
            "sync_patterns": {},
        }

        # Check for .md files in project directories
        for doc_file in Path("/opt/server-agents/orchestrator").glob("*.md"):
            stat = doc_file.stat()
            result["doc_files"][doc_file.name] = {
                "size_bytes": stat.st_size,
                "last_modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            }

        # Analyze job logs for doc references
        for job_dir in self.JOB_LOG_BASE.glob("*/dialogue/*/"):
            try:
                dialogue_file = job_dir / "agent.md"
                if dialogue_file.exists():
                    content = dialogue_file.read_text()
                    # Look for doc references
                    doc_refs = self._find_doc_references(content)
                    for ref in doc_refs:
                        result["doc_references"][ref] = result["doc_references"].get(ref, 0) + 1
            except (IOError, OSError):
                pass

        return result

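    # Regex-based reference extraction; matching is case-insensitive, so
    # lower-case .md names are picked up as well. Illustrative example, traced
    # from the patterns below:
    #   _find_doc_references("see [QUEUE.md], ROUTING.md and run luzia docs queue")
    #   -> ['QUEUE.md', 'ROUTING.md', 'queue']  (deduplicated; order not guaranteed)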
    def _find_doc_references(self, text: str) -> List[str]:
        """Find references to documentation files in text."""
        refs = []
        # Match patterns like [doc_name], .md file references, etc.
        patterns = [
            r'\[([A-Z_\-]+\.md)\]',
            r'([A-Z_\-]+\.md)',
            r'luzia docs (\S+)',
        ]
        for pattern in patterns:
            refs.extend(re.findall(pattern, text, re.IGNORECASE))
        return list(set(refs))

    def get_skill_distribution(self) -> Dict[str, int]:
        """Get distribution of skills across all tasks."""
        return dict(self.skills_detected)

    def get_project_skill_usage(self) -> Dict[str, Dict[str, int]]:
        """Get skill usage breakdown by project."""
        result = {}

        # Analyze job logs
        for job_dir in self.JOB_LOG_BASE.glob("*/meta.json"):
            try:
                meta = json.loads(job_dir.read_text())
                project = meta.get("project", "unknown")
                skill = meta.get("skill")

                if skill:
                    if project not in result:
                        result[project] = {}
                    result[project][skill] = result[project].get(skill, 0) + 1

            except (json.JSONDecodeError, IOError):
                pass

        return result

    def generate_report(self) -> Dict[str, Any]:
        """Generate comprehensive usage report."""
        return {
            "timestamp": datetime.now().isoformat(),
            "queue_analysis": self.analyze_queue_entries(),
            "job_analysis": self.analyze_job_metadata(),
            "skill_detection": self.detect_skills_in_tasks(),
            "doc_analysis": self.analyze_documentation_usage(),
            "skill_distribution": self.get_skill_distribution(),
            "project_skill_usage": self.get_project_skill_usage(),
            "summary": {
                "total_unique_skills": len(self.skills_detected),
                "most_used_skill": max(self.skills_detected, key=self.skills_detected.get)
                if self.skills_detected else None,
                "skill_usage_stats": dict(self.skills_detected),
            }
        }

    def save_report(self, filepath: str) -> None:
        """Save report to file."""
        report = self.generate_report()
        with open(filepath, 'w') as f:
            json.dump(report, f, indent=2)
        print(f"Report saved to {filepath}")

    def print_summary(self) -> None:
        """Print summary of findings."""
        queue_analysis = self.analyze_queue_entries()
        job_analysis = self.analyze_job_metadata()
        skill_dist = self.get_skill_distribution()
        project_usage = self.get_project_skill_usage()

        print("\n" + "="*70)
        print("LUZIA SKILL & DOCUMENTATION USAGE REPORT")
        print("="*70)

        print("\n📋 QUEUE ANALYSIS")
        print(f" Total pending tasks: {queue_analysis['total_tasks']}")
        print(f" Tasks with skill match: {queue_analysis['tasks_with_skill']}")
        print(f" High priority: {queue_analysis['by_priority'].get('high', 0)}")
        print(f" Normal priority: {queue_analysis['by_priority'].get('normal', 0)}")

        if queue_analysis['skills_found']:
            print("\n Skills in queue:")
            for skill, count in queue_analysis['skills_found'].items():
                print(f" - {skill}: {count}")

        print("\n📊 JOB EXECUTION ANALYSIS (Last 24h)")
        print(f" Total jobs: {job_analysis['total_jobs']}")
        print(f" Jobs with skill: {job_analysis['jobs_with_skill']}")
        print(f" Debug mode tasks: {job_analysis['debug_mode_tasks']}")

        if job_analysis['skills_used']:
            print("\n Skills executed:")
            for skill, count in job_analysis['skills_used'].items():
                print(f" - {skill}: {count}")

        print("\n📈 PROJECT SKILL DISTRIBUTION")
        for project, skills in project_usage.items():
            print(f" {project}:")
            for skill, count in skills.items():
                print(f" - {skill}: {count}")

        if skill_dist:
            print("\n🎯 SKILL USAGE STATISTICS")
            total = sum(skill_dist.values())
            for skill, count in sorted(skill_dist.items(), key=lambda x: x[1], reverse=True):
                pct = (count / total * 100) if total > 0 else 0
                print(f" {skill}: {count} ({pct:.1f}%)")

        print("\n" + "="*70 + "\n")

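# Command-line usage, as handled by main() below (the script name and output
# path are illustrative; any path accepted by open() works):
#   skill_usage_analyzer.py                        # print human-readable summary
#   skill_usage_analyzer.py json                   # dump the full report as JSON
#   skill_usage_analyzer.py save /tmp/report.json  # write the report to a file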
def main():
    """Main entry point."""
    import sys

    analyzer = SkillUsageAnalyzer()

    if len(sys.argv) > 1:
        if sys.argv[1] == "json":
            report = analyzer.generate_report()
            print(json.dumps(report, indent=2))
        elif sys.argv[1] == "save" and len(sys.argv) > 2:
            analyzer.save_report(sys.argv[2])
        else:
            analyzer.print_summary()
    else:
        analyzer.print_summary()


if __name__ == "__main__":
    main()