luzia/lib/web_search_integrator.py

#!/usr/bin/env python3
"""
Web Search Integrator - Context enhancement via web search

Features:
1. Detect when web search would be helpful
2. Query Stack Overflow for solutions
3. Fetch and summarize reference docs
4. Track learned solutions
5. Integrate references into prompts
"""

import json
import re
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
from dataclasses import dataclass, asdict

@dataclass
class WebReference:
    """A reference found via web search"""
    title: str
    url: str
    source: str  # stackoverflow, docs, blog, etc
    snippet: str
    relevance: float  # 0-1 score
    topic: str
    found_at: str

@dataclass
class LearningResult:
    """A solution learned from web search"""
    problem: str
    solution: str
    references: List[str]
    tags: List[str]
    learned_at: str
    confidence: float  # How confident in this solution

class WebSearchIntegrator:
    """Integrates web search for context enhancement"""

    def __init__(self, cache_dir: Optional[Path] = None):
        """Initialize web search integrator

        Args:
            cache_dir: Optional directory for caching search results
        """
        self.cache_dir = cache_dir or Path("/tmp/.luzia-web-cache")
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.learning_db: List[LearningResult] = []
        self.search_history: List[Dict[str, Any]] = []
        self.load_learning_db()

    def load_learning_db(self) -> None:
        """Load learned solutions from cache"""
        db_file = self.cache_dir / "learning.json"
        if db_file.exists():
            try:
                data = json.loads(db_file.read_text())
                self.learning_db = [LearningResult(**item) for item in data.get("learned", [])]
            except Exception as e:
                print(f"[Warning] Failed to load learning DB: {e}")

    def save_learning_db(self) -> None:
        """Save learned solutions to cache"""
        db_file = self.cache_dir / "learning.json"
        db_file.write_text(json.dumps({
            "learned": [asdict(item) for item in self.learning_db],
            "timestamp": datetime.now().isoformat()
        }, indent=2))

    def should_search(self, task: str, error: Optional[str] = None) -> Tuple[bool, str]:
        """Determine if web search would be helpful

        Args:
            task: Task description
            error: Optional error message

        Returns:
            Tuple of (should_search, search_query)
        """
        search_triggers = [
            # Error investigation
            (r"error|exception|failed|problem", "error_investigation"),
            # How-to tasks
            (r"how\s+to|guide|tutorial|learn", "how_to"),
            # Library/tool questions
            (r"npm|pip|cargo|ruby", "package_mgmt"),
            # Framework questions
            (r"react|vue|angular|django|flask", "framework"),
            # Integration/setup
            (r"integrate|setup|configure|install", "setup"),
            # Best practices
            (r"best practice|pattern|architecture", "architecture"),
        ]

        combined = f"{task} {error or ''}".lower()

        for pattern, category in search_triggers:
            if re.search(pattern, combined):
                # Extract search query
                if "error" in combined:
                    # For errors, extract the error message
                    search_query = re.sub(r".*error.*?:\s*", "", error or task)[:80]
                else:
                    search_query = task[:100]

                return True, search_query

        return False, ""

    def find_stackoverflow_answer(self, query: str) -> Optional[WebReference]:
        """Find Stack Overflow answer for query

        This is a reference implementation. In production, would use
        Stack Overflow API or web search.

        Args:
            query: Search query

        Returns:
            Best matching reference, or None
        """
        # In actual implementation, would call web search API
        # For now, return structure for documentation
        return WebReference(
            title="Relevant Stack Overflow Answer",
            url="https://stackoverflow.com/search?q=...",
            source="stackoverflow",
            snippet="[Search result snippet would appear here]",
            relevance=0.8,
            topic=query,
            found_at=datetime.now().isoformat()
        )

    def fetch_documentation(self, library: str, topic: str) -> Optional[WebReference]:
        """Fetch documentation for a library/topic

        Args:
            library: Library name (npm package, python module, etc)
            topic: Specific topic within library

        Returns:
            Reference to documentation, or None
        """
        # Common documentation URLs
        doc_patterns = {
            "react": "https://react.dev/reference/",
            "nodejs": "https://nodejs.org/api/",
            "python": "https://docs.python.org/3/",
            "typescript": "https://www.typescriptlang.org/docs/",
            "rust": "https://doc.rust-lang.org/",
            "django": "https://docs.djangoproject.com/",
            "flask": "https://flask.palletsprojects.com/",
        }

        base_url = doc_patterns.get(library.lower())
        if not base_url:
            return None

        return WebReference(
            title=f"{library} Documentation - {topic}",
            url=f"{base_url}{topic}/",
            source="official_docs",
            snippet=f"Official documentation for {library} {topic}",
            relevance=0.95,
            topic=topic,
            found_at=datetime.now().isoformat()
        )

    def detect_tech_stack(self, task: str) -> List[str]:
        """Detect technology stack from task description

        Args:
            task: Task description

        Returns:
            List of detected technologies
        """
        tech_patterns = {
            "React": r"react|jsx",
            "TypeScript": r"typescript|\.ts",
            "Node.js": r"node|npm|javascript",
            "Python": r"python|pip|py",
            "Rust": r"rust|cargo",
            "Docker": r"docker|container",
            "PostgreSQL": r"postgres|sql",
            "MongoDB": r"mongo|mongodb",
            "Redis": r"redis",
            "Kubernetes": r"k8s|kubernetes",
            "GraphQL": r"graphql|apollo",
            "REST": r"rest|api",
            "WebSocket": r"websocket|ws",
        }

        detected = []
        task_lower = task.lower()

        for tech, pattern in tech_patterns.items():
            if re.search(pattern, task_lower):
                detected.append(tech)

        return detected

    def generate_context_section(self, references: List[WebReference]) -> str:
        """Generate a context section with web references

        Args:
            references: List of web references

        Returns:
            Markdown section to add to prompt
        """
        if not references:
            return ""

        sections = ["# Web References and Context\n"]

        for ref in references:
            sections.append(f"\n## {ref.title}")
            sections.append(f"**Source:** {ref.source}")
            sections.append(f"**URL:** {ref.url}")
            sections.append(f"**Relevance:** {ref.relevance:.1%}")
            sections.append(f"\n{ref.snippet}\n")

        return "\n".join(sections)

    def learn_solution(self, problem: str, solution: str,
                      references: List[str], tags: List[str],
                      confidence: float = 0.8) -> None:
        """Record a learned solution for future reference

        Args:
            problem: Problem description
            solution: Solution description
            references: List of reference URLs
            tags: Topic tags
            confidence: Confidence in this solution (0-1)
        """
        learning = LearningResult(
            problem=problem,
            solution=solution,
            references=references,
            tags=tags,
            learned_at=datetime.now().isoformat(),
            confidence=confidence
        )
        self.learning_db.append(learning)
        self.save_learning_db()

    def search_learned_solutions(self, query: str) -> List[LearningResult]:
        """Search previously learned solutions

        Args:
            query: Search query

        Returns:
            List of matching learned solutions
        """
        matches = []
        query_lower = query.lower()

        for result in self.learning_db:
            # Search in problem, solution, and tags
            if (query_lower in result.problem.lower() or
                query_lower in result.solution.lower() or
                any(query_lower in tag.lower() for tag in result.tags)):
                matches.append(result)

        # Sort by confidence and recency
        matches.sort(
            key=lambda r: (r.confidence, datetime.fromisoformat(r.learned_at)),
            reverse=True
        )

        return matches

    def get_reference_for_technology(self, tech: str) -> Optional[WebReference]:
        """Get reference documentation for a technology

        Args:
            tech: Technology name

        Returns:
            Reference to documentation
        """
        refs = {
            "React": self.fetch_documentation("react", "introduction"),
            "TypeScript": self.fetch_documentation("typescript", "handbook"),
            "Node.js": self.fetch_documentation("nodejs", "api"),
            "Python": self.fetch_documentation("python", "tutorial"),
            "Docker": WebReference(
                title="Docker Documentation",
                url="https://docs.docker.com/",
                source="official_docs",
                snippet="Official Docker documentation",
                relevance=1.0,
                topic="Docker",
                found_at=datetime.now().isoformat()
            ),
        }
        return refs.get(tech)

    def generate_research_prompt(self, task: str, tech_stack: List[str],
                               error: Optional[str] = None) -> str:
        """Generate a prompt for web research

        Args:
            task: Task description
            tech_stack: List of technologies involved
            error: Optional error message

        Returns:
            Research prompt
        """
        sections = [
            f"# Research Task\n",
            f"**Task:** {task}\n",
        ]

        if error:
            sections.append(f"**Error:** {error}\n")

        if tech_stack:
            sections.append(f"**Technologies:** {', '.join(tech_stack)}\n")

        # Learned solutions
        learned = self.search_learned_solutions(task)
        if learned:
            sections.append("\n## Previously Learned Solutions\n")
            for i, result in enumerate(learned[:3], 1):
                sections.append(f"{i}. **{result.problem}**")
                sections.append(f"   - Solution: {result.solution}")
                sections.append(f"   - Tags: {', '.join(result.tags)}")
                sections.append(f"   - Confidence: {result.confidence:.0%}\n")

        sections.append("\n## Research Approach\n")
        sections.append("1. Check previously learned solutions")
        sections.append("2. Search Stack Overflow for similar issues")
        sections.append("3. Check official documentation")
        sections.append("4. Look for blog posts or tutorials")
        sections.append("5. Synthesize findings into solution")

        return "\n".join(sections)

    def export_learning_data(self, output_path: Path) -> None:
        """Export learning database for analysis

        Args:
            output_path: Path to write export to
        """
        export_data = {
            "total_learned": len(self.learning_db),
            "by_topic": {},
            "average_confidence": 0,
            "solutions": [asdict(item) for item in self.learning_db]
        }

        # Calculate statistics
        if self.learning_db:
            export_data["average_confidence"] = (
                sum(r.confidence for r in self.learning_db) / len(self.learning_db)
            )

            # Group by tags
            by_topic = {}
            for result in self.learning_db:
                for tag in result.tags:
                    if tag not in by_topic:
                        by_topic[tag] = 0
                    by_topic[tag] += 1
            export_data["by_topic"] = by_topic

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(json.dumps(export_data, indent=2))

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about web search usage

        Returns:
            Statistics dict
        """
        if not self.learning_db:
            return {
                "total_learned": 0,
                "average_confidence": 0,
                "searches_performed": len(self.search_history)
            }

        avg_confidence = sum(r.confidence for r in self.learning_db) / len(self.learning_db)

        return {
            "total_learned": len(self.learning_db),
            "average_confidence": avg_confidence,
            "searches_performed": len(self.search_history),
            "topics": list(set(
                tag for result in self.learning_db
                for tag in result.tags
            ))
        }