"""
Project Knowledge Loader - Per-project RAG context injection for Luzia.

Industry Standard Implementation:
- .knowledge/ directory in each project (similar to LlamaIndex storage/,
  LangChain vector_store/)
- entities.json: Project-specific facts and definitions
- relations.json: Connections between concepts
- context.md: Human-readable project context (like CLAUDE.md)
- vectors/: Optional embeddings for semantic search

Usage:
    from project_knowledge_loader import ProjectKnowledgeLoader
    loader = ProjectKnowledgeLoader()
    context = loader.format_for_prompt("musica", task_query)
"""

import json
import os
import sqlite3
import logging
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)


# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class KnowledgeEntity:
    """Single knowledge entity in project KG."""
    id: str
    name: str
    type: str  # component, api, config, pattern, decision, etc.
    description: str
    tags: List[str]
    metadata: Dict[str, Any]
    created_at: str = ""
    updated_at: str = ""


@dataclass
class KnowledgeRelation:
    """Relationship between entities."""
    source: str
    relation: str  # uses, depends_on, implements, extends, etc.
    target: str
    context: str = ""
    weight: float = 1.0


@dataclass
class ProjectKnowledge:
    """Complete knowledge for a project."""
    project: str
    version: str
    entities: List[KnowledgeEntity]
    relations: List[KnowledgeRelation]
    context_md: str  # Human-readable context
    last_updated: str


# =============================================================================
# KNOWLEDGE STRUCTURE TEMPLATE
# =============================================================================

KNOWLEDGE_TEMPLATE = {
    "version": "1.0",
    "project": "",
    "description": "",
    "entities": [
        {
            "id": "project_root",
            "name": "Project Root",
            "type": "component",
            "description": "Main project structure",
            "tags": ["structure"],
            "metadata": {}
        }
    ],
    "relations": [],
    "context": {
        "focus": "",
        "tech_stack": [],
        "conventions": [],
        "important_files": [],
        "common_tasks": []
    }
}

CONTEXT_MD_TEMPLATE = """# {project} Project Knowledge

## Overview
{description}

## Tech Stack
{tech_stack}

## Key Directories
{directories}

## Common Tasks
{tasks}

## Important Patterns
{patterns}

---
*Auto-generated by Luzia. Edit to customize project context.*
"""


# =============================================================================
# PROJECT KNOWLEDGE LOADER
# =============================================================================

class ProjectKnowledgeLoader:
    """Load and manage per-project knowledge graphs."""

    # Standard paths Luzia expects in each project
    KNOWLEDGE_DIR = ".knowledge"
    ENTITIES_FILE = "entities.json"
    RELATIONS_FILE = "relations.json"
    CONTEXT_FILE = "context.md"
    KG_DB_FILE = "knowledge.db"

    def __init__(self, config_path: str = "/opt/server-agents/orchestrator/config.json"):
        """Read the project registry from ``config_path`` and start with an empty cache."""
        self.config_path = config_path
        self.projects = self._load_projects()
        self._cache: Dict[str, ProjectKnowledge] = {}
        logger.debug("ProjectKnowledgeLoader initialized with %d projects", len(self.projects))

    def _load_projects(self) -> Dict[str, Dict]:
        """Load project configurations from Luzia config.

        Returns the ``"projects"`` mapping of the JSON config, or an empty
        dict when the file cannot be read or parsed, when the top-level value
        is not an object, or when ``"projects"`` is missing or null.
        """
        try:
            with open(self.config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        except Exception as e:
            # Best effort: a missing or broken config simply means "no projects".
            logger.warning(f"Could not load config: {e}")
            return {}

        if not isinstance(config, dict):
            logger.warning("Could not load config: top-level JSON value is not an object")
            return {}
        # `or {}` guards against an explicit `"projects": null`.
        return config.get("projects") or {}

    @staticmethod
    def _path_exists(path: Optional[Path]) -> bool:
        """Return True if ``path`` exists; False for None or an inaccessible path."""
        if path is None:
            return False
        try:
            return path.exists()
        except PermissionError:
            # Can't access the directory (not our project)
            return False

    @staticmethod
    def _build_records(record_cls, raw_records, project: str) -> list:
        """Instantiate ``record_cls`` for each raw dict, skipping malformed ones.

        A record with missing or unexpected fields raises ``TypeError`` from
        the dataclass constructor; it is logged and skipped so that one bad
        record does not discard the rest of the project's knowledge.
        """
        records = []
        for raw in raw_records or []:
            try:
                records.append(record_cls(**raw))
            except TypeError as e:
                logger.warning(f"Skipping malformed {record_cls.__name__} in {project}: {e}")
        return records

    def get_knowledge_path(self, project: str) -> Optional[Path]:
        """Get the .knowledge/ path for a project.

        Returns None for a project that is not in the loaded config. The
        project root falls back to ``/home/<project>`` when the config entry
        has no ``"path"``.
        """
        if project not in self.projects:
            return None

        project_path = self.projects[project].get("path", f"/home/{project}")
        return Path(project_path) / self.KNOWLEDGE_DIR

    def has_knowledge(self, project: str) -> bool:
        """Check if a project has a .knowledge/ directory."""
        return self._path_exists(self.get_knowledge_path(project))

    def load_project_knowledge(self, project: str, force_reload: bool = False) -> Optional[ProjectKnowledge]:
        """Load all knowledge for a project.

        Args:
            project: Project name as it appears in the config.
            force_reload: Bypass the in-memory cache and re-read the files.

        Returns:
            The project's knowledge, or None when the project is unknown, has
            no accessible .knowledge/ directory, or its files cannot be read.
        """
        # Check cache
        if not force_reload and project in self._cache:
            return self._cache[project]

        kg_path = self.get_knowledge_path(project)
        if not self._path_exists(kg_path):
            logger.debug(f"No .knowledge/ for project {project}")
            return None

        try:
            version = "1.0"

            # Load entities
            entities: List[KnowledgeEntity] = []
            entities_file = kg_path / self.ENTITIES_FILE
            if entities_file.exists():
                with open(entities_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                # Prefer the version recorded in the file over the default.
                version = str(data.get("version", version))
                entities = self._build_records(KnowledgeEntity, data.get("entities", []), project)

            # Load relations
            relations: List[KnowledgeRelation] = []
            relations_file = kg_path / self.RELATIONS_FILE
            if relations_file.exists():
                with open(relations_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                relations = self._build_records(KnowledgeRelation, data.get("relations", []), project)

            # Load context.md
            context_md = ""
            context_file = kg_path / self.CONTEXT_FILE
            if context_file.exists():
                with open(context_file, 'r', encoding='utf-8') as f:
                    context_md = f.read()

            knowledge = ProjectKnowledge(
                project=project,
                version=version,
                entities=entities,
                relations=relations,
                context_md=context_md,
                last_updated=datetime.now().isoformat()
            )

            # Cache it
            self._cache[project] = knowledge
            logger.debug(
                "Loaded knowledge for %s: %d entities, %d relations",
                project, len(entities), len(relations),
            )
            return knowledge

        except Exception as e:
            logger.warning(f"Error loading knowledge for {project}: {e}")
            return None

    def search_project_knowledge(self, project: str, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Search project knowledge for relevant context.

        Scoring per entity: +3 if the query is a substring of the name, +2 if
        it is a substring of the description, +1 per tag that contains or is
        contained in the query, and +0.5 per word shared between the query
        and the name/description. Entities with a score of 0 are dropped.

        Args:
            project: Project name.
            query: Free-text query. A blank query yields no results, because
                an empty string is a substring of every field and would
                otherwise match every entity.
            top_k: Maximum number of results.

        Returns:
            Result dicts sorted by descending ``"relevance"``.
        """
        query_lower = (query or "").strip().lower()
        if not query_lower:
            return []

        knowledge = self.load_project_knowledge(project)
        if not knowledge:
            return []

        results = []
        query_words = set(query_lower.split())

        # Score entities by relevance
        for entity in knowledge.entities:
            score = 0
            name_lower = entity.name.lower()
            description_lower = entity.description.lower()

            # Check name match
            if query_lower in name_lower:
                score += 3

            # Check description match
            if query_lower in description_lower:
                score += 2

            # Check tag matches
            for tag in entity.tags:
                tag_lower = tag.lower()
                # An empty tag is a substring of any query; ignore it.
                if tag_lower and (tag_lower in query_lower or query_lower in tag_lower):
                    score += 1

            # Word overlap scoring
            entity_words = set(name_lower.split() + description_lower.split())
            score += len(query_words & entity_words) * 0.5

            if score > 0:
                results.append({
                    "entity_id": entity.id,
                    "name": entity.name,
                    "type": entity.type,
                    "description": entity.description,
                    "tags": entity.tags,
                    "relevance": score,
                    "source": "project_kg"
                })

        # Sort by relevance and return top_k
        results.sort(key=lambda x: x["relevance"], reverse=True)
        return results[:top_k]

    def get_related_entities(self, project: str, entity_id: str, depth: int = 1) -> List[Dict[str, Any]]:
        """Get entities related to a given entity.

        Args:
            project: Project name.
            entity_id: Id of the entity to start from.
            depth: Number of relation hops to follow. Values below 1 are
                treated as 1 (direct relations only).

        Returns:
            One dict per traversed relation, in hop order, holding the
            neighbouring entity (as a dict), the relation name, and the
            direction ("outgoing" when the relation's source is the nearer
            entity, "incoming" otherwise). Each relation is reported at most
            once; relations whose other end is not a known entity are skipped.
        """
        knowledge = self.load_project_knowledge(project)
        if not knowledge:
            return []

        entity_map = {e.id: e for e in knowledge.entities}
        related: List[Dict[str, Any]] = []

        visited = {entity_id}
        frontier = {entity_id}
        reported = set()  # indices of relations already reported or discarded

        for _ in range(max(depth, 1)):
            next_frontier = set()
            for idx, relation in enumerate(knowledge.relations):
                if idx in reported:
                    continue
                if relation.source in frontier:
                    neighbour_id, direction = relation.target, "outgoing"
                elif relation.target in frontier:
                    neighbour_id, direction = relation.source, "incoming"
                else:
                    continue

                reported.add(idx)
                neighbour = entity_map.get(neighbour_id)
                if neighbour is None:
                    continue

                related.append({
                    "entity": asdict(neighbour),
                    "relation": relation.relation,
                    "direction": direction
                })
                if neighbour_id not in visited:
                    visited.add(neighbour_id)
                    next_frontier.add(neighbour_id)

            if not next_frontier:
                break
            frontier = next_frontier

        return related

    def format_for_prompt(self, project: str, query: str, max_tokens: int = 2000) -> str:
        """Format project knowledge for prompt injection.

        Combines the first 1000 characters of context.md with up to five
        entities relevant to ``query``. The result is cut at roughly
        ``max_tokens * 4`` characters (a rough char-to-token ratio) and a
        truncation marker is appended. Returns "" when the project has no
        knowledge.
        """
        knowledge = self.load_project_knowledge(project)
        if not knowledge:
            return ""

        sections = []

        # Add context.md summary (prioritize human-written context)
        if knowledge.context_md:
            context_preview = knowledge.context_md[:1000]
            if len(knowledge.context_md) > 1000:
                context_preview += "\n..."
            sections.append(f"## Project Context\n{context_preview}")

        # Add relevant entities based on query
        relevant = self.search_project_knowledge(project, query, top_k=5)
        if relevant:
            bullet_lines = [
                f"- **{item['name']}** ({item['type']}): {item['description'][:100]}\n"
                for item in relevant
            ]
            sections.append("## Relevant Project Knowledge\n" + "".join(bullet_lines))

        result = "\n\n".join(sections)

        # Truncate if needed
        char_budget = max_tokens * 4  # rough char to token ratio
        if len(result) > char_budget:
            result = result[:char_budget] + "\n...(truncated)"

        return result

    def initialize_project_knowledge(self, project: str, overwrite: bool = False) -> bool:
        """Create .knowledge/ directory with template files for a project.

        Writes entities.json, relations.json and context.md. Returns False
        for an unknown project, when the directory already exists and
        ``overwrite`` is False, or when writing fails.
        """
        kg_path = self.get_knowledge_path(project)
        if not kg_path:
            logger.error(f"Unknown project: {project}")
            return False

        if self._path_exists(kg_path) and not overwrite:
            logger.info(f"Knowledge already exists for {project}. Use overwrite=True to replace.")
            return False

        try:
            # Create directory
            kg_path.mkdir(parents=True, exist_ok=True)

            # Get project info
            project_config = self.projects.get(project, {})
            description = project_config.get("description", "")
            focus = project_config.get("focus", "")

            # Create entities.json
            now = datetime.now().isoformat()
            entities_data = {
                "version": "1.0",
                "project": project,
                "entities": [
                    {
                        "id": "project_overview",
                        "name": project,
                        "type": "project",
                        "description": description or f"{project} project",
                        "tags": ["root", "overview"],
                        "metadata": {"focus": focus},
                        "created_at": now,
                        "updated_at": now
                    }
                ]
            }
            with open(kg_path / self.ENTITIES_FILE, 'w', encoding='utf-8') as f:
                json.dump(entities_data, f, indent=2)

            # Create relations.json
            relations_data = {
                "version": "1.0",
                "project": project,
                "relations": []
            }
            with open(kg_path / self.RELATIONS_FILE, 'w', encoding='utf-8') as f:
                json.dump(relations_data, f, indent=2)

            # Create context.md
            context_content = CONTEXT_MD_TEMPLATE.format(
                project=project,
                description=description or "Project description here",
                tech_stack="- Add tech stack items",
                directories="- /src - Source code\n- /docs - Documentation",
                tasks="- Build: `npm run build`\n- Test: `npm test`",
                patterns="- Add important patterns and conventions"
            )
            with open(kg_path / self.CONTEXT_FILE, 'w', encoding='utf-8') as f:
                f.write(context_content)

            logger.info(f"Initialized .knowledge/ for {project} at {kg_path}")
            return True

        except Exception as e:
            logger.error(f"Failed to initialize knowledge for {project}: {e}")
            return False

    def sync_from_claude_md(self, project: str) -> bool:
        """Sync knowledge from existing CLAUDE.md file.

        Copies ``<project path>/CLAUDE.md`` into .knowledge/context.md under
        a short generated header. Returns False when the project has no
        configured path, has no accessible CLAUDE.md, or the copy fails.
        """
        project_path = self.projects.get(project, {}).get("path")
        if not project_path:
            return False

        claude_md_path = Path(project_path) / "CLAUDE.md"
        if not self._path_exists(claude_md_path):
            logger.debug(f"No CLAUDE.md found for {project}")
            return False

        kg_path = self.get_knowledge_path(project)
        if not kg_path:
            return False

        try:
            # Ensure .knowledge/ exists
            kg_path.mkdir(parents=True, exist_ok=True)

            # Read CLAUDE.md
            with open(claude_md_path, 'r', encoding='utf-8') as f:
                claude_content = f.read()

            # Write to context.md (preserving CLAUDE.md content)
            context_file = kg_path / self.CONTEXT_FILE
            with open(context_file, 'w', encoding='utf-8') as f:
                f.write(f"# {project} Project Knowledge\n\n")
                f.write("*Synced from CLAUDE.md*\n\n")
                f.write(claude_content)

            logger.info(f"Synced CLAUDE.md to .knowledge/context.md for {project}")
            return True

        except Exception as e:
            logger.error(f"Failed to sync CLAUDE.md for {project}: {e}")
            return False

    def list_projects_with_knowledge(self) -> List[Dict[str, Any]]:
        """List all projects and their knowledge status.

        Each entry carries the project name, path, description, whether a
        .knowledge/ directory exists and where; when the knowledge loads, it
        also carries entity/relation counts and whether context.md is
        non-empty.
        """
        results = []

        for project, config in self.projects.items():
            has_kg = self.has_knowledge(project)
            kg_path = self.get_knowledge_path(project)

            info = {
                "project": project,
                "path": config.get("path", ""),
                "description": config.get("description", ""),
                "has_knowledge": has_kg,
                "knowledge_path": str(kg_path) if kg_path else None
            }

            if has_kg:
                knowledge = self.load_project_knowledge(project)
                if knowledge:
                    info["entity_count"] = len(knowledge.entities)
                    info["relation_count"] = len(knowledge.relations)
                    info["has_context_md"] = bool(knowledge.context_md)

            results.append(info)

        return results


# =============================================================================
# RAG CONTEXT BUILDER (For prompt injection)
# =============================================================================

class ProjectRAGContext:
    """Build RAG-enhanced context for task dispatch."""

    def __init__(self):
        self.loader = ProjectKnowledgeLoader()

    def build_context(self, project: str, task: str, include_global: bool = True) -> Dict[str, Any]:
        """
        Build complete RAG context for a project task.

        Returns:
            {
                "project_context": str,          # Formatted project knowledge
                "relevant_entities": List[Dict], # Relevant knowledge items
                "context_source": str,           # "project_kg", "global_kg", "hybrid", "none"
                "metadata": Dict                 # Additional context info
            }
        """
        result = {
            "project_context": "",
            "relevant_entities": [],
            "context_source": "none",
            "metadata": {}
        }

        # Try to load project-specific knowledge
        project_context = self.loader.format_for_prompt(project, task)
        if project_context:
            result["project_context"] = project_context
            result["relevant_entities"] = self.loader.search_project_knowledge(project, task)
            result["context_source"] = "project_kg"
            result["metadata"]["project"] = project
            result["metadata"]["entities_found"] = len(result["relevant_entities"])

        # Optionally include global knowledge (from /etc/luz-knowledge/)
        if include_global:
            try:
                # Imported lazily: the global retriever is optional, and any
                # failure (including a missing module) is only logged.
                from langchain_kg_retriever import KnowledgeGraphRetriever
                global_retriever = KnowledgeGraphRetriever()
                global_results = global_retriever.retrieve(f"{project} {task}", top_k=3)

                if global_results:
                    global_text = "\n## Global Knowledge\n"
                    for item in global_results:
                        global_text += f"- {item['name']}: {item.get('content', '')[:100]}\n"

                    result["project_context"] += global_text
                    result["metadata"]["global_results"] = len(global_results)

                    if result["context_source"] == "none":
                        result["context_source"] = "global_kg"
                    else:
                        result["context_source"] = "hybrid"

            except Exception as e:
                logger.debug(f"Global KG retrieval failed: {e}")

        return result


# =============================================================================
# CLI INTERFACE
# =============================================================================

# Minimum number of positional arguments each CLI command needs after its name.
_CLI_MIN_ARGS = {
    "list": 0,
    "init": 1,
    "sync": 1,
    "search": 2,
    "context": 2,
    "init-all": 0,
}


def _print_usage():
    """Print the CLI help text."""
    print("Usage: project_knowledge_loader.py <command> [args]")
    print("")
    print("Commands:")
    print("  list                      - List all projects and knowledge status")
    print("  init <project>            - Initialize .knowledge/ for a project")
    print("  sync <project>            - Sync from CLAUDE.md")
    print("  search <project> <query>  - Search project knowledge")
    print("  context <project> <task>  - Get RAG context for a task")
    print("  init-all                  - Initialize knowledge for all projects")


def main():
    """CLI for project knowledge management."""
    import sys

    loader = ProjectKnowledgeLoader()

    if len(sys.argv) < 2:
        _print_usage()
        return

    command = sys.argv[1]
    args = sys.argv[2:]

    if command not in _CLI_MIN_ARGS:
        print(f"Unknown command: {command}")
        print("Run without args for help")
        return

    # A known command with too few arguments is a usage error, not an
    # unknown command.
    if len(args) < _CLI_MIN_ARGS[command]:
        print(f"Missing arguments for command: {command}")
        print("Run without args for help")
        return

    if command == "list":
        projects = loader.list_projects_with_knowledge()
        print(f"\n{'Project':<15} {'Has KG':<10} {'Entities':<10} {'Description'}")
        print("-" * 70)
        for p in projects:
            has_kg = "Yes" if p["has_knowledge"] else "No"
            entities = p.get("entity_count", "-")
            print(f"{p['project']:<15} {has_kg:<10} {str(entities):<10} {p['description'][:30]}")

    elif command == "init":
        project = args[0]
        if loader.initialize_project_knowledge(project):
            print(f"Initialized .knowledge/ for {project}")
        else:
            print(f"Failed to initialize knowledge for {project}")

    elif command == "sync":
        project = args[0]
        if loader.sync_from_claude_md(project):
            print(f"Synced CLAUDE.md to .knowledge/ for {project}")
        else:
            print("Failed to sync (no CLAUDE.md or error)")

    elif command == "search":
        project = args[0]
        query = " ".join(args[1:])
        results = loader.search_project_knowledge(project, query)
        print(f"\nSearch results for '{query}' in {project}:")
        for r in results:
            print(f"  - {r['name']} ({r['type']}): {r['description'][:50]}... [score: {r['relevance']:.2f}]")

    elif command == "context":
        project = args[0]
        task = " ".join(args[1:])
        rag = ProjectRAGContext()
        context = rag.build_context(project, task)
        print(f"\nRAG Context for {project} - '{task}':")
        print(f"Source: {context['context_source']}")
        print(f"Entities found: {len(context['relevant_entities'])}")
        print("\n--- Context ---")
        print(context['project_context'][:2000])

    elif command == "init-all":
        for project in loader.projects:
            if loader.has_knowledge(project):
                print(f"Skipped (exists): {project}")
            elif loader.initialize_project_knowledge(project):
                print(f"Initialized: {project}")
            else:
                # Report failures instead of claiming success.
                print(f"Failed: {project}")


if __name__ == "__main__":
    main()