Based on claude-code-tools TmuxCLIController, this refactor:
- Added DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:
- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
QA Postflight - Post-task validation and learning capture

Runs after each task completes to:
1. Validate task output quality
2. Detect common error patterns
3. Capture learnings for the knowledge graph
4. Generate QA report
"""

import json
import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, Any, List, Optional

# Configure logging
log_dir = Path("/var/log/luz-orchestrator")
log_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Paths
JOBS_DIR = Path("/var/log/luz-orchestrator/jobs")
QA_REPORTS_DIR = Path("/var/log/luz-orchestrator/qa-reports")
LEARNING_LOG = log_dir / "learning-captures.jsonl"
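
# Expected on-disk layout, inferred from the reads below (a sketch; the
# orchestrator, not this module, is assumed to produce these files):
#
#   /var/log/luz-orchestrator/jobs/<job_id>/
#       output.log   # captured agent output, scanned for patterns
#       meta.json    # e.g. {"project": "...", "task": "..."}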


class QAPostflight:
    """Post-task QA validation and learning capture."""

    # Error patterns to detect: (regex, error type, severity)
    ERROR_PATTERNS = [
        (r"error:|Error:|ERROR:", "error_detected", "high"),
        (r"exception:|Exception:|EXCEPTION:", "exception_detected", "high"),
        (r"failed|Failed|FAILED", "failure_detected", "medium"),
        (r"permission denied|Permission denied", "permission_error", "high"),
        (r"not found|Not found|NOT FOUND", "not_found_error", "medium"),
        (r"timeout|Timeout|TIMEOUT", "timeout_error", "high"),
        (r"connection refused|Connection refused", "connection_error", "high"),
        (r"syntax error|SyntaxError", "syntax_error", "high"),
        (r"import error|ImportError|ModuleNotFoundError", "import_error", "high"),
        (r"GOOGLE_KEY not configured|API.*not configured", "config_error", "medium"),
    ]

    # Success patterns: (regex, signal type)
    SUCCESS_PATTERNS = [
        (r"completed successfully|task completed|done", "success_signal"),
        (r"tests? passed|all.*pass", "tests_passed"),
        (r"deployed|deployment.*success", "deployment_success"),
        (r"created|updated|fixed", "action_completed"),
    ]

    # Learning extraction patterns: (regex with one capture group, learning type)
    LEARNING_PATTERNS = [
        (r"learned?:?\s*(.+?)(?:\n|$)", "explicit_learning"),
        (r"solution:?\s*(.+?)(?:\n|$)", "solution_found"),
        (r"fixed by:?\s*(.+?)(?:\n|$)", "fix_applied"),
        (r"root cause:?\s*(.+?)(?:\n|$)", "root_cause"),
        (r"workaround:?\s*(.+?)(?:\n|$)", "workaround"),
    ]

    def __init__(self):
        QA_REPORTS_DIR.mkdir(parents=True, exist_ok=True)

    def validate_task(self, job_id: str) -> Dict[str, Any]:
        """
        Run full postflight validation on a completed task.

        Returns a validation report with:
        - exit_code analysis
        - error detection
        - success signals
        - quality score
        - extracted learnings
        """
        job_dir = JOBS_DIR / job_id
        report = {
            "job_id": job_id,
            "timestamp": datetime.now().isoformat(),
            "validated": False,
            "exit_code": None,
            "quality_score": 0,
            "errors": [],
            "warnings": [],
            "successes": [],
            "learnings": [],
            "recommendations": [],
        }

        if not job_dir.exists():
            report["errors"].append(f"Job directory not found: {job_dir}")
            return report

        # Read output file
        output_file = job_dir / "output.log"
        output_content = ""
        if output_file.exists():
            try:
                output_content = output_file.read_text(errors='ignore')
            except Exception as e:
                report["warnings"].append(f"Could not read output: {e}")

        # Read metadata
        meta_file = job_dir / "meta.json"
        meta = {}
        if meta_file.exists():
            try:
                meta = json.loads(meta_file.read_text())
            except Exception as e:
                report["warnings"].append(f"Could not parse meta.json: {e}")

        report["project"] = meta.get("project", "unknown")
        report["task"] = meta.get("task", "")[:200]

        # Extract exit code
        report["exit_code"] = self._extract_exit_code(output_content)

        # Run validations
        report["errors"] = self._detect_errors(output_content)
        report["successes"] = self._detect_successes(output_content)
        report["learnings"] = self._extract_learnings(output_content)

        # Calculate quality score
        report["quality_score"] = self._calculate_quality_score(report)

        # Generate recommendations
        report["recommendations"] = self._generate_recommendations(report)

        report["validated"] = True

        # Save report
        self._save_report(report)

        # Capture learnings
        if report["learnings"]:
            self._capture_learnings(report)

        return report

    def _extract_exit_code(self, content: str) -> Optional[int]:
        """Extract the exit code from output.

        Expects the runner to have appended a marker such as "exit:0"
        to the captured output; returns None when no marker is found.
        """
        match = re.search(r'exit:(\d+)', content)
        if match:
            return int(match.group(1))
        return None

    def _detect_errors(self, content: str) -> List[Dict[str, Any]]:
        """Detect error patterns in output."""
        errors = []
        for pattern, error_type, severity in self.ERROR_PATTERNS:
            matches = re.findall(pattern, content, re.IGNORECASE)
            if not matches:
                continue
            # Capture context around the first match
            match = re.search(pattern, content, re.IGNORECASE)
            start = max(0, match.start() - 50)
            end = min(len(content), match.end() + 100)
            context = content[start:end].strip()
            errors.append({
                "type": error_type,
                "severity": severity,
                "count": len(matches),
                "context": context[:200],
            })
        return errors
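
    # Illustrative example (not part of the pipeline; result assumes the
    # ERROR_PATTERNS above):
    #   _detect_errors("ERROR: disk full")
    #   → [{"type": "error_detected", "severity": "high",
    #       "count": 1, "context": "ERROR: disk full"}]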

    def _detect_successes(self, content: str) -> List[Dict[str, str]]:
        """Detect success patterns in output."""
        successes = []
        for pattern, success_type in self.SUCCESS_PATTERNS:
            if re.search(pattern, content, re.IGNORECASE):
                successes.append({"type": success_type})
        return successes

    def _extract_learnings(self, content: str) -> List[Dict[str, str]]:
        """Extract learnings from output."""
        learnings = []
        for pattern, learning_type in self.LEARNING_PATTERNS:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                if len(match.strip()) > 10:  # Filter noise
                    learnings.append({
                        "type": learning_type,
                        "content": match.strip()[:500],
                    })
        return learnings
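
    # Illustrative example (hypothetical output line):
    #   _extract_learnings("Root cause: stale pip cache in the container")
    #   → [{"type": "root_cause", "content": "stale pip cache in the container"}]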

    def _calculate_quality_score(self, report: Dict) -> int:
        """Calculate quality score 0-100."""
        score = 50  # Base score

        # Exit code impact
        if report["exit_code"] == 0:
            score += 30
        elif report["exit_code"] is not None:
            score -= 20

        # Error impact
        for error in report["errors"]:
            if error["severity"] == "high":
                score -= 15
            elif error["severity"] == "medium":
                score -= 8

        # Success signals boost
        score += len(report["successes"]) * 5

        # Learnings boost (shows reflection)
        score += len(report["learnings"]) * 3

        return max(0, min(100, score))
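
    # Worked example (hypothetical report): exit_code 0 (+30), one high
    # error (-15), two success signals (+10), one learning (+3):
    #   50 + 30 - 15 + 10 + 3 = 78
    # The clamp to [0, 100] only matters at the extremes.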

    def _generate_recommendations(self, report: Dict) -> List[str]:
        """Generate actionable recommendations."""
        recs = []

        if report["exit_code"] is not None and report["exit_code"] != 0:
            recs.append("Task failed - review error logs and consider retry")

        for error in report["errors"]:
            if error["type"] == "config_error":
                recs.append("Configuration error detected - check environment variables")
            elif error["type"] == "permission_error":
                recs.append("Permission issue - verify file ownership and access rights")
            elif error["type"] == "timeout_error":
                recs.append("Timeout occurred - consider increasing timeout or optimizing task")
            elif error["type"] == "import_error":
                recs.append("Import error - check dependencies are installed")

        if report["quality_score"] < 50:
            recs.append("Low quality score - task may need review or retry")

        if not report["learnings"]:
            recs.append("No learnings captured - consider documenting key insights")

        return recs

    def _save_report(self, report: Dict):
        """Save QA report to file."""
        report_file = QA_REPORTS_DIR / f"{report['job_id']}.json"
        try:
            with open(report_file, 'w') as f:
                json.dump(report, f, indent=2, default=str)
            logger.info(f"QA report saved: {report_file}")
        except Exception as e:
            logger.error(f"Failed to save QA report: {e}")

    def _capture_learnings(self, report: Dict):
        """Append learnings to the JSONL learning log."""
        try:
            with open(LEARNING_LOG, 'a') as f:
                for learning in report["learnings"]:
                    entry = {
                        "timestamp": report["timestamp"],
                        "job_id": report["job_id"],
                        "project": report["project"],
                        "type": learning["type"],
                        "content": learning["content"],
                        "quality_score": report["quality_score"],
                    }
                    f.write(json.dumps(entry) + "\n")
            logger.info(f"Captured {len(report['learnings'])} learnings from {report['job_id']}")
        except Exception as e:
            logger.error(f"Failed to capture learnings: {e}")


class PerTaskLearning:
    """Per-task learning capture and KG integration."""

    def __init__(self):
        self.kg_path = Path("/etc/luz-knowledge/research.db")

    def capture_task_learning(self, job_id: str, report: Dict) -> Dict[str, Any]:
        """
        Capture learnings from a task and store them in the KG.

        Extracts:
        - Solutions found
        - Errors resolved
        - Patterns discovered
        - Tools/commands used
        """
        result = {
            "job_id": job_id,
            "learnings_stored": 0,
            "relations_created": 0,
        }

        if not report.get("learnings"):
            return result

        # Try to store in KG
        try:
            from knowledge_graph import KnowledgeGraph
            kg = KnowledgeGraph("research")

            for learning in report["learnings"]:
                # Create learning entity
                entity_name = f"learning_{job_id}_{learning['type']}"
                content = f"""
Project: {report.get('project', 'unknown')}
Task: {report.get('task', '')[:100]}
Type: {learning['type']}
Learning: {learning['content']}
Quality Score: {report.get('quality_score', 0)}
"""
                kg.add_entity(
                    name=entity_name,
                    entity_type="finding",
                    content=content,
                    metadata={
                        "job_id": job_id,
                        "project": report.get("project"),
                        "learning_type": learning["type"],
                        "quality_score": report.get("quality_score", 0),
                    },
                    source=f"job:{job_id}"
                )
                result["learnings_stored"] += 1

                # Create relation to project if it exists
                project = report.get("project")
                if project:
                    try:
                        kg.add_relation(entity_name, project, "learned_from")
                        result["relations_created"] += 1
                    except Exception:
                        pass

            logger.info(f"Stored {result['learnings_stored']} learnings in KG for {job_id}")

        except ImportError:
            logger.warning("KnowledgeGraph not available - learnings stored to log only")
        except Exception as e:
            logger.error(f"Failed to store learnings in KG: {e}")

        return result
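
# Illustrative: a job "abc123" whose output contained a "root cause:" line
# would yield an entity named "learning_abc123_root_cause", linked to its
# project via a "learned_from" relation (assuming the knowledge_graph
# module is importable in this environment).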


def _send_telegram_notification(report: Dict[str, Any]) -> bool:
    """
    Send a telegram notification for important task completions.

    Notifies for:
    - Task failures (exit_code != 0)
    - Low quality score (<50)
    - High severity errors
    """
    try:
        # Import telegram bridge
        import sys
        sys.path.insert(0, str(Path(__file__).parent))
        from telegram_bridge import notify_bruno as send_notification

        job_id = report.get("job_id", "unknown")[:8]
        project = report.get("project", "luzia")
        exit_code = report.get("exit_code")
        quality = report.get("quality_score", 0)

        # Determine whether a notification is needed, and its severity
        should_notify = False
        severity = "info"
        message = ""

        # Task failure. 137 usually means the process was killed (128 + SIGKILL)
        # and 126 a non-executable command; 254 is treated as critical here too.
        if exit_code is not None and exit_code != 0:
            should_notify = True
            severity = "critical" if exit_code in [126, 137, 254] else "warning"
            message = f"Task `{job_id}` failed (exit {exit_code})"

        # Low quality score
        elif quality < 50:
            should_notify = True
            severity = "warning"
            message = f"Task `{job_id}` low quality ({quality}/100)"

        # High severity errors detected
        elif any(e.get("severity") == "high" for e in report.get("errors", [])):
            should_notify = True
            severity = "warning"
            high_errors = [e["type"] for e in report.get("errors", []) if e.get("severity") == "high"]
            message = f"Task `{job_id}` errors: {', '.join(high_errors[:3])}"

        # Success with learnings (optional notification)
        elif exit_code == 0 and report.get("learnings"):
            # Only notify on success if there are significant learnings
            if len(report.get("learnings", [])) >= 2:
                should_notify = True
                severity = "info"
                message = f"Task `{job_id}` completed with {len(report['learnings'])} learnings"

        if should_notify:
            send_notification(message, project, job_id, severity)
            logger.info(f"Telegram notification sent for {job_id}")
            return True

    except ImportError:
        logger.debug("Telegram bridge not available - notification skipped")
    except Exception as e:
        logger.warning(f"Failed to send telegram notification: {e}")

    return False


def run_postflight(job_id: str) -> Dict[str, Any]:
    """
    Main entry point for postflight validation.

    Called after task completion to:
    1. Validate output quality
    2. Extract and store learnings
    3. Generate QA report
    4. Send telegram notification for important events
    """
    logger.info(f"Running postflight for job: {job_id}")

    qa = QAPostflight()
    report = qa.validate_task(job_id)

    # Per-task learning
    learning = PerTaskLearning()
    learning_result = learning.capture_task_learning(job_id, report)
    report["learning_result"] = learning_result

    # Send telegram notification for important events
    report["telegram_notified"] = _send_telegram_notification(report)

    # Log summary
    logger.info(
        f"Postflight complete for {job_id}: "
        f"score={report['quality_score']}, "
        f"errors={len(report['errors'])}, "
        f"learnings={len(report['learnings'])}, "
        f"notified={report.get('telegram_notified', False)}"
    )

    return report
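
# Programmatic use, a minimal sketch (job ID is hypothetical):
#
#   report = run_postflight("20250101-abc123")
#   if report["quality_score"] < 50:
#       ...  # queue a retry, flag for review, etc.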


# CLI interface
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: qa_postflight.py <job_id>")
        print("       qa_postflight.py --recent [count]")
        sys.exit(1)

    if sys.argv[1] == "--recent":
        # Run postflight on the most recently modified jobs
        count = int(sys.argv[2]) if len(sys.argv) > 2 else 5
        jobs = sorted(JOBS_DIR.iterdir(), key=lambda x: x.stat().st_mtime, reverse=True)[:count]
        for job_dir in jobs:
            job_id = job_dir.name
            print(f"\n=== Postflight: {job_id} ===")
            report = run_postflight(job_id)
            print(f"  Score: {report['quality_score']}/100")
            print(f"  Errors: {len(report['errors'])}")
            print(f"  Learnings: {len(report['learnings'])}")
            if report['recommendations']:
                print("  Recommendations:")
                for rec in report['recommendations'][:3]:
                    print(f"    - {rec}")
    else:
        job_id = sys.argv[1]
        report = run_postflight(job_id)
        print(json.dumps(report, indent=2, default=str))
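
# Example invocations (job ID hypothetical):
#   python3 qa_postflight.py 20250101-abc123   # validate one job, print JSON report
#   python3 qa_postflight.py --recent 10       # validate the 10 most recent jobs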