Refactor cockpit to use DockerTmuxController pattern
Based on claude-code-tools TmuxCLIController, this refactor:

- Adds DockerTmuxController class for robust tmux session management
- Implements send_keys() with configurable delay_enter
- Implements capture_pane() for output retrieval
- Implements wait_for_prompt() for pattern-based completion detection
- Implements wait_for_idle() for content-hash-based idle detection
- Implements wait_for_shell_prompt() for shell prompt detection

Also includes workflow improvements:

- Pre-task git snapshot before agent execution
- Post-task commit protocol in agent guidelines

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
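The DockerTmuxController class itself is not part of this diff, which only adds lib/time_metrics.py. For orientation, a minimal sketch of the pattern the message describes might look like the following; the method names come from the commit message, while the docker exec wrapping, the container/session parameters, and the polling defaults are illustrative assumptions rather than the actual implementation:

# Hypothetical sketch of the DockerTmuxController pattern described above.
# Not the implementation from this refactor; container_name, session and the
# polling defaults are assumptions for illustration only.
import hashlib
import re
import subprocess
import time
from typing import Optional


class DockerTmuxController:
    """Drive a tmux session running inside a Docker container."""

    def __init__(self, container_name: str, session: str = "main"):
        self.container_name = container_name
        self.session = session

    def _tmux(self, *args: str) -> str:
        """Run a tmux command inside the container and return its stdout."""
        cmd = ["docker", "exec", self.container_name, "tmux", *args]
        return subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

    def send_keys(self, text: str, delay_enter: float = 0.0) -> None:
        """Type literal text into the pane; optionally pause before pressing Enter."""
        self._tmux("send-keys", "-t", self.session, "-l", text)
        if delay_enter:
            time.sleep(delay_enter)
        self._tmux("send-keys", "-t", self.session, "Enter")

    def capture_pane(self) -> str:
        """Return the currently visible pane content."""
        return self._tmux("capture-pane", "-p", "-t", self.session)

    def wait_for_prompt(self, pattern: str, timeout: float = 60.0, poll: float = 1.0) -> bool:
        """Poll the pane until a regex pattern appears (completion detection)."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            if re.search(pattern, self.capture_pane()):
                return True
            time.sleep(poll)
        return False

    def wait_for_idle(self, idle_cycles: int = 3, poll: float = 2.0, timeout: float = 300.0) -> bool:
        """Treat the pane as idle once its content hash stops changing."""
        deadline = time.time() + timeout
        last_hash: Optional[str] = None
        stable = 0
        while time.time() < deadline:
            current = hashlib.sha256(self.capture_pane().encode()).hexdigest()
            stable = stable + 1 if current == last_hash else 0
            if stable >= idle_cycles:
                return True
            last_hash = current
            time.sleep(poll)
        return False

    def wait_for_shell_prompt(self, prompt_pattern: str = r"[$#]\s*$", timeout: float = 60.0) -> bool:
        """Wait until the end of the pane looks like a shell prompt."""
        return self.wait_for_prompt(prompt_pattern, timeout=timeout)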
lib/time_metrics.py (new file, 984 lines)
@@ -0,0 +1,984 @@
#!/usr/bin/env python3
"""
Time Metrics Module for Luzia Task Measurement & Reporting

Provides:
- UTC timestamp generation for task dispatch/completion
- Duration calculations
- System context snapshots (CPU, memory, disk)
- Timezone conversion support
- Causality tracking between tasks
- Performance baseline and anomaly detection

Uses Time MCP for accurate timezone-aware timestamps.
"""

import json
import os
import subprocess
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List
import sqlite3

# Default timezone for display
DEFAULT_TIMEZONE = "America/Montevideo"


# =============================================================================
# TIMESTAMP FUNCTIONS
# =============================================================================

def get_utc_now() -> str:
    """Get current UTC timestamp in ISO 8601 format.

    Returns format: 2026-01-11T03:31:57Z
    """
    return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")


def get_utc_now_with_offset() -> str:
    """Get current UTC timestamp with +00:00 offset.

    Returns format: 2026-01-11T03:31:57+00:00
    """
    return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S+00:00")


def parse_iso_timestamp(ts: str) -> Optional[datetime]:
    """Parse ISO 8601 timestamp to a naive UTC datetime object.

    Handles formats:
    - 2026-01-11T03:31:57Z
    - 2026-01-11T03:31:57+00:00
    - 2026-01-11T03:31:57-03:00
    - 2026-01-11T03:31:57 (assumes UTC)
    """
    if not ts:
        return None

    try:
        # Remove Z suffix and treat as UTC
        if ts.endswith('Z'):
            ts = ts[:-1]

        dt = datetime.fromisoformat(ts)

        # Normalize offset-aware values to naive UTC so they can be compared
        # safely with the naive UTC timestamps produced by get_utc_now()
        if dt.tzinfo is not None:
            dt = (dt - dt.utcoffset()).replace(tzinfo=None)

        return dt
    except (ValueError, TypeError):
        return None


def calculate_duration_seconds(start_ts: str, end_ts: str) -> Optional[float]:
    """Calculate duration between two ISO timestamps in seconds."""
    start = parse_iso_timestamp(start_ts)
    end = parse_iso_timestamp(end_ts)

    if start is None or end is None:
        return None

    return (end - start).total_seconds()


def format_duration(seconds: Optional[float]) -> str:
    """Format duration in HH:MM:SS format.

    Examples:
    - 3661 -> "01:01:01"
    - 45 -> "00:00:45"
    - 3700 -> "01:01:40"
    """
    if seconds is None or seconds < 0:
        return "--:--:--"

    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d}"


def format_duration_human(seconds: Optional[float]) -> str:
    """Format duration in human-readable format.

    Examples:
    - 45 -> "45s"
    - 125 -> "2m 5s"
    - 3661 -> "1h 1m 1s"
    - 90061 -> "1d 1h 1m"
    """
    if seconds is None or seconds < 0:
        return "unknown"

    if seconds < 60:
        return f"{int(seconds)}s"
    elif seconds < 3600:
        mins = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{mins}m {secs}s"
    elif seconds < 86400:
        hours = int(seconds // 3600)
        mins = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        return f"{hours}h {mins}m {secs}s"
    else:
        days = int(seconds // 86400)
        hours = int((seconds % 86400) // 3600)
        mins = int((seconds % 3600) // 60)
        return f"{days}d {hours}h {mins}m"


def elapsed_since(start_ts: str) -> str:
    """Get human-readable elapsed time since a timestamp."""
    now = get_utc_now()
    seconds = calculate_duration_seconds(start_ts, now)
    return format_duration_human(seconds)


# =============================================================================
# TIMEZONE CONVERSION
# =============================================================================

def convert_to_local_time(utc_ts: str, local_tz: str = DEFAULT_TIMEZONE) -> str:
    """Convert UTC timestamp to local timezone display string.

    Returns: "HH:MM:SS" in local timezone
    """
    dt = parse_iso_timestamp(utc_ts)
    if dt is None:
        return utc_ts

    # Get offset for the timezone (simplified approach - use pytz if available)
    try:
        import pytz
        utc = pytz.UTC
        local = pytz.timezone(local_tz)
        dt_utc = utc.localize(dt.replace(tzinfo=None) if dt.tzinfo else dt)
        dt_local = dt_utc.astimezone(local)
        return dt_local.strftime("%H:%M:%S")
    except ImportError:
        # Fallback: for America/Montevideo, offset is -03:00
        if local_tz == "America/Montevideo":
            local_dt = dt - timedelta(hours=3)
            return local_dt.strftime("%H:%M:%S")
        return dt.strftime("%H:%M:%S") + " UTC"


def format_timestamp_with_local(utc_ts: str, local_tz: str = DEFAULT_TIMEZONE) -> str:
    """Format timestamp showing both UTC and local time.

    Returns: "2026-01-11T03:31:57Z (America/Montevideo: 00:31:57)"
    """
    local_time = convert_to_local_time(utc_ts, local_tz)
    return f"{utc_ts} ({local_tz}: {local_time})"


# =============================================================================
# SYSTEM CONTEXT CAPTURE
# =============================================================================

def get_system_load() -> Tuple[float, float, float]:
    """Get system load averages (1, 5, 15 minute)."""
    try:
        return os.getloadavg()
    except OSError:
        return (0.0, 0.0, 0.0)


def get_memory_usage() -> Dict[str, Any]:
    """Get memory usage from /proc/meminfo."""
    try:
        with open("/proc/meminfo") as f:
            lines = f.readlines()

        mem = {}
        for line in lines:
            parts = line.split()
            if len(parts) >= 2:
                key = parts[0].rstrip(":")
                value = int(parts[1])  # kB
                mem[key] = value

        total_mb = mem.get("MemTotal", 0) // 1024
        available_mb = mem.get("MemAvailable", mem.get("MemFree", 0)) // 1024
        used_mb = total_mb - available_mb
        used_pct = int(100 * used_mb / total_mb) if total_mb else 0

        return {
            "total_mb": total_mb,
            "available_mb": available_mb,
            "used_mb": used_mb,
            "used_percent": used_pct
        }
    except Exception:
        return {
            "total_mb": 0,
            "available_mb": 0,
            "used_mb": 0,
            "used_percent": 0
        }


def get_disk_usage(path: str = "/") -> Dict[str, Any]:
    """Get disk usage for a path."""
    try:
        stat = os.statvfs(path)
        total = stat.f_blocks * stat.f_frsize
        free = stat.f_bavail * stat.f_frsize
        used = total - free
        used_pct = int(100 * used / total) if total else 0

        return {
            "total_gb": round(total / (1024**3), 1),
            "free_gb": round(free / (1024**3), 1),
            "used_gb": round(used / (1024**3), 1),
            "used_percent": used_pct
        }
    except Exception:
        return {
            "total_gb": 0,
            "free_gb": 0,
            "used_gb": 0,
            "used_percent": 0
        }


def capture_system_context() -> Dict[str, Any]:
    """Capture full system context snapshot.

    Returns dict suitable for storing in task metadata.
    """
    load = get_system_load()
    memory = get_memory_usage()
    disk = get_disk_usage("/")

    return {
        "timestamp": get_utc_now(),
        "system_load": [round(l, 2) for l in load],
        "memory": {
            "used_percent": memory["used_percent"],
            "available_mb": memory["available_mb"]
        },
        "disk": {
            "used_percent": disk["used_percent"],
            "free_gb": disk["free_gb"]
        }
    }


# =============================================================================
# TASK TIME METADATA
# =============================================================================

class TaskTimeTracker:
    """Track time metrics for a task throughout its lifecycle."""

    def __init__(self, task_id: str, project: str):
        self.task_id = task_id
        self.project = project
        self.dispatch_time: Optional[str] = None
        self.start_time: Optional[str] = None
        self.completion_time: Optional[str] = None
        self.dispatch_context: Optional[Dict] = None
        self.completion_context: Optional[Dict] = None
        self.agent_timezone: str = DEFAULT_TIMEZONE

    def mark_dispatched(self) -> Dict[str, Any]:
        """Mark task as dispatched, capturing context."""
        self.dispatch_time = get_utc_now()
        self.dispatch_context = capture_system_context()

        return {
            "dispatch": {
                "utc_time": self.dispatch_time,
                "agent_timezone": self.agent_timezone,
                "system_load": self.dispatch_context["system_load"],
                "memory_percent": self.dispatch_context["memory"]["used_percent"],
                "disk_percent": self.dispatch_context["disk"]["used_percent"]
            }
        }

    def mark_started(self) -> Dict[str, Any]:
        """Mark task as started (agent begins execution)."""
        self.start_time = get_utc_now()

        return {
            "start_time": self.start_time
        }

    def mark_completed(self, exit_code: int = 0) -> Dict[str, Any]:
        """Mark task as completed, capturing final context."""
        self.completion_time = get_utc_now()
        self.completion_context = capture_system_context()

        duration = calculate_duration_seconds(
            self.dispatch_time or self.start_time or self.completion_time,
            self.completion_time
        )

        return {
            "completion": {
                "utc_time": self.completion_time,
                "duration_seconds": duration,
                "duration_formatted": format_duration(duration),
                "exit_code": exit_code,
                "system_load": self.completion_context["system_load"],
                "memory_percent": self.completion_context["memory"]["used_percent"],
                "disk_percent": self.completion_context["disk"]["used_percent"]
            }
        }

    def get_full_metrics(self) -> Dict[str, Any]:
        """Get complete time metrics for the task."""
        metrics = {
            "task_id": self.task_id,
            "project": self.project,
            "agent_timezone": self.agent_timezone
        }

        if self.dispatch_time:
            metrics["dispatch"] = {
                "utc_time": self.dispatch_time,
                "local_time": convert_to_local_time(self.dispatch_time, self.agent_timezone),
                "context": self.dispatch_context
            }

        if self.start_time:
            metrics["start_time"] = self.start_time

        if self.completion_time:
            duration = calculate_duration_seconds(
                self.dispatch_time or self.start_time,
                self.completion_time
            )
            metrics["completion"] = {
                "utc_time": self.completion_time,
                "local_time": convert_to_local_time(self.completion_time, self.agent_timezone),
                "duration_seconds": duration,
                "duration_formatted": format_duration(duration),
                "context": self.completion_context
            }

        # Calculate elapsed if still running
        if self.dispatch_time and not self.completion_time:
            metrics["elapsed"] = elapsed_since(self.dispatch_time)
            metrics["status"] = "running"
        elif self.completion_time:
            metrics["status"] = "completed"
        else:
            metrics["status"] = "pending"

        return metrics


# =============================================================================
# CAUSALITY TRACKING
# =============================================================================

def get_recent_completions(limit: int = 10) -> List[Dict[str, Any]]:
    """Get recently completed tasks for causality tracking.

    Returns list of {task_id, project, completion_time} sorted by time.
    """
    jobs_dir = Path("/var/log/luz-orchestrator/jobs")
    completions = []

    if not jobs_dir.exists():
        return completions

    for job_dir in jobs_dir.iterdir():
        if not job_dir.is_dir():
            continue

        meta_file = job_dir / "meta.json"
        if not meta_file.exists():
            continue

        try:
            with open(meta_file) as f:
                meta = json.load(f)

            # Check for time_metrics with completion
            time_metrics = meta.get("time_metrics", {})
            completion = time_metrics.get("completion", {})

            if completion.get("utc_time"):
                completions.append({
                    "task_id": meta.get("id", job_dir.name),
                    "project": meta.get("project", "unknown"),
                    "completion_time": completion["utc_time"],
                    "duration_seconds": completion.get("duration_seconds")
                })
        except Exception:
            continue

    # Sort by completion time, newest first
    completions.sort(key=lambda x: x["completion_time"], reverse=True)
    return completions[:limit]


def find_prior_task(dispatch_time: str, project: str = None) -> Optional[Dict[str, Any]]:
    """Find the most recent task that completed before this dispatch.

    Useful for causality tracking: "This task started 5min after Task X completed"
    """
    recent = get_recent_completions(limit=20)

    for task in recent:
        # Skip if filtering by project and doesn't match
        if project and task["project"] != project:
            continue

        # Check if this task completed before our dispatch
        if task["completion_time"] < dispatch_time:
            # Calculate gap
            gap_seconds = calculate_duration_seconds(task["completion_time"], dispatch_time)
            return {
                **task,
                "gap_seconds": gap_seconds,
                "gap_formatted": format_duration_human(gap_seconds)
            }

    return None


# =============================================================================
# PERFORMANCE BASELINE & ANOMALY DETECTION
# =============================================================================

# Performance baseline database
BASELINE_DB = Path("/var/lib/luzia/metrics/baselines.db")


def init_baseline_db():
    """Initialize the performance baseline database."""
    BASELINE_DB.parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(BASELINE_DB)
    c = conn.cursor()

    # Task durations by project
    c.execute('''CREATE TABLE IF NOT EXISTS task_durations (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
                     project TEXT NOT NULL,
                     task_hash TEXT,
                     duration_seconds REAL NOT NULL,
                     exit_code INTEGER,
                     system_load_avg REAL,
                     memory_percent INTEGER,
                     recorded_at TEXT NOT NULL
                 )''')

    # Baseline statistics (calculated aggregates)
    c.execute('''CREATE TABLE IF NOT EXISTS baselines (
                     project TEXT PRIMARY KEY,
                     avg_duration REAL,
                     median_duration REAL,
                     stddev_duration REAL,
                     p95_duration REAL,
                     sample_count INTEGER,
                     updated_at TEXT
                 )''')

    c.execute('CREATE INDEX IF NOT EXISTS idx_durations_project ON task_durations(project)')
    c.execute('CREATE INDEX IF NOT EXISTS idx_durations_recorded ON task_durations(recorded_at)')

    conn.commit()
    return conn


def record_task_duration(project: str, duration_seconds: float,
                         exit_code: int = 0, task_hash: str = None,
                         system_load: float = None, memory_percent: int = None):
    """Record a task duration for baseline calculation."""
    conn = init_baseline_db()
    c = conn.cursor()

    c.execute('''INSERT INTO task_durations
                 (project, task_hash, duration_seconds, exit_code,
                  system_load_avg, memory_percent, recorded_at)
                 VALUES (?, ?, ?, ?, ?, ?, ?)''',
              (project, task_hash, duration_seconds, exit_code,
               system_load, memory_percent, get_utc_now()))

    conn.commit()
    conn.close()


def calculate_baseline(project: str) -> Dict[str, Any]:
    """Calculate performance baseline for a project.

    Uses last 30 days of successful tasks.
    """
    conn = init_baseline_db()
    c = conn.cursor()

    # Get durations from last 30 days, exit_code 0 only
    cutoff = (datetime.utcnow() - timedelta(days=30)).strftime("%Y-%m-%dT%H:%M:%SZ")

    c.execute('''SELECT duration_seconds FROM task_durations
                 WHERE project = ? AND exit_code = 0 AND recorded_at > ?
                 ORDER BY duration_seconds''',
              (project, cutoff))

    durations = [row[0] for row in c.fetchall()]

    if not durations:
        conn.close()
        return {
            "project": project,
            "sample_count": 0,
            "error": "No data available"
        }

    # Calculate statistics
    n = len(durations)
    avg = sum(durations) / n

    # Median
    mid = n // 2
    median = durations[mid] if n % 2 else (durations[mid-1] + durations[mid]) / 2

    # Standard deviation
    variance = sum((d - avg) ** 2 for d in durations) / n
    stddev = variance ** 0.5

    # 95th percentile
    p95_idx = int(n * 0.95)
    p95 = durations[min(p95_idx, n - 1)]

    baseline = {
        "project": project,
        "avg_duration": round(avg, 2),
        "median_duration": round(median, 2),
        "stddev_duration": round(stddev, 2),
        "p95_duration": round(p95, 2),
        "sample_count": n,
        "updated_at": get_utc_now()
    }

    # Store baseline
    c.execute('''INSERT OR REPLACE INTO baselines
                 (project, avg_duration, median_duration, stddev_duration,
                  p95_duration, sample_count, updated_at)
                 VALUES (?, ?, ?, ?, ?, ?, ?)''',
              (project, baseline["avg_duration"], baseline["median_duration"],
               baseline["stddev_duration"], baseline["p95_duration"],
               baseline["sample_count"], baseline["updated_at"]))

    conn.commit()
    conn.close()

    return baseline


def check_anomaly(project: str, duration_seconds: float) -> Dict[str, Any]:
    """Check if a task duration is anomalous compared to baseline.

    Returns:
    {
        "is_anomaly": bool,
        "severity": "normal"|"slow"|"very_slow"|"extreme",
        "ratio": float (duration / avg),
        "message": str
    }
    """
    conn = init_baseline_db()
    c = conn.cursor()

    c.execute('''SELECT avg_duration, stddev_duration, p95_duration, sample_count
                 FROM baselines WHERE project = ?''', (project,))
    row = c.fetchone()
    conn.close()

    if not row or row[3] < 5:  # Need at least 5 samples
        return {
            "is_anomaly": False,
            "severity": "unknown",
            "ratio": 0,
            "message": "Insufficient baseline data"
        }

    avg, stddev, p95, sample_count = row

    if avg <= 0:
        return {
            "is_anomaly": False,
            "severity": "unknown",
            "ratio": 0,
            "message": "Invalid baseline"
        }

    ratio = duration_seconds / avg

    # Classify severity
    if duration_seconds <= p95:
        severity = "normal"
        is_anomaly = False
        message = "Within normal range"
    elif duration_seconds <= avg + 2 * stddev:
        severity = "slow"
        is_anomaly = True
        message = f"Slower than average ({ratio:.1f}x normal)"
    elif duration_seconds <= avg + 3 * stddev:
        severity = "very_slow"
        is_anomaly = True
        message = f"Much slower than average ({ratio:.1f}x normal)"
    else:
        severity = "extreme"
        is_anomaly = True
        message = f"Extremely slow ({ratio:.1f}x normal)"

    return {
        "is_anomaly": is_anomaly,
        "severity": severity,
        "ratio": round(ratio, 2),
        "message": message,
        "baseline_avg": round(avg, 2),
        "baseline_p95": round(p95, 2)
    }
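
# Note on usage ordering (illustrative; project names and durations below are
# placeholders): check_anomaly() reads the `baselines` table, which is only
# refreshed by calculate_baseline(); record_task_duration() alone is not enough.
# A typical flow would be:
#
#     record_task_duration("dss", 312.0, exit_code=0)   # after each completed task
#     calculate_baseline("dss")                          # periodically, or post-run
#     check_anomaly("dss", 2400.0)                       # classify the next duration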


# =============================================================================
# AGGREGATE METRICS
# =============================================================================

def get_project_metrics(project: str, days: int = 7) -> Dict[str, Any]:
    """Get aggregate metrics for a project over a time period."""
    conn = init_baseline_db()
    c = conn.cursor()

    cutoff = (datetime.utcnow() - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%SZ")

    c.execute('''SELECT
                     COUNT(*) as total_tasks,
                     SUM(duration_seconds) as total_time,
                     AVG(duration_seconds) as avg_duration,
                     MIN(duration_seconds) as min_duration,
                     MAX(duration_seconds) as max_duration,
                     SUM(CASE WHEN exit_code = 0 THEN 1 ELSE 0 END) as success_count,
                     AVG(system_load_avg) as avg_load,
                     AVG(memory_percent) as avg_memory
                 FROM task_durations
                 WHERE project = ? AND recorded_at > ?''',
              (project, cutoff))

    row = c.fetchone()
    conn.close()

    if not row or row[0] == 0:
        return {
            "project": project,
            "period_days": days,
            "total_tasks": 0,
            "error": "No data in period"
        }

    total, total_time, avg, min_dur, max_dur, success, avg_load, avg_mem = row

    return {
        "project": project,
        "period_days": days,
        "total_tasks": total,
        "total_time_seconds": total_time or 0,
        "total_time_formatted": format_duration(total_time),
        "avg_duration_seconds": round(avg or 0, 2),
        "avg_duration_formatted": format_duration(avg),
        "min_duration_seconds": round(min_dur or 0, 2),
        "max_duration_seconds": round(max_dur or 0, 2),
        "success_rate": round(100 * success / total, 1) if total else 0,
        "avg_system_load": round(avg_load or 0, 2),
        "avg_memory_percent": round(avg_mem or 0, 1)
    }


def get_all_projects_metrics(days: int = 7) -> Dict[str, Any]:
    """Get aggregate metrics for all projects."""
    conn = init_baseline_db()
    c = conn.cursor()

    cutoff = (datetime.utcnow() - timedelta(days=days)).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Get per-project stats
    c.execute('''SELECT
                     project,
                     COUNT(*) as total_tasks,
                     SUM(duration_seconds) as total_time,
                     AVG(duration_seconds) as avg_duration,
                     SUM(CASE WHEN exit_code = 0 THEN 1 ELSE 0 END) as success_count
                 FROM task_durations
                 WHERE recorded_at > ?
                 GROUP BY project
                 ORDER BY total_time DESC''',
              (cutoff,))

    by_project = {}
    total_tasks = 0
    total_time = 0

    for row in c.fetchall():
        project, tasks, time_sec, avg, success = row
        by_project[project] = {
            "total_tasks": tasks,
            "total_time_seconds": time_sec or 0,
            "total_time_formatted": format_duration(time_sec),
            "avg_duration_seconds": round(avg or 0, 2),
            "success_rate": round(100 * success / tasks, 1) if tasks else 0
        }
        total_tasks += tasks
        total_time += (time_sec or 0)

    # Get longest running tasks
    c.execute('''SELECT project, task_hash, duration_seconds, recorded_at
                 FROM task_durations
                 WHERE recorded_at > ?
                 ORDER BY duration_seconds DESC
                 LIMIT 10''',
              (cutoff,))

    longest_tasks = []
    for row in c.fetchall():
        longest_tasks.append({
            "project": row[0],
            "task_hash": row[1],
            "duration_seconds": row[2],
            "duration_formatted": format_duration(row[2]),
            "recorded_at": row[3]
        })

    conn.close()

    return {
        "period_days": days,
        "total_tasks": total_tasks,
        "total_time_seconds": total_time,
        "total_time_formatted": format_duration(total_time),
        "by_project": by_project,
        "longest_tasks": longest_tasks
    }


# =============================================================================
# TIME-BUCKETED SUCCESS RATES
# =============================================================================

def get_success_by_duration_bucket(project: str = None) -> Dict[str, Any]:
    """Get task success rate by execution time bucket.

    Buckets: <1min, 1-5min, 5-15min, 15-30min, 30-60min, >60min
    """
    conn = init_baseline_db()
    c = conn.cursor()

    buckets = [
        ("under_1m", 0, 60),
        ("1_to_5m", 60, 300),
        ("5_to_15m", 300, 900),
        ("15_to_30m", 900, 1800),
        ("30_to_60m", 1800, 3600),
        ("over_60m", 3600, float('inf'))
    ]

    results = {}

    for bucket_name, min_dur, max_dur in buckets:
        if project:
            c.execute('''SELECT
                             COUNT(*) as total,
                             SUM(CASE WHEN exit_code = 0 THEN 1 ELSE 0 END) as success
                         FROM task_durations
                         WHERE project = ? AND duration_seconds >= ? AND duration_seconds < ?''',
                      (project, min_dur, max_dur if max_dur != float('inf') else 1e10))
        else:
            c.execute('''SELECT
                             COUNT(*) as total,
                             SUM(CASE WHEN exit_code = 0 THEN 1 ELSE 0 END) as success
                         FROM task_durations
                         WHERE duration_seconds >= ? AND duration_seconds < ?''',
                      (min_dur, max_dur if max_dur != float('inf') else 1e10))

        row = c.fetchone()
        total, success = row[0] or 0, row[1] or 0

        results[bucket_name] = {
            "total": total,
            "success": success,
            "success_rate": round(100 * success / total, 1) if total else 0
        }

    conn.close()
    return results


# =============================================================================
# DISPLAY FORMATTERS FOR LUZIA
# =============================================================================

def format_job_with_timing(job: Dict[str, Any]) -> str:
    """Format a job entry for 'luzia jobs' output with timing info.

    Returns: "agent:dss:002746 | dss | in_progress | 14:30:00 UTC | 00:45:30 | CPU: 0.52"
    """
    job_id = job.get("id", "unknown")
    project = job.get("project", "unknown")
    status = job.get("status", "unknown")

    time_metrics = job.get("time_metrics", {})
    dispatch = time_metrics.get("dispatch", {})
    completion = time_metrics.get("completion", {})

    # Dispatch time
    dispatch_time = dispatch.get("utc_time", job.get("started", ""))
    if dispatch_time:
        dispatch_display = dispatch_time[11:19]  # HH:MM:SS from ISO
    else:
        dispatch_display = "--:--:--"

    # Duration
    if completion.get("duration_formatted"):
        duration = completion["duration_formatted"]
    elif dispatch_time:
        duration = elapsed_since(dispatch_time)
    else:
        duration = "--:--:--"

    # System load
    load = dispatch.get("system_load", [0, 0, 0])
    if isinstance(load, list) and len(load) > 0:
        load_display = f"{load[0]:.2f}"
    else:
        load_display = "-.--"

    return f"{job_id} | {project} | {status} | {dispatch_display} UTC | {duration} | CPU: {load_display}"


def format_logs_header(job: Dict[str, Any], local_tz: str = DEFAULT_TIMEZONE) -> str:
    """Format the header for 'luzia logs' output with timing info.

    Returns a formatted header block with timing and system info.
    """
    job_id = job.get("id", "unknown")
    project = job.get("project", "unknown")
    status = job.get("status", "unknown")

    time_metrics = job.get("time_metrics", {})
    dispatch = time_metrics.get("dispatch", {})
    completion = time_metrics.get("completion", {})

    lines = [
        "═" * 65,
        f"Job: agent:{project}:{job_id}",
        f"Agent: {project}",
    ]

    # Dispatch time with local
    dispatch_time = dispatch.get("utc_time", job.get("started", ""))
    if dispatch_time:
        local_time = convert_to_local_time(dispatch_time, local_tz)
        lines.append(f"Dispatched: {dispatch_time} ({local_tz}: {local_time})")

    # Status with duration
    if completion.get("utc_time"):
        duration = completion.get("duration_formatted", "--:--:--")
        lines.append(f"Status: completed (took {duration})")
    elif dispatch_time:
        elapsed = elapsed_since(dispatch_time)
        lines.append(f"Status: {status} (running for {elapsed})")
    else:
        lines.append(f"Status: {status}")

    # System context
    ctx = dispatch if dispatch else completion
    if ctx:
        load = ctx.get("system_load", [0, 0, 0])
        mem = ctx.get("memory_percent", 0)
        disk = ctx.get("disk_percent", 0)
        if isinstance(load, list) and len(load) > 0:
            lines.append(f"System: CPU {load[0]:.2f}, Memory {mem}%, Disk {disk}%")

    lines.append("═" * 65)

    return "\n".join(lines)


# =============================================================================
# CONVENIENCE FUNCTION FOR LUZIA INTEGRATION
# =============================================================================

def create_task_time_metadata(task_id: str, project: str) -> Dict[str, Any]:
    """Create initial time metadata for a new task dispatch.

    Call this from spawn_claude_agent() to add time tracking.
    """
    tracker = TaskTimeTracker(task_id, project)
    dispatch_data = tracker.mark_dispatched()

    return {
        "time_metrics": dispatch_data,
        "time_tracker_data": {
            "task_id": task_id,
            "project": project,
            "dispatch_time": tracker.dispatch_time,
            "agent_timezone": tracker.agent_timezone
        }
    }


def update_task_completion_metadata(meta: Dict[str, Any], exit_code: int = 0) -> Dict[str, Any]:
    """Update task metadata with completion time metrics.

    Call this when a job completes to finalize time tracking.
    """
    tracker_data = meta.get("time_tracker_data", {})

    tracker = TaskTimeTracker(
        tracker_data.get("task_id", "unknown"),
        tracker_data.get("project", "unknown")
    )
    tracker.dispatch_time = tracker_data.get("dispatch_time")
    tracker.agent_timezone = tracker_data.get("agent_timezone", DEFAULT_TIMEZONE)

    completion_data = tracker.mark_completed(exit_code)

    # Merge with existing time_metrics
    time_metrics = meta.get("time_metrics", {})
    time_metrics.update(completion_data)

    # Record for baseline
    duration = completion_data.get("completion", {}).get("duration_seconds")
    if duration:
        try:
            record_task_duration(
                tracker.project,
                duration,
                exit_code,
                system_load=tracker.completion_context.get("system_load", [0])[0],
                memory_percent=tracker.completion_context.get("memory", {}).get("used_percent")
            )
        except Exception:
            pass  # Don't fail on baseline recording errors

    return {"time_metrics": time_metrics}


if __name__ == "__main__":
    # Quick test
    print("Time Metrics Module Test")
    print("=" * 40)
    print(f"Current UTC: {get_utc_now()}")
    print(f"System Context: {json.dumps(capture_system_context(), indent=2)}")

    # Test tracker
    tracker = TaskTimeTracker("test-001", "admin")
    print(f"\nDispatch: {json.dumps(tracker.mark_dispatched(), indent=2)}")

    import time as t
    t.sleep(1)

    print(f"\nCompletion: {json.dumps(tracker.mark_completed(0), indent=2)}")
    print(f"\nFull Metrics: {json.dumps(tracker.get_full_metrics(), indent=2)}")
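
    # Illustrative extension of the quick test (not part of the original module):
    # exercise the luzia convenience helpers end to end. The flat `meta` dict
    # stands in for a job's meta.json contents and is an assumption for this demo.
    meta = {}
    meta.update(create_task_time_metadata("test-002", "admin"))
    t.sleep(1)
    meta.update(update_task_completion_metadata(meta, exit_code=0))
    print(f"\nConvenience flow: {json.dumps(meta['time_metrics'], indent=2)}")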