Files
llamacpp-ha/src/llamacpp_ha/monitor.py
2026-05-18 01:02:57 +02:00

319 lines
13 KiB
Python

from __future__ import annotations
import time
from dataclasses import dataclass, field
from fastapi import APIRouter
from fastapi.responses import HTMLResponse, JSONResponse
from .queue import RequestQueue
from .registry import BackendRegistry
from .session_store import SessionStore
from .slot_tracker import SlotTracker
@dataclass
class ProxyStats:
"""Per-app counters and timing. Created by create_app, passed to build_router."""
start_time: float = field(default_factory=time.monotonic)
total_requests: int = 0
session_hits: int = 0
session_misses: int = 0
new_sessions: int = 0
model_requests: dict[str, int] = field(default_factory=dict)
model_tokens: dict[str, int] = field(default_factory=dict)
backend_requests: dict[str, int] = field(default_factory=dict)
backend_session_hits: dict[str, int] = field(default_factory=dict)
backend_session_misses: dict[str, int] = field(default_factory=dict)
def increment_requests(self) -> None:
self.total_requests += 1
def record_model(self, model_id: str, tokens: int | None) -> None:
if not model_id:
return
self.model_requests[model_id] = self.model_requests.get(model_id, 0) + 1
if tokens:
self.model_tokens[model_id] = self.model_tokens.get(model_id, 0) + tokens
def record_backend(self, url: str) -> None:
self.backend_requests[url] = self.backend_requests.get(url, 0) + 1
def record_session(self, preferred_url: str | None, actual_url: str) -> None:
if preferred_url:
if actual_url == preferred_url:
self.session_hits += 1
self.backend_session_hits[actual_url] = self.backend_session_hits.get(actual_url, 0) + 1
else:
self.session_misses += 1
# Count miss against the preferred backend (the one that was expected but missed).
self.backend_session_misses[preferred_url] = self.backend_session_misses.get(preferred_url, 0) + 1
else:
self.new_sessions += 1
def session_hit_rate(self) -> int | None:
total = self.session_hits + self.session_misses
return round(self.session_hits / total * 100) if total else None
def uptime_str(self) -> str:
secs = int(time.monotonic() - self.start_time)
h, remainder = divmod(secs, 3600)
m, s = divmod(remainder, 60)
return f"{h:02d}:{m:02d}:{s:02d}"
# Self-contained dashboard page (markup, CSS and script in a single string),
# returned verbatim by the /monitor route.
#
# The inline script fetches /monitor/data once on load and then every 3000 ms,
# and rebuilds the summary tiles plus the Backends, Queue, Model Stats and
# Backend Stats tables from that JSON. Summary tiles are written through
# textContent. String fields placed into innerHTML go through esc(), which
# escapes &, < and >; slot counts and fmt()-formatted counters are inserted
# directly. fmt() abbreviates values >= 1000 with a "k" suffix. A failed
# fetch only replaces the #status text with the error message.
_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>llamacpp-ha Monitor</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body { font-family: monospace; background: #0d1117; color: #c9d1d9; padding: 20px; }
h1 { color: #58a6ff; margin-bottom: 4px; font-size: 1.4em; }
.subtitle { color: #8b949e; font-size: 0.85em; margin-bottom: 20px; }
h2 { color: #79c0ff; margin: 20px 0 8px; font-size: 1em; text-transform: uppercase; letter-spacing: 1px; }
table { width: 100%; border-collapse: collapse; font-size: 0.85em; }
th { background: #161b22; color: #8b949e; text-align: left; padding: 6px 10px; border-bottom: 1px solid #30363d; }
td { padding: 6px 10px; border-bottom: 1px solid #21262d; }
tr:hover td { background: #161b22; }
.badge { display: inline-block; padding: 2px 8px; border-radius: 10px; font-size: 0.8em; }
.badge-live { background: #0f2c12; color: #3fb950; }
.badge-dead { background: #2c0f0f; color: #f85149; }
.slots { color: #d29922; }
.empty { color: #484f58; font-style: italic; }
.hit { color: #3fb950; }
.miss { color: #f85149; }
#status { float: right; font-size: 0.8em; color: #8b949e; }
.summary { display: flex; gap: 20px; flex-wrap: wrap; margin: 10px 0 20px; }
.stat { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 10px 16px; }
.stat-val { font-size: 1.6em; color: #58a6ff; }
.stat-label { font-size: 0.75em; color: #8b949e; margin-top: 2px; }
.num { text-align: right; }
</style>
</head>
<body>
<h1>llamacpp-ha <span id="status">loading...</span></h1>
<div class="subtitle">Smart Load Balancer for llama.cpp</div>
<div class="summary">
<div class="stat"><div class="stat-val" id="uptime">-</div><div class="stat-label">Uptime</div></div>
<div class="stat"><div class="stat-val" id="total-req">-</div><div class="stat-label">Requests Served</div></div>
<div class="stat"><div class="stat-val" id="queue-depth">-</div><div class="stat-label">Queue Depth</div></div>
<div class="stat"><div class="stat-val" id="session-count">-</div><div class="stat-label">Active Sessions</div></div>
<div class="stat"><div class="stat-val" id="live-count">-</div><div class="stat-label">Live Backends</div></div>
<div class="stat"><div class="stat-val" id="hit-rate">-</div><div class="stat-label">Session Hit Rate</div></div>
</div>
<h2>Backends</h2>
<table>
<thead><tr><th>URL</th><th>Status</th><th>Active Model</th><th>Models</th><th>Slots</th><th class="num">Requests</th><th>Last Poll</th></tr></thead>
<tbody id="backends-body"><tr><td colspan="7" class="empty">Loading...</td></tr></tbody>
</table>
<h2>Queue</h2>
<table>
<thead><tr><th>Request ID</th><th>Model</th><th>Session</th><th>Wait (s)</th><th>Est. Tokens</th><th>Skips</th></tr></thead>
<tbody id="queue-body"><tr><td colspan="6" class="empty">Queue is empty</td></tr></tbody>
</table>
<h2>Model Stats</h2>
<table>
<thead><tr><th>Model</th><th class="num">Requests</th><th class="num">Est. Tokens</th><th class="num">Active Sessions</th></tr></thead>
<tbody id="model-body"><tr><td colspan="4" class="empty">No data yet</td></tr></tbody>
</table>
<h2>Backend Stats</h2>
<table>
<thead><tr><th>Backend</th><th class="num">Requests</th><th class="num">Share</th><th>Session Affinity</th></tr></thead>
<tbody id="backend-stats-body"><tr><td colspan="4" class="empty">No data yet</td></tr></tbody>
</table>
<script>
(function() {
function esc(s) {
return String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');
}
function fmt(n) {
return n >= 1000 ? (n/1000).toFixed(1) + 'k' : String(n);
}
function render(data) {
document.getElementById('uptime').textContent = data.uptime;
document.getElementById('total-req').textContent = data.total_requests;
document.getElementById('queue-depth').textContent = data.queue_depth;
document.getElementById('session-count').textContent = data.session_count;
document.getElementById('live-count').textContent = data.live_backend_count;
const hr = data.session_hit_rate;
const hrEl = document.getElementById('hit-rate');
if (hr == null) {
hrEl.textContent = 'N/A';
hrEl.className = 'stat-val';
} else {
hrEl.textContent = hr + '%';
hrEl.className = 'stat-val ' + (hr >= 80 ? 'hit' : hr >= 50 ? 'slots' : 'miss');
}
const bBody = document.getElementById('backends-body');
if (!data.backends.length) {
bBody.innerHTML = '<tr><td colspan="7" class="empty">No backends configured</td></tr>';
} else {
bBody.innerHTML = data.backends.map(b => {
const badge = b.live
? '<span class="badge badge-live">live</span>'
: '<span class="badge badge-dead">dead</span>';
const active = b.active_models.length
? b.active_models.map(m => `<span class="badge badge-live">${esc(m)}</span>`).join(' ')
: '<span class="empty">idle</span>';
const models = b.models.length ? esc(b.models.join(', ')) : '<span class="empty">none</span>';
const slots = `<span class="slots">${b.slots_acquired}/${b.slots_total}</span>`;
const age = b.last_poll_age == null ? '<span class="empty">never</span>' : esc(b.last_poll_age.toFixed(1)) + 's';
const reqs = b.requests > 0 ? fmt(b.requests) : '<span class="empty">0</span>';
return `<tr><td>${esc(b.url)}</td><td>${badge}</td><td>${active}</td><td>${models}</td><td>${slots}</td><td class="num">${reqs}</td><td>${age}</td></tr>`;
}).join('');
}
const qBody = document.getElementById('queue-body');
if (!data.queue.length) {
qBody.innerHTML = '<tr><td colspan="6" class="empty">Queue is empty</td></tr>';
} else {
qBody.innerHTML = data.queue.map(e => {
const tok = e.estimated_tokens != null ? esc(e.estimated_tokens) : '<span class="empty">-</span>';
const sid = e.session_id ? esc(e.session_id) : '<span class="empty">-</span>';
return `<tr><td>${esc(e.request_id.slice(0,12))}</td><td>${esc(e.model_id||'-')}</td><td>${sid}</td><td>${esc(e.wait_seconds.toFixed(2))}</td><td>${tok}</td><td>${esc(e.skip_count)}</td></tr>`;
}).join('');
}
const mBody = document.getElementById('model-body');
const ms = data.model_stats;
const mKeys = Object.keys(ms).sort((a,b) => ms[b].requests - ms[a].requests);
if (!mKeys.length) {
mBody.innerHTML = '<tr><td colspan="4" class="empty">No data yet</td></tr>';
} else {
mBody.innerHTML = mKeys.map(m => {
const s = ms[m];
const tok = s.estimated_tokens > 0 ? fmt(s.estimated_tokens) : '<span class="empty">-</span>';
const sess = s.active_sessions > 0 ? s.active_sessions : '<span class="empty">0</span>';
return `<tr><td>${esc(m)}</td><td class="num">${fmt(s.requests)}</td><td class="num">${tok}</td><td class="num">${sess}</td></tr>`;
}).join('');
}
const bsBody = document.getElementById('backend-stats-body');
const bs = data.backend_stats;
const bsKeys = Object.keys(bs).sort((a,b) => bs[b].requests - bs[a].requests);
if (!bsKeys.length || data.total_requests === 0) {
bsBody.innerHTML = '<tr><td colspan="4" class="empty">No data yet</td></tr>';
} else {
bsBody.innerHTML = bsKeys.map(url => {
const s = bs[url];
const share = data.total_requests > 0
? Math.round(s.requests / data.total_requests * 100) + '%'
: '<span class="empty">-</span>';
const affinity = s.session_hits + s.session_misses > 0
? Math.round(s.session_hits / (s.session_hits + s.session_misses) * 100) + '% hit'
: '<span class="empty">-</span>';
return `<tr><td>${esc(url)}</td><td class="num">${fmt(s.requests)}</td><td class="num">${share}</td><td>${affinity}</td></tr>`;
}).join('');
}
document.getElementById('status').textContent = 'updated ' + new Date().toLocaleTimeString();
}
function poll() {
fetch('/monitor/data')
.then(r => r.json())
.then(render)
.catch(err => {
document.getElementById('status').textContent = 'error: ' + err.message;
});
}
poll();
setInterval(poll, 3000);
})();
</script>
</body>
</html>
"""
def build_router(
    registry: BackendRegistry,
    slot_tracker: SlotTracker,
    request_queue: RequestQueue,
    session_store: SessionStore,
    stats: ProxyStats,
) -> APIRouter:
    """Assemble the monitoring routes for one proxy application.

    Two GET routes are registered, both hidden from the OpenAPI schema:

    * ``/monitor`` returns the static dashboard page held in ``_HTML``.
    * ``/monitor/data`` returns a JSON snapshot combining backend state from
      *registry*, slot usage from *slot_tracker*, the pending entries of
      *request_queue*, session counts from *session_store* and the
      cumulative counters in *stats*.

    Returns:
        The router carrying both routes.
    """
    router = APIRouter()

    def _backend_row(state) -> dict:
        """Build the JSON row describing one backend state."""
        used, capacity = slot_tracker.usage(state.url)
        poll_age = state.last_poll_age
        return {
            "url": state.url,
            "live": state.live,
            "active_models": sorted(slot_tracker.active_model_set(state.url)),
            "models": list(state.models),
            "slots_acquired": used,
            "slots_total": capacity,
            "requests": stats.backend_requests.get(state.url, 0),
            # An infinite age is reported as null, which the dashboard
            # renders as "never".
            "last_poll_age": round(poll_age, 1) if poll_age != float("inf") else None,
        }

    @router.get("/monitor", response_class=HTMLResponse, include_in_schema=False)
    async def monitor_page() -> HTMLResponse:
        """Serve the dashboard page."""
        return HTMLResponse(content=_HTML)

    @router.get("/monitor/data", include_in_schema=False)
    async def monitor_data() -> JSONResponse:
        """Return the current monitoring snapshot as JSON."""
        states = registry.get_all_states()
        backend_rows = [_backend_row(state) for state in states]

        pending = await request_queue.snapshot()
        active_sessions = await session_store.count()
        sessions_per_model = await session_store.count_by_model()
        live_backends = len([state for state in states if state.live])

        # A model appears if it has served requests or currently owns sessions.
        known_models = set(stats.model_requests) | set(sessions_per_model)
        per_model = {}
        for model in known_models:
            per_model[model] = {
                "requests": stats.model_requests.get(model, 0),
                "estimated_tokens": stats.model_tokens.get(model, 0),
                "active_sessions": sessions_per_model.get(model, 0),
            }

        # Only backends that have served at least one request are listed here,
        # each with its session-affinity hit/miss tallies.
        per_backend = {}
        for backend_url, served in stats.backend_requests.items():
            per_backend[backend_url] = {
                "requests": served,
                "session_hits": stats.backend_session_hits.get(backend_url, 0),
                "session_misses": stats.backend_session_misses.get(backend_url, 0),
            }

        payload = {
            "uptime": stats.uptime_str(),
            "total_requests": stats.total_requests,
            "queue_depth": len(pending),
            "session_count": active_sessions,
            "live_backend_count": live_backends,
            "session_hits": stats.session_hits,
            "session_misses": stats.session_misses,
            "session_hit_rate": stats.session_hit_rate(),
            "backends": backend_rows,
            "queue": pending,
            "model_stats": per_model,
            "backend_stats": per_backend,
        }
        return JSONResponse(payload)

    return router