This commit is contained in:
2026-05-19 21:21:42 +02:00
parent 3c536241f6
commit 579185654b
9 changed files with 44 additions and 9 deletions

View File

@@ -7,7 +7,8 @@
"Bash(python -m pytest tests/test_scheduler.py -v)",
"Bash(python -m pytest tests/test_monitor.py::TestMonitorEndpoints::test_monitor_data_structure -v)",
"Bash(python -m pytest -q)",
"Bash(python -m pytest tests/test_config.py -q)"
"Bash(python -m pytest tests/test_config.py -q)",
"Bash(python -m pytest -x -q)"
]
}
}

View File

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
[project]
name = "llamacpp-ha"
version = "0.8.0"
version = "0.9.0"
description = "Smart load balancer for llama.cpp servers"
requires-python = ">=3.13"
dependencies = [

View File

@@ -5,7 +5,7 @@ from fastapi.responses import JSONResponse
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.types import ASGIApp
_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/"])
_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/", "/health", "/health/"])
class ApiKeyMiddleware(BaseHTTPMiddleware):

View File

@@ -57,8 +57,13 @@ class RequestQueue:
async def snapshot(self, score_fn=None) -> list[dict]:
async with self._lock:
entries = (
sorted(self._entries, key=score_fn, reverse=True)
if score_fn
else self._entries
)
rows = []
for e in self._entries:
for e in entries:
d: dict = {
"request_id": e.request_id,
"model_id": e.model_id,

View File

@@ -139,8 +139,8 @@ class BackendRegistry:
models = state.models
capacity = state.slot_capacity
was_live = state.live
async with self._lock:
was_live = state.live
state.live = live
if live:
state.models = models

View File

@@ -72,7 +72,7 @@ class Scheduler:
else 0.0
)
# Periodic retry covers cases where no explicit wakeup fires (e.g. sticky-window expiry).
self._wakeup_interval = max(wakeup_interval, 0.1)
self._wakeup_interval = max(wakeup_interval, 1.0)
self._task: asyncio.Task | None = None
def notify_slot_released(self) -> None:

View File

@@ -60,7 +60,7 @@ class SessionStore:
session = self._sessions[session_id]
if model_id is not None:
session.model_id = model_id
if messages is not None:
if messages:
session.last_message_index = len(messages)
session.prefix_hash = compute_prefix_hash([messages[-1]])
if preferred_backend is not None:

View File

@@ -44,12 +44,13 @@ class TestHealthEndpoint(unittest.TestCase):
resp = client.get("/health")
self.assertEqual(resp.status_code, 503)
def test_health_with_api_key_required(self):
def test_health_exempt_from_auth(self):
cfg = _proxy_config(["http://b1"], api_keys=["mykey"])
app = create_app(cfg)
client = TestClient(app, raise_server_exceptions=False)
resp = client.get("/health")
self.assertEqual(resp.status_code, 401)
# /health is auth-exempt; no live backends so 503, never 401
self.assertNotEqual(resp.status_code, 401)
def test_health_with_valid_api_key(self):
cfg = _proxy_config([], api_keys=["mykey"])

View File

@@ -270,6 +270,34 @@ class TestScheduler(unittest.IsolatedAsyncioTestCase):
self.assertTrue(e_m1.future.done())
self.assertFalse(e_m2.future.done())
async def test_loop_retries_after_sticky_window_expires(self):
"""Scheduler loop dispatches a blocked entry once the sticky window expires."""
b1 = _make_state("http://b1", models=["m1", "m2"])
scheduler, queue, _, slots, _ = self._make_scheduler([b1])
slots.set_capacity("http://b1", 2)
slots.set_model_unload_delay(0.05)
# Acquire and release m1 — starts a 0.05 s sticky window that blocks m2.
async with asyncio.timeout(1.0):
await slots.acquire("http://b1", "m1")
await slots.release("http://b1", "m1")
entry = _entry(model_id="m2")
await queue.enqueue(entry)
await scheduler._dispatch_all()
self.assertFalse(entry.future.done(), "m2 should be blocked by sticky window")
# Run the loop with a short interval so it retries before the test times out.
scheduler._wakeup_interval = 0.1
scheduler.start()
try:
async with asyncio.timeout(1.0):
result = await entry.future
finally:
await scheduler.stop()
self.assertEqual(result.url, "http://b1")
async def test_aging_overtakes_warm_bonus(self):
"""After equalization time, an aged cold request outranks the warm bonus."""
b1 = _make_state("http://b1", models=["m1", "m2"])