polish
This commit is contained in:
@@ -7,7 +7,8 @@
|
||||
"Bash(python -m pytest tests/test_scheduler.py -v)",
|
||||
"Bash(python -m pytest tests/test_monitor.py::TestMonitorEndpoints::test_monitor_data_structure -v)",
|
||||
"Bash(python -m pytest -q)",
|
||||
"Bash(python -m pytest tests/test_config.py -q)"
|
||||
"Bash(python -m pytest tests/test_config.py -q)",
|
||||
"Bash(python -m pytest -x -q)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "llamacpp-ha"
|
||||
version = "0.8.0"
|
||||
version = "0.9.0"
|
||||
description = "Smart load balancer for llama.cpp servers"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
|
||||
@@ -5,7 +5,7 @@ from fastapi.responses import JSONResponse
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
from starlette.types import ASGIApp
|
||||
|
||||
_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/"])
|
||||
_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/", "/health", "/health/"])
|
||||
|
||||
|
||||
class ApiKeyMiddleware(BaseHTTPMiddleware):
|
||||
|
||||
@@ -57,8 +57,13 @@ class RequestQueue:
|
||||
|
||||
async def snapshot(self, score_fn=None) -> list[dict]:
|
||||
async with self._lock:
|
||||
entries = (
|
||||
sorted(self._entries, key=score_fn, reverse=True)
|
||||
if score_fn
|
||||
else self._entries
|
||||
)
|
||||
rows = []
|
||||
for e in self._entries:
|
||||
for e in entries:
|
||||
d: dict = {
|
||||
"request_id": e.request_id,
|
||||
"model_id": e.model_id,
|
||||
|
||||
@@ -139,8 +139,8 @@ class BackendRegistry:
|
||||
models = state.models
|
||||
capacity = state.slot_capacity
|
||||
|
||||
was_live = state.live
|
||||
async with self._lock:
|
||||
was_live = state.live
|
||||
state.live = live
|
||||
if live:
|
||||
state.models = models
|
||||
|
||||
@@ -72,7 +72,7 @@ class Scheduler:
|
||||
else 0.0
|
||||
)
|
||||
# Periodic retry covers cases where no explicit wakeup fires (e.g. sticky-window expiry).
|
||||
self._wakeup_interval = max(wakeup_interval, 0.1)
|
||||
self._wakeup_interval = max(wakeup_interval, 1.0)
|
||||
self._task: asyncio.Task | None = None
|
||||
|
||||
def notify_slot_released(self) -> None:
|
||||
|
||||
@@ -60,7 +60,7 @@ class SessionStore:
|
||||
session = self._sessions[session_id]
|
||||
if model_id is not None:
|
||||
session.model_id = model_id
|
||||
if messages is not None:
|
||||
if messages:
|
||||
session.last_message_index = len(messages)
|
||||
session.prefix_hash = compute_prefix_hash([messages[-1]])
|
||||
if preferred_backend is not None:
|
||||
|
||||
@@ -44,12 +44,13 @@ class TestHealthEndpoint(unittest.TestCase):
|
||||
resp = client.get("/health")
|
||||
self.assertEqual(resp.status_code, 503)
|
||||
|
||||
def test_health_with_api_key_required(self):
|
||||
def test_health_exempt_from_auth(self):
|
||||
cfg = _proxy_config(["http://b1"], api_keys=["mykey"])
|
||||
app = create_app(cfg)
|
||||
client = TestClient(app, raise_server_exceptions=False)
|
||||
resp = client.get("/health")
|
||||
self.assertEqual(resp.status_code, 401)
|
||||
# /health is auth-exempt; no live backends so 503, never 401
|
||||
self.assertNotEqual(resp.status_code, 401)
|
||||
|
||||
def test_health_with_valid_api_key(self):
|
||||
cfg = _proxy_config([], api_keys=["mykey"])
|
||||
|
||||
@@ -270,6 +270,34 @@ class TestScheduler(unittest.IsolatedAsyncioTestCase):
|
||||
self.assertTrue(e_m1.future.done())
|
||||
self.assertFalse(e_m2.future.done())
|
||||
|
||||
async def test_loop_retries_after_sticky_window_expires(self):
|
||||
"""Scheduler loop dispatches a blocked entry once the sticky window expires."""
|
||||
b1 = _make_state("http://b1", models=["m1", "m2"])
|
||||
scheduler, queue, _, slots, _ = self._make_scheduler([b1])
|
||||
slots.set_capacity("http://b1", 2)
|
||||
slots.set_model_unload_delay(0.05)
|
||||
|
||||
# Acquire and release m1 — starts a 0.05 s sticky window that blocks m2.
|
||||
async with asyncio.timeout(1.0):
|
||||
await slots.acquire("http://b1", "m1")
|
||||
await slots.release("http://b1", "m1")
|
||||
|
||||
entry = _entry(model_id="m2")
|
||||
await queue.enqueue(entry)
|
||||
await scheduler._dispatch_all()
|
||||
self.assertFalse(entry.future.done(), "m2 should be blocked by sticky window")
|
||||
|
||||
# Run the loop with a short interval so it retries before the test times out.
|
||||
scheduler._wakeup_interval = 0.1
|
||||
scheduler.start()
|
||||
try:
|
||||
async with asyncio.timeout(1.0):
|
||||
result = await entry.future
|
||||
finally:
|
||||
await scheduler.stop()
|
||||
|
||||
self.assertEqual(result.url, "http://b1")
|
||||
|
||||
async def test_aging_overtakes_warm_bonus(self):
|
||||
"""After equalization time, an aged cold request outranks the warm bonus."""
|
||||
b1 = _make_state("http://b1", models=["m1", "m2"])
|
||||
|
||||
Reference in New Issue
Block a user