polish

2026-05-19 21:21:42 +02:00
parent 3c536241f6
commit 579185654b
9 changed files with 44 additions and 9 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -7,7 +7,8 @@
      "Bash(python -m pytest tests/test_scheduler.py -v)",
      "Bash(python -m pytest tests/test_monitor.py::TestMonitorEndpoints::test_monitor_data_structure -v)",
      "Bash(python -m pytest -q)",
-      "Bash(python -m pytest tests/test_config.py -q)"
+      "Bash(python -m pytest tests/test_config.py -q)",
+      "Bash(python -m pytest -x -q)"
    ]
  }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"

 [project]
 name = "llamacpp-ha"
-version = "0.8.0"
+version = "0.9.0"
 description = "Smart load balancer for llama.cpp servers"
 requires-python = ">=3.13"
 dependencies = [
--- a/src/llamacpp_ha/middleware.py
+++ b/src/llamacpp_ha/middleware.py
@@ -5,7 +5,7 @@ from fastapi.responses import JSONResponse
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.types import ASGIApp

-_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/"])
+_EXEMPT_PATHS = frozenset(["/monitor", "/monitor/", "/monitor/data", "/monitor/data/", "/health", "/health/"])


 class ApiKeyMiddleware(BaseHTTPMiddleware):
--- a/src/llamacpp_ha/queue.py
+++ b/src/llamacpp_ha/queue.py
@@ -57,8 +57,13 @@ class RequestQueue:

    async def snapshot(self, score_fn=None) -> list[dict]:
        async with self._lock:
+            entries = (
+                sorted(self._entries, key=score_fn, reverse=True)
+                if score_fn
+                else self._entries
+            )
            rows = []
-            for e in self._entries:
+            for e in entries:
                d: dict = {
                    "request_id": e.request_id,
                    "model_id": e.model_id,
--- a/src/llamacpp_ha/registry.py
+++ b/src/llamacpp_ha/registry.py
@@ -139,8 +139,8 @@ class BackendRegistry:
            models = state.models
            capacity = state.slot_capacity

-        was_live = state.live
        async with self._lock:
+            was_live = state.live
            state.live = live
            if live:
                state.models = models
--- a/src/llamacpp_ha/scheduler.py
+++ b/src/llamacpp_ha/scheduler.py
@@ -72,7 +72,7 @@ class Scheduler:
            else 0.0
        )
        # Periodic retry covers cases where no explicit wakeup fires (e.g. sticky-window expiry).
-        self._wakeup_interval = max(wakeup_interval, 0.1)
+        self._wakeup_interval = max(wakeup_interval, 1.0)
        self._task: asyncio.Task | None = None

    def notify_slot_released(self) -> None:
--- a/src/llamacpp_ha/session_store.py
+++ b/src/llamacpp_ha/session_store.py
@@ -60,7 +60,7 @@ class SessionStore:
            session = self._sessions[session_id]
            if model_id is not None:
                session.model_id = model_id
-            if messages is not None:
+            if messages:
                session.last_message_index = len(messages)
                session.prefix_hash = compute_prefix_hash([messages[-1]])
            if preferred_backend is not None:
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -44,12 +44,13 @@ class TestHealthEndpoint(unittest.TestCase):
        resp = client.get("/health")
        self.assertEqual(resp.status_code, 503)

-    def test_health_with_api_key_required(self):
+    def test_health_exempt_from_auth(self):
        cfg = _proxy_config(["http://b1"], api_keys=["mykey"])
        app = create_app(cfg)
        client = TestClient(app, raise_server_exceptions=False)
        resp = client.get("/health")
-        self.assertEqual(resp.status_code, 401)
+        # /health is auth-exempt; no live backends so 503, never 401
+        self.assertNotEqual(resp.status_code, 401)

    def test_health_with_valid_api_key(self):
        cfg = _proxy_config([], api_keys=["mykey"])
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -270,6 +270,34 @@ class TestScheduler(unittest.IsolatedAsyncioTestCase):
        self.assertTrue(e_m1.future.done())
        self.assertFalse(e_m2.future.done())

+    async def test_loop_retries_after_sticky_window_expires(self):
+        """Scheduler loop dispatches a blocked entry once the sticky window expires."""
+        b1 = _make_state("http://b1", models=["m1", "m2"])
+        scheduler, queue, _, slots, _ = self._make_scheduler([b1])
+        slots.set_capacity("http://b1", 2)
+        slots.set_model_unload_delay(0.05)
+
+        # Acquire and release m1 — starts a 0.05 s sticky window that blocks m2.
+        async with asyncio.timeout(1.0):
+            await slots.acquire("http://b1", "m1")
+        await slots.release("http://b1", "m1")
+
+        entry = _entry(model_id="m2")
+        await queue.enqueue(entry)
+        await scheduler._dispatch_all()
+        self.assertFalse(entry.future.done(), "m2 should be blocked by sticky window")
+
+        # Run the loop with a short interval so it retries before the test times out.
+        scheduler._wakeup_interval = 0.1
+        scheduler.start()
+        try:
+            async with asyncio.timeout(1.0):
+                result = await entry.future
+        finally:
+            await scheduler.stop()
+
+        self.assertEqual(result.url, "http://b1")
+
    async def test_aging_overtakes_warm_bonus(self):
        """After equalization time, an aged cold request outranks the warm bonus."""
        b1 = _make_state("http://b1", models=["m1", "m2"])