From 7cf16dcaceac2eb08aaa2a95688628401df4ad1a Mon Sep 17 00:00:00 2001 From: chacha Date: Mon, 18 May 2026 01:02:57 +0200 Subject: [PATCH] improve cache --- .claude/settings.local.json | 3 ++- src/llamacpp_ha/monitor.py | 15 ++++++++++----- src/llamacpp_ha/proxy.py | 6 ++---- src/llamacpp_ha/session_store.py | 4 ++-- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 787d316..36754ea 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -5,7 +5,8 @@ "Bash(python -m pytest --tb=short -q)", "Bash(python -m pytest tests/test_slot_tracker.py -v)", "Bash(python -m pytest tests/test_scheduler.py -v)", - "Bash(python -m pytest tests/test_monitor.py::TestMonitorEndpoints::test_monitor_data_structure -v)" + "Bash(python -m pytest tests/test_monitor.py::TestMonitorEndpoints::test_monitor_data_structure -v)", + "Bash(python -m pytest -q)" ] } } diff --git a/src/llamacpp_ha/monitor.py b/src/llamacpp_ha/monitor.py index 9d3c9c7..34663b3 100644 --- a/src/llamacpp_ha/monitor.py +++ b/src/llamacpp_ha/monitor.py @@ -23,6 +23,8 @@ class ProxyStats: model_requests: dict[str, int] = field(default_factory=dict) model_tokens: dict[str, int] = field(default_factory=dict) backend_requests: dict[str, int] = field(default_factory=dict) + backend_session_hits: dict[str, int] = field(default_factory=dict) + backend_session_misses: dict[str, int] = field(default_factory=dict) def increment_requests(self) -> None: self.total_requests += 1 @@ -37,13 +39,16 @@ class ProxyStats: def record_backend(self, url: str) -> None: self.backend_requests[url] = self.backend_requests.get(url, 0) + 1 - def record_session(self, had_session: bool, preferred_url: str | None, actual_url: str) -> None: - if had_session and preferred_url: + def record_session(self, preferred_url: str | None, actual_url: str) -> None: + if preferred_url: if actual_url == preferred_url: self.session_hits += 1 + self.backend_session_hits[actual_url] = self.backend_session_hits.get(actual_url, 0) + 1 else: self.session_misses += 1 - elif not had_session: + # Count miss against the preferred backend (the one that was expected but missed). + self.backend_session_misses[preferred_url] = self.backend_session_misses.get(preferred_url, 0) + 1 + else: self.new_sessions += 1 def session_hit_rate(self) -> int | None: @@ -287,8 +292,8 @@ def build_router( backend_stats = { url: { "requests": count, - "session_hits": 0, - "session_misses": 0, + "session_hits": stats.backend_session_hits.get(url, 0), + "session_misses": stats.backend_session_misses.get(url, 0), } for url, count in stats.backend_requests.items() } diff --git a/src/llamacpp_ha/proxy.py b/src/llamacpp_ha/proxy.py index bd743b4..3ac80df 100644 --- a/src/llamacpp_ha/proxy.py +++ b/src/llamacpp_ha/proxy.py @@ -281,9 +281,7 @@ async def _inference_endpoint( if not incoming_session_id: await _recover_session_affinity(session_id, body.get("messages") or [], session_store) - preferred_url: str | None = None - if incoming_session_id: - preferred_url = await session_store.get_preferred_backend(session_id) + preferred_url = await session_store.get_preferred_backend(session_id) result = await _dispatch_entry( request_queue, stats, config, slot_tracker, scheduler, model_id, session_id, body @@ -319,7 +317,7 @@ async def _inference_endpoint( stats.record_model(model_id, _estimate_tokens(body)) stats.record_backend(backend.url) - stats.record_session(bool(incoming_session_id), preferred_url, backend.url) + stats.record_session(preferred_url, backend.url) if model_id: messages = body.get("messages", []) diff --git a/src/llamacpp_ha/session_store.py b/src/llamacpp_ha/session_store.py index 864b245..4d90cea 100644 --- a/src/llamacpp_ha/session_store.py +++ b/src/llamacpp_ha/session_store.py @@ -62,7 +62,7 @@ class SessionStore: session.model_id = model_id if messages is not None: session.last_message_index = len(messages) - session.prefix_hash = compute_prefix_hash(messages) + session.prefix_hash = compute_prefix_hash([messages[-1]]) if preferred_backend is not None: session.preferred_backend = preferred_backend session.touch() @@ -105,7 +105,7 @@ class SessionStore: k = s.last_message_index if s.is_expired(self._ttl) or not s.preferred_backend or not s.prefix_hash or k == 0 or k > len(messages): return 0 - return k if compute_prefix_hash(messages[:k]) == s.prefix_hash else 0 + return k if compute_prefix_hash([messages[k - 1]]) == s.prefix_hash else 0 async def find_by_prefix(self, messages: list[dict]) -> str | None: """Return the preferred backend whose stored conversation is a prefix of messages.