
Commit 552bfe0

fix: better token counting and fixes cache

1 parent a6a5db6 · 9 files changed · 498 additions & 44 deletions

application/api/answer/services/compression/token_counter.py (32 additions & 2 deletions)
@@ -12,6 +12,12 @@
 class TokenCounter:
     """Centralized token counting for conversations and messages."""
 
+    # Per-image token estimate. Provider tokenizers vary widely
+    # (Gemini ~258, GPT-4o 85-1500, Claude ~1500) and the actual cost
+    # depends on resolution/detail we can't see here. Errs slightly high
+    # so the threshold check stays conservative.
+    _IMAGE_PART_TOKEN_ESTIMATE = 1500
+
     @staticmethod
     def count_message_tokens(messages: List[Dict]) -> int:
         """
@@ -29,12 +35,36 @@ def count_message_tokens(messages: List[Dict]) -> int:
             if isinstance(content, str):
                 total_tokens += num_tokens_from_string(content)
             elif isinstance(content, list):
-                # Handle structured content (tool calls, etc.)
+                # Handle structured content (tool calls, image parts, etc.)
                 for item in content:
                     if isinstance(item, dict):
-                        total_tokens += num_tokens_from_string(str(item))
+                        total_tokens += TokenCounter._count_content_part(item)
         return total_tokens
 
+    @staticmethod
+    def _count_content_part(item: Dict) -> int:
+        # Image/file attachments are billed by the provider per image,
+        # not proportional to the inline bytes/base64 string.
+        # ``str(item)`` on a 1MB image inflates the count by ~10000x,
+        # which trips spurious compression and overflows downstream
+        # input limits.
+        item_type = item.get("type")
+
+        if "files" in item:
+            files = item.get("files")
+            count = len(files) if isinstance(files, list) and files else 1
+            return TokenCounter._IMAGE_PART_TOKEN_ESTIMATE * count
+
+        if "image_url" in item or item_type in {
+            "image",
+            "image_url",
+            "input_image",
+            "file",
+        }:
+            return TokenCounter._IMAGE_PART_TOKEN_ESTIMATE
+
+        return num_tokens_from_string(str(item))
+
     @staticmethod
     def count_query_tokens(
         queries: List[Dict[str, Any]], include_tool_calls: bool = True
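To see the effect of the flat estimate, here is a minimal standalone sketch. The rough_tokens helper is a hypothetical 4-characters-per-token stand-in for the repo's num_tokens_from_string, not its actual implementation:

import base64

def rough_tokens(text: str) -> int:
    # Hypothetical stand-in for num_tokens_from_string (~4 chars per token).
    return max(1, len(text) // 4)

IMAGE_PART_TOKEN_ESTIMATE = 1500  # mirrors TokenCounter._IMAGE_PART_TOKEN_ESTIMATE

# A ~1MB image inlined as base64 inside a content part.
image_part = {
    "type": "image_url",
    "image_url": {
        "url": "data:image/png;base64,"
        + base64.b64encode(b"\x89PNG" + b"\x00" * 1_000_000).decode()
    },
}

# Old behavior: str(item) counts the base64 payload as if it were prompt text.
print(rough_tokens(str(image_part)))  # ~333,000 "tokens" for a single image
# New behavior: flat per-image estimate, regardless of payload size.
print(IMAGE_PART_TOKEN_ESTIMATE)      # 1500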

application/cache.py (10 additions & 1 deletion)
@@ -1,3 +1,4 @@
+import hashlib
 import json
 import logging
 import time
@@ -10,6 +11,14 @@
 
 logger = logging.getLogger(__name__)
 
+
+def _cache_default(value):
+    # Image attachments arrive inline as bytes (see GoogleLLM.prepare_messages_with_attachments);
+    # hash so the cache key stays bounded in size and stable across identical content.
+    if isinstance(value, (bytes, bytearray, memoryview)):
+        return f"<bytes:sha256:{hashlib.sha256(bytes(value)).hexdigest()}>"
+    return repr(value)
+
 _redis_instance = None
 _redis_creation_failed = False
 _instance_lock = Lock()
@@ -36,7 +45,7 @@ def get_redis_instance():
 def gen_cache_key(messages, model="docgpt", tools=None):
     if not all(isinstance(msg, dict) for msg in messages):
         raise ValueError("All messages must be dictionaries.")
-    messages_str = json.dumps(messages)
+    messages_str = json.dumps(messages, default=_cache_default)
     tools_str = json.dumps(str(tools)) if tools else ""
     combined = f"{model}_{messages_str}_{tools_str}"
     cache_key = get_hash(combined)
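A short sketch of what the hook buys, re-declaring _cache_default locally so the snippet runs standalone: without it, json.dumps raises TypeError on bytes; with it, identical bytes always serialize to the same short digest, so the derived key stays bounded and stable.

import hashlib
import json

def _cache_default(value):
    # Same logic as the hook above, re-declared so this sketch is self-contained.
    if isinstance(value, (bytes, bytearray, memoryview)):
        return f"<bytes:sha256:{hashlib.sha256(bytes(value)).hexdigest()}>"
    return repr(value)

messages = [
    {
        "role": "user",
        "content": [
            {"files": [{"file_bytes": b"\x89PNG" + b"\x00" * 100_000, "mime_type": "image/png"}]}
        ],
    }
]

key_a = json.dumps(messages, default=_cache_default)
key_b = json.dumps(messages, default=_cache_default)
assert key_a == key_b   # identical content -> identical key material
assert len(key_a) < 300  # 100KB of image bytes collapse to one 64-char digest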

application/llm/base.py (62 additions & 16 deletions)
@@ -166,7 +166,7 @@ def decorated_method():
 
         if is_stream:
             return self._stream_with_fallback(
-                decorated_method, method_name, *args, **kwargs
+                decorated_method, method_name, decorators, *args, **kwargs
             )
 
         try:
@@ -187,14 +187,27 @@ def decorated_method():
                 f"{fallback.model_id}. Error: {str(e)}"
             )
 
-            fallback_method = getattr(
-                fallback, method_name.replace("_raw_", "")
-            )
+            # Apply decorators to fallback's raw method directly — calling
+            # fallback.gen() would re-enter the orchestrator and recurse via
+            # fallback.fallback_llm.
+            fallback_method = getattr(fallback, method_name)
+            for decorator in decorators:
+                fallback_method = decorator(fallback_method)
             fallback_kwargs = {**kwargs, "model": fallback.model_id}
-            return fallback_method(*args, **fallback_kwargs)
+            try:
+                return fallback_method(fallback, *args, **fallback_kwargs)
+            except Exception as e2:
+                if self._is_non_retriable_client_error(e2):
+                    logger.error(
+                        f"Fallback LLM failed with non-retriable client "
+                        f"error; giving up: {str(e2)}"
+                    )
+                else:
+                    logger.error(f"Fallback LLM also failed; giving up: {str(e2)}")
+                raise
 
     def _stream_with_fallback(
-        self, decorated_method, method_name, *args, **kwargs
+        self, decorated_method, method_name, decorators, *args, **kwargs
     ):
         """
         Wrapper generator that catches mid-stream errors and falls back.
@@ -223,11 +236,37 @@ def _stream_with_fallback(
                 f"Primary LLM failed mid-stream. Falling back to "
                 f"{fallback.model_id}. Error: {str(e)}"
             )
-            fallback_method = getattr(
-                fallback, method_name.replace("_raw_", "")
+            # Apply decorators to fallback's raw stream method directly —
+            # calling fallback.gen_stream() would re-enter the orchestrator
+            # and recurse via fallback.fallback_llm. Emit the stream-start
+            # event manually so dashboards still see the fallback's
+            # provider/model when the response actually comes from it.
+            fallback._emit_stream_start_log(
+                fallback.model_id,
+                kwargs.get("messages"),
+                kwargs.get("tools"),
+                bool(
+                    kwargs.get("_usage_attachments")
+                    or kwargs.get("attachments")
+                ),
             )
+            fallback_method = getattr(fallback, method_name)
+            for decorator in decorators:
+                fallback_method = decorator(fallback_method)
             fallback_kwargs = {**kwargs, "model": fallback.model_id}
-            yield from fallback_method(*args, **fallback_kwargs)
+            try:
+                yield from fallback_method(fallback, *args, **fallback_kwargs)
+            except Exception as e2:
+                if self._is_non_retriable_client_error(e2):
+                    logger.error(
+                        f"Fallback LLM failed mid-stream with non-retriable "
+                        f"client error; giving up: {str(e2)}"
+                    )
+                else:
+                    logger.error(
+                        f"Fallback LLM also failed mid-stream; giving up: {str(e2)}"
+                    )
+                raise
 
     def gen(self, model, messages, stream=False, tools=None, *args, **kwargs):
         decorators = [gen_token_usage, gen_cache]
@@ -242,22 +281,29 @@ def gen(self, model, messages, stream=False, tools=None, *args, **kwargs):
             **kwargs,
         )
 
-    def gen_stream(self, model, messages, stream=True, tools=None, *args, **kwargs):
-        # Attachments arrive as ``_usage_attachments`` from ``Agent._llm_gen``;
-        # the ``stream_token_usage`` decorator pops that key, but the log
-        # fires before the decorator runs so it's still in ``kwargs`` here.
+    def _emit_stream_start_log(self, model, messages, tools, has_attachments):
+        # Stamped with ``self.provider_name`` so dashboards can group calls
+        # by vendor; the fallback path emits its own copy on the fallback
+        # instance so the actual responding provider is recorded.
         logging.info(
             "llm_stream_start",
             extra={
                 "model": model,
                 "provider": self.provider_name,
                 "message_count": len(messages) if messages is not None else 0,
-                "has_attachments": bool(
-                    kwargs.get("_usage_attachments") or kwargs.get("attachments")
-                ),
+                "has_attachments": bool(has_attachments),
                 "has_tools": bool(tools),
             },
         )
+
+    def gen_stream(self, model, messages, stream=True, tools=None, *args, **kwargs):
+        # Attachments arrive as ``_usage_attachments`` from ``Agent._llm_gen``;
+        # the ``stream_token_usage`` decorator pops that key, but the log
+        # fires before the decorator runs so it's still in ``kwargs`` here.
+        has_attachments = bool(
+            kwargs.get("_usage_attachments") or kwargs.get("attachments")
+        )
+        self._emit_stream_start_log(model, messages, tools, has_attachments)
         decorators = [stream_cache, stream_token_usage]
         return self._execute_with_fallback(
             "_raw_gen_stream",

application/llm/handlers/base.py (20 additions & 1 deletion)
@@ -280,7 +280,26 @@ def _extract_text_from_content(self, content: Any) -> str:
                     # Keep serialized function calls/responses so the compressor sees actions
                     parts_text.append(str(item))
                 elif "files" in item:
-                    parts_text.append(str(item))
+                    # Image attachments arrive with raw bytes / base64
+                    # inline (see GoogleLLM.prepare_messages_with_attachments).
+                    # ``str(item)`` would dump the whole byte/base64
+                    # blob into the compression prompt and bust the
+                    # compression LLM's input limit.
+                    files = item.get("files") or []
+                    descriptors = []
+                    if isinstance(files, list):
+                        for f in files:
+                            if isinstance(f, dict):
+                                descriptors.append(
+                                    f.get("mime_type") or "file"
+                                )
+                            elif isinstance(f, str):
+                                descriptors.append(f)
+                    if not descriptors:
+                        descriptors = ["file"]
+                    parts_text.append(
+                        f"[attachment: {', '.join(descriptors)}]"
+                    )
             return "\n".join(parts_text)
         return ""
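For reference, the descriptor logic in isolation. This is a standalone re-implementation for illustration only, showing the three input shapes it handles: plain path strings, dicts carrying a mime_type, and anything else degrading to a generic "file" placeholder. The tests below exercise the same behavior through the handler itself.

def describe_files_part(item: dict) -> str:
    # Mirrors the branch above: summarize attachments instead of str()-ing them.
    files = item.get("files") or []
    descriptors = []
    if isinstance(files, list):
        for f in files:
            if isinstance(f, dict):
                descriptors.append(f.get("mime_type") or "file")
            elif isinstance(f, str):
                descriptors.append(f)
    if not descriptors:
        descriptors = ["file"]
    return f"[attachment: {', '.join(descriptors)}]"

print(describe_files_part({"files": ["/tmp/a.txt"]}))  # [attachment: /tmp/a.txt]
print(describe_files_part({"files": [{"file_bytes": b"...", "mime_type": "image/png"}]}))  # [attachment: image/png]
print(describe_files_part({"files": None}))  # [attachment: file]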

tests/llm/handlers/test_llm_handlers.py (32 additions & 1 deletion)
@@ -360,7 +360,38 @@ def test_list_with_files(self):
         handler = ConcreteHandler()
         content = [{"files": ["/tmp/a.txt"]}]
         result = handler._extract_text_from_content(content)
-        assert "files" in result
+        assert result == "[attachment: /tmp/a.txt]"
+
+    def test_list_with_inline_image_bytes(self):
+        # Google attaches images as inline bytes; stringifying them into
+        # the compression prompt would bust the compression LLM's input
+        # limit. The placeholder must describe the attachment without
+        # embedding the bytes.
+        handler = ConcreteHandler()
+        content = [
+            {
+                "files": [
+                    {"file_bytes": b"\x89PNG" + b"\x00" * 1000, "mime_type": "image/png"}
+                ]
+            }
+        ]
+        result = handler._extract_text_from_content(content)
+        assert result == "[attachment: image/png]"
+        assert "PNG" not in result
+        assert "\\x" not in result
+
+    def test_list_with_multiple_files(self):
+        handler = ConcreteHandler()
+        content = [
+            {
+                "files": [
+                    {"file_bytes": b"a", "mime_type": "image/png"},
+                    {"file_uri": "https://x", "mime_type": "image/jpeg"},
+                ]
+            }
+        ]
+        result = handler._extract_text_from_content(content)
+        assert result == "[attachment: image/png, image/jpeg]"
 
     def test_list_with_none_text(self):
         handler = ConcreteHandler()

tests/llm/test_base.py (6 additions & 11 deletions)
@@ -49,6 +49,9 @@ def _raw_gen_stream(self, baseself, model, messages, stream=True, tools=None, **
 
 
 class FallbackLLM(BaseLLM):
+    # _execute_with_fallback applies decorators to the fallback's raw method
+    # directly and never calls .gen() / .gen_stream() on it, so
+    # tracking lives on the raw methods.
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.gen_called = False
@@ -62,14 +65,6 @@ def _raw_gen_stream(self, baseself, model, messages, stream=True, tools=None, **
         self.gen_stream_called = True
         yield "fallback_chunk"
 
-    def gen(self, *args, **kwargs):
-        self.gen_called = True
-        return "fallback_gen_result"
-
-    def gen_stream(self, *args, **kwargs):
-        self.gen_stream_called = True
-        yield "fallback_stream_chunk"
-
 
 # ---------------------------------------------------------------------------
 # gen / gen_stream decorator application
@@ -230,7 +225,7 @@ def test_fallback_called_on_failure(self):
         llm._fallback_llm = fallback
 
         result = llm.gen(model="m", messages=[])
-        assert result == "fallback_gen_result"
+        assert result == "fallback_result"
         assert fallback.gen_called
 
 
@@ -257,7 +252,7 @@ def test_fallback_called_on_stream_failure(self):
         llm._fallback_llm = fallback
 
         result = list(llm.gen_stream(model="m", messages=[]))
-        assert "fallback_stream_chunk" in result
+        assert "fallback_chunk" in result
         assert fallback.gen_stream_called
 
 
@@ -344,7 +339,7 @@ def test_5xx_still_falls_back(self):
         llm._fallback_llm = fallback
 
         result = llm.gen(model="m", messages=[])
-        assert result == "fallback_gen_result"
+        assert result == "fallback_result"
         assert fallback.gen_called