arc53
diff --git a/‎application/agents/tool_executor.py‎
Lines changed: 32 additions & 7 deletions b/‎application/agents/tool_executor.py‎
Lines changed: 32 additions & 7 deletions
diff --git a/‎application/app.py‎
Lines changed: 48 additions & 0 deletions b/‎application/app.py‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎application/celery_init.py‎
Lines changed: 58 additions & 1 deletion b/‎application/celery_init.py‎
Lines changed: 58 additions & 1 deletion
diff --git a/‎application/core/log_context.py‎
Lines changed: 57 additions & 0 deletions b/‎application/core/log_context.py‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎application/core/logging_config.py‎
Lines changed: 50 additions & 0 deletions b/‎application/core/logging_config.py‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎application/llm/anthropic.py‎
Lines changed: 1 addition & 0 deletions b/‎application/llm/anthropic.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎application/llm/base.py‎
Lines changed: 20 additions & 0 deletions b/‎application/llm/base.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎application/llm/docsgpt_provider.py‎
Lines changed: 2 additions & 0 deletions b/‎application/llm/docsgpt_provider.py‎
Lines changed: 2 additions & 0 deletions
@@ -274,7 +274,14 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
 
         if tool_id is None or action_name is None:
             error_message = f"Error: Failed to parse LLM tool call. Tool name: {llm_name}"
-            logger.error(error_message)
+            logger.error(
+                "tool_call_parse_failed",
+                extra={
+                    "llm_class_name": llm_class_name,
+                    "llm_tool_name": llm_name,
+                    "call_id": call_id,
+                },
+            )
 
             tool_call_data = {
                 "tool_name": "unknown",
@@ -289,7 +296,15 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
 
         if tool_id not in tools_dict:
             error_message = f"Error: Tool ID '{tool_id}' extracted from LLM call not found in available tools_dict. Available IDs: {list(tools_dict.keys())}"
-            logger.error(error_message)
+            logger.error(
+                "tool_id_not_found",
+                extra={
+                    "tool_id": tool_id,
+                    "llm_tool_name": llm_name,
+                    "call_id": call_id,
+                    "available_tool_count": len(tools_dict),
+                },
+            )
 
             tool_call_data = {
                 "tool_name": "unknown",
@@ -356,7 +371,15 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
                 f"Failed to load tool '{tool_data.get('name')}' (tool_id key={tool_id}): "
                 "missing 'id' on tool row."
             )
-            logger.error(error_message)
+            logger.error(
+                "tool_load_failed",
+                extra={
+                    "tool_name": tool_data.get("name"),
+                    "tool_id": tool_id,
+                    "action_name": action_name,
+                    "call_id": call_id,
+                },
+            )
             tool_call_data["result"] = error_message
             yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
             self.tool_calls.append(tool_call_data)
@@ -451,10 +474,12 @@ def _get_or_load_tool(
             row_id = tool_data.get("id")
             if not row_id:
                 logger.error(
-                    "Tool data missing 'id' for tool name=%s (enumerate-key tool_id=%s); "
-                    "skipping load to avoid binding a non-UUID downstream.",
-                    tool_data.get("name"),
-                    tool_id,
+                    "tool_missing_row_id",
+                    extra={
+                        "tool_name": tool_data.get("name"),
+                        "tool_id": tool_id,
+                        "action_name": action_name,
+                    },
                 )
                 return None
             tool_config["tool_id"] = str(row_id)
 
@@ -9,6 +9,7 @@
 
 from application.auth import handle_auth
 
+from application.core import log_context
 from application.core.logging_config import setup_logging
 
 setup_logging()
@@ -112,6 +113,38 @@ def generate_token():
     return jsonify({"error": "Token generation not allowed in current auth mode"}), 400
 
 
+_LOG_CTX_TOKEN_ATTR = "_log_ctx_token"
+
+
+@app.before_request
+def _bind_log_context():
+    """Bind activity_id + endpoint for the duration of this request.
+
+    Runs before ``authenticate_request``; ``user_id`` is overlaid in a
+    follow-up handler once the JWT has been decoded.
+    """
+    if request.method == "OPTIONS":
+        return None
+    activity_id = str(uuid.uuid4())
+    request.activity_id = activity_id
+    token = log_context.bind(
+        activity_id=activity_id,
+        endpoint=request.endpoint,
+    )
+    setattr(request, _LOG_CTX_TOKEN_ATTR, token)
+    return None
+
+
+@app.teardown_request
+def _reset_log_context(_exc):
+    # SSE streams keep yielding after teardown fires, but a2wsgi runs each
+    # request inside ``copy_context().run(...)``, so this reset doesn't
+    # leak into the stream's view of the context.
+    token = getattr(request, _LOG_CTX_TOKEN_ATTR, None)
+    if token is not None:
+        log_context.reset(token)
+
+
 @app.before_request
 def enforce_stt_request_size_limits():
     if request.method == "OPTIONS":
@@ -148,6 +181,21 @@ def authenticate_request():
         request.decoded_token = decoded_token
 
 
+@app.before_request
+def _bind_user_id_to_log_context():
+    # Registered after ``authenticate_request`` (Flask runs before_request
+    # handlers in registration order), so ``request.decoded_token`` is
+    # populated by the time we read it. ``teardown_request`` unwinds the
+    # whole request-level bind, so no separate reset token is needed here.
+    if request.method == "OPTIONS":
+        return None
+    decoded_token = getattr(request, "decoded_token", None)
+    user_id = decoded_token.get("sub") if isinstance(decoded_token, dict) else None
+    if user_id:
+        log_context.bind(user_id=user_id)
+    return None
+
+
 @app.after_request
 def after_request(response: Response) -> Response:
     """Add CORS headers for the pure Flask development entrypoint."""
 
@@ -1,8 +1,17 @@
+import inspect
+import logging
 import threading
 
 from celery import Celery
+from application.core import log_context
 from application.core.settings import settings
-from celery.signals import setup_logging, worker_process_init, worker_ready
+from celery.signals import (
+    setup_logging,
+    task_postrun,
+    task_prerun,
+    worker_process_init,
+    worker_ready,
+)
 
 
 def make_celery(app_name=__name__):
@@ -41,6 +50,54 @@ def _dispose_db_engine_on_fork(*args, **kwargs):
     dispose_engine()
 
 
+# Most tasks in this repo accept ``user`` where the log context wants
+# ``user_id``; map task parameter names to context keys explicitly.
+_TASK_PARAM_TO_CTX_KEY: dict[str, str] = {
+    "user": "user_id",
+    "user_id": "user_id",
+    "agent_id": "agent_id",
+    "conversation_id": "conversation_id",
+}
+
+_task_log_tokens: dict[str, object] = {}
+
+
+@task_prerun.connect
+def _bind_task_log_context(task_id, task, args, kwargs, **_):
+    # Resolve task args by parameter name — nearly every task in this repo
+    # is called positionally, so ``kwargs.get('user')`` would bind nothing.
+    ctx = {"activity_id": task_id}
+    try:
+        sig = inspect.signature(task.run)
+        bound = sig.bind_partial(*args, **kwargs).arguments
+    except (TypeError, ValueError):
+        bound = dict(kwargs)
+    for param_name, value in bound.items():
+        ctx_key = _TASK_PARAM_TO_CTX_KEY.get(param_name)
+        if ctx_key and value:
+            ctx[ctx_key] = value
+    _task_log_tokens[task_id] = log_context.bind(**ctx)
+
+
+@task_postrun.connect
+def _unbind_task_log_context(task_id, **_):
+    # ``task_postrun`` fires on both success and failure. Required for
+    # Celery: unlike the Flask path, tasks aren't isolated in their own
+    # ``copy_context().run(...)``, so a missing reset would leak the
+    # bind onto the next task on the same worker.
+    token = _task_log_tokens.pop(task_id, None)
+    if token is None:
+        return
+    try:
+        log_context.reset(token)
+    except ValueError:
+        # task_prerun and task_postrun ran on different threads (non-default
+        # Celery pool); the token isn't valid in this context. Drop it.
+        logging.getLogger(__name__).debug(
+            "log_context reset skipped for task %s", task_id
+        )
+
+
 @worker_ready.connect
 def _run_version_check(*args, **kwargs):
     """Kick off the anonymous version check on worker startup.
 
@@ -0,0 +1,57 @@
+"""Per-activity logging context backed by ``contextvars``.
+
+The ``_ContextFilter`` installed by ``logging_config.setup_logging`` stamps
+every ``LogRecord`` emitted inside a ``bind`` block with the bound keys, so
+they land as first-class attributes on the OTLP log export rather than being
+buried inside formatted message bodies.
+
+A single ``ContextVar`` holds a dict so nested binds reset atomically (LIFO)
+via the token returned by ``bind``.
+"""
+
+from __future__ import annotations
+
+from contextvars import ContextVar, Token
+from typing import Mapping
+
+
+_CTX_KEYS: frozenset[str] = frozenset(
+    {
+        "activity_id",
+        "parent_activity_id",
+        "user_id",
+        "agent_id",
+        "conversation_id",
+        "endpoint",
+        "model",
+    }
+)
+
+_ctx: ContextVar[Mapping[str, str]] = ContextVar("log_ctx", default={})
+
+
+def bind(**kwargs: object) -> Token:
+    """Overlay the given keys onto the current context.
+
+    Returns a ``Token`` so the caller can ``reset`` in a ``finally`` block.
+    Keys outside :data:`_CTX_KEYS` are silently dropped (so a typo can't
+    stamp a stray field name onto every record), as are ``None`` values
+    (a missing attribute is more useful than the literal string ``"None"``).
+    """
+    overlay = {
+        k: str(v)
+        for k, v in kwargs.items()
+        if k in _CTX_KEYS and v is not None
+    }
+    new = {**_ctx.get(), **overlay}
+    return _ctx.set(new)
+
+
+def reset(token: Token) -> None:
+    """Restore the context to the snapshot captured by the matching ``bind``.
+
+    ``token`` is passed straight to the underlying ``ContextVar.reset``, so
+    any error that call raises propagates to the caller.
+    """
+    _ctx.reset(token)
+
+
+def snapshot() -> Mapping[str, str]:
+    """Return the current context dict. Treat as read-only; use :func:`bind`.
+
+    The returned mapping is the object currently held by the context
+    variable, not a copy.
+    """
+    return _ctx.get()
@@ -2,6 +2,36 @@
 import os
 from logging.config import dictConfig
 
+from application.core.log_context import snapshot as _ctx_snapshot
+
+
+# Loggers with ``propagate=False`` don't share root's handlers, so the
+# context filter has to be installed on their handlers directly.
+_NON_PROPAGATING_LOGGERS: tuple[str, ...] = (
+    "uvicorn",
+    "uvicorn.access",
+    "uvicorn.error",
+    "celery.app.trace",
+    "celery.worker.strategy",
+    "gunicorn.error",
+    "gunicorn.access",
+)
+
+
+class _ContextFilter(logging.Filter):
+    """Stamp the current ``log_context`` snapshot onto every ``LogRecord``.
+
+    Must be installed on **handlers**, not loggers: Python skips logger-level
+    filters when a child logger's record propagates up. The ``hasattr`` guard
+    keeps an explicit ``logger.info(..., extra={...})`` from being overwritten.
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        for key, value in _ctx_snapshot().items():
+            if not hasattr(record, key):
+                setattr(record, key, value)
+        return True
+
 
 def _otlp_logs_enabled() -> bool:
     """Return True when the user has opted in to OTLP log export.
@@ -60,3 +90,23 @@ def setup_logging() -> None:
         for handler in preserved_handlers:
             if handler not in root.handlers:
                 root.addHandler(handler)
+
+    _install_context_filter()
+
+
+def _install_context_filter() -> None:
+    """Attach :class:`_ContextFilter` to root's handlers + every handler on
+    the known non-propagating loggers. Skipping handlers that already carry
+    one keeps repeat ``setup_logging`` calls from stacking filters.
+    """
+
+    def _has_ctx_filter(handler: logging.Handler) -> bool:
+        return any(isinstance(f, _ContextFilter) for f in handler.filters)
+
+    for handler in logging.getLogger().handlers:
+        if not _has_ctx_filter(handler):
+            handler.addFilter(_ContextFilter())
+    for name in _NON_PROPAGATING_LOGGERS:
+        for handler in logging.getLogger(name).handlers:
+            if not _has_ctx_filter(handler):
+                handler.addFilter(_ContextFilter())
@@ -11,6 +11,7 @@
 
 
 class AnthropicLLM(BaseLLM):
+    provider_name = "anthropic"
 
     def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
 
 
@@ -1,5 +1,6 @@
 import logging
 from abc import ABC, abstractmethod
+from typing import ClassVar
 
 from application.cache import gen_cache, stream_cache
 
@@ -10,6 +11,10 @@
 
 
 class BaseLLM(ABC):
+    # Stamped onto the ``llm_stream_start`` event so dashboards can group
+    # calls by vendor. Subclasses override.
+    provider_name: ClassVar[str] = "unknown"
+
     def __init__(
         self,
         decoded_token=None,
@@ -206,6 +211,21 @@ def gen(self, model, messages, stream=False, tools=None, *args, **kwargs):
         )
 
     def gen_stream(self, model, messages, stream=True, tools=None, *args, **kwargs):
+        # Attachments arrive as ``_usage_attachments`` from ``Agent._llm_gen``;
+        # the ``stream_token_usage`` decorator pops that key, but the log
+        # fires before the decorator runs so it's still in ``kwargs`` here.
+        logging.info(
+            "llm_stream_start",
+            extra={
+                "model": model,
+                "provider": self.provider_name,
+                "message_count": len(messages) if messages is not None else 0,
+                "has_attachments": bool(
+                    kwargs.get("_usage_attachments") or kwargs.get("attachments")
+                ),
+                "has_tools": bool(tools),
+            },
+        )
         decorators = [stream_cache, stream_token_usage]
         return self._execute_with_fallback(
             "_raw_gen_stream",
 
@@ -6,6 +6,8 @@
 DOCSGPT_MODEL = "docsgpt"
 
 class DocsGPTAPILLM(OpenAILLM):
+    provider_name = "docsgpt"
+
     def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
         super().__init__(
             api_key=DOCSGPT_API_KEY,
Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@`
`11`	`11`
`12`	`12`
`13`	`13`	`class AnthropicLLM(BaseLLM):`
	`14`	`+ provider_name = "anthropic"`
`14`	`15`
`15`	`16`	`def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):`
`16`	`17`