Skip to content

Commit 82c71be

Browse files
committed
feat: better logging
1 parent 318de18 commit 82c71be

21 files changed

Lines changed: 724 additions & 28 deletions

application/agents/tool_executor.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,14 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
274274

275275
if tool_id is None or action_name is None:
276276
error_message = f"Error: Failed to parse LLM tool call. Tool name: {llm_name}"
277-
logger.error(error_message)
277+
logger.error(
278+
"tool_call_parse_failed",
279+
extra={
280+
"llm_class_name": llm_class_name,
281+
"llm_tool_name": llm_name,
282+
"call_id": call_id,
283+
},
284+
)
278285

279286
tool_call_data = {
280287
"tool_name": "unknown",
@@ -289,7 +296,15 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
289296

290297
if tool_id not in tools_dict:
291298
error_message = f"Error: Tool ID '{tool_id}' extracted from LLM call not found in available tools_dict. Available IDs: {list(tools_dict.keys())}"
292-
logger.error(error_message)
299+
logger.error(
300+
"tool_id_not_found",
301+
extra={
302+
"tool_id": tool_id,
303+
"llm_tool_name": llm_name,
304+
"call_id": call_id,
305+
"available_tool_count": len(tools_dict),
306+
},
307+
)
293308

294309
tool_call_data = {
295310
"tool_name": "unknown",
@@ -356,7 +371,15 @@ def execute(self, tools_dict: Dict, call, llm_class_name: str):
356371
f"Failed to load tool '{tool_data.get('name')}' (tool_id key={tool_id}): "
357372
"missing 'id' on tool row."
358373
)
359-
logger.error(error_message)
374+
logger.error(
375+
"tool_load_failed",
376+
extra={
377+
"tool_name": tool_data.get("name"),
378+
"tool_id": tool_id,
379+
"action_name": action_name,
380+
"call_id": call_id,
381+
},
382+
)
360383
tool_call_data["result"] = error_message
361384
yield {"type": "tool_call", "data": {**tool_call_data, "status": "error"}}
362385
self.tool_calls.append(tool_call_data)
@@ -451,10 +474,12 @@ def _get_or_load_tool(
451474
row_id = tool_data.get("id")
452475
if not row_id:
453476
logger.error(
454-
"Tool data missing 'id' for tool name=%s (enumerate-key tool_id=%s); "
455-
"skipping load to avoid binding a non-UUID downstream.",
456-
tool_data.get("name"),
457-
tool_id,
477+
"tool_missing_row_id",
478+
extra={
479+
"tool_name": tool_data.get("name"),
480+
"tool_id": tool_id,
481+
"action_name": action_name,
482+
},
458483
)
459484
return None
460485
tool_config["tool_id"] = str(row_id)

application/app.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from application.auth import handle_auth
1111

12+
from application.core import log_context
1213
from application.core.logging_config import setup_logging
1314

1415
setup_logging()
@@ -112,6 +113,38 @@ def generate_token():
112113
return jsonify({"error": "Token generation not allowed in current auth mode"}), 400
113114

114115

116+
_LOG_CTX_TOKEN_ATTR = "_log_ctx_token"
117+
118+
119+
@app.before_request
120+
def _bind_log_context():
121+
"""Bind activity_id + endpoint for the duration of this request.
122+
123+
Runs before ``authenticate_request``; ``user_id`` is overlaid in a
124+
follow-up handler once the JWT has been decoded.
125+
"""
126+
if request.method == "OPTIONS":
127+
return None
128+
activity_id = str(uuid.uuid4())
129+
request.activity_id = activity_id
130+
token = log_context.bind(
131+
activity_id=activity_id,
132+
endpoint=request.endpoint,
133+
)
134+
setattr(request, _LOG_CTX_TOKEN_ATTR, token)
135+
return None
136+
137+
138+
@app.teardown_request
139+
def _reset_log_context(_exc):
140+
# SSE streams keep yielding after teardown fires, but a2wsgi runs each
141+
# request inside ``copy_context().run(...)``, so this reset doesn't
142+
# leak into the stream's view of the context.
143+
token = getattr(request, _LOG_CTX_TOKEN_ATTR, None)
144+
if token is not None:
145+
log_context.reset(token)
146+
147+
115148
@app.before_request
116149
def enforce_stt_request_size_limits():
117150
if request.method == "OPTIONS":
@@ -148,6 +181,21 @@ def authenticate_request():
148181
request.decoded_token = decoded_token
149182

150183

184+
@app.before_request
185+
def _bind_user_id_to_log_context():
186+
# Registered after ``authenticate_request`` (Flask runs before_request
187+
# handlers in registration order), so ``request.decoded_token`` is
188+
# populated by the time we read it. ``teardown_request`` unwinds the
189+
# whole request-level bind, so no separate reset token is needed here.
190+
if request.method == "OPTIONS":
191+
return None
192+
decoded_token = getattr(request, "decoded_token", None)
193+
user_id = decoded_token.get("sub") if isinstance(decoded_token, dict) else None
194+
if user_id:
195+
log_context.bind(user_id=user_id)
196+
return None
197+
198+
151199
@app.after_request
152200
def after_request(response: Response) -> Response:
153201
"""Add CORS headers for the pure Flask development entrypoint."""

application/celery_init.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
1+
import inspect
2+
import logging
13
import threading
24

35
from celery import Celery
6+
from application.core import log_context
47
from application.core.settings import settings
5-
from celery.signals import setup_logging, worker_process_init, worker_ready
8+
from celery.signals import (
9+
setup_logging,
10+
task_postrun,
11+
task_prerun,
12+
worker_process_init,
13+
worker_ready,
14+
)
615

716

817
def make_celery(app_name=__name__):
@@ -41,6 +50,54 @@ def _dispose_db_engine_on_fork(*args, **kwargs):
4150
dispose_engine()
4251

4352

53+
# Most tasks in this repo accept ``user`` where the log context wants
54+
# ``user_id``; map task parameter names to context keys explicitly.
55+
_TASK_PARAM_TO_CTX_KEY: dict[str, str] = {
56+
"user": "user_id",
57+
"user_id": "user_id",
58+
"agent_id": "agent_id",
59+
"conversation_id": "conversation_id",
60+
}
61+
62+
_task_log_tokens: dict[str, object] = {}
63+
64+
65+
@task_prerun.connect
66+
def _bind_task_log_context(task_id, task, args, kwargs, **_):
67+
# Resolve task args by parameter name — nearly every task in this repo
68+
# is called positionally, so ``kwargs.get('user')`` would bind nothing.
69+
ctx = {"activity_id": task_id}
70+
try:
71+
sig = inspect.signature(task.run)
72+
bound = sig.bind_partial(*args, **kwargs).arguments
73+
except (TypeError, ValueError):
74+
bound = dict(kwargs)
75+
for param_name, value in bound.items():
76+
ctx_key = _TASK_PARAM_TO_CTX_KEY.get(param_name)
77+
if ctx_key and value:
78+
ctx[ctx_key] = value
79+
_task_log_tokens[task_id] = log_context.bind(**ctx)
80+
81+
82+
@task_postrun.connect
83+
def _unbind_task_log_context(task_id, **_):
84+
# ``task_postrun`` fires on both success and failure. Required for
85+
# Celery: unlike the Flask path, tasks aren't isolated in their own
86+
# ``copy_context().run(...)``, so a missing reset would leak the
87+
# bind onto the next task on the same worker.
88+
token = _task_log_tokens.pop(task_id, None)
89+
if token is None:
90+
return
91+
try:
92+
log_context.reset(token)
93+
except ValueError:
94+
# task_prerun and task_postrun ran on different threads (non-default
95+
# Celery pool); the token isn't valid in this context. Drop it.
96+
logging.getLogger(__name__).debug(
97+
"log_context reset skipped for task %s", task_id
98+
)
99+
100+
44101
@worker_ready.connect
45102
def _run_version_check(*args, **kwargs):
46103
"""Kick off the anonymous version check on worker startup.

application/core/log_context.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
"""Per-activity logging context backed by ``contextvars``.
2+
3+
The ``_ContextFilter`` installed by ``logging_config.setup_logging`` stamps
4+
every ``LogRecord`` emitted inside a ``bind`` block with the bound keys, so
5+
they land as first-class attributes on the OTLP log export rather than being
6+
buried inside formatted message bodies.
7+
8+
A single ``ContextVar`` holds a dict so nested binds reset atomically (LIFO)
9+
via the token returned by ``bind``.
10+
"""
11+
12+
from __future__ import annotations
13+
14+
from contextvars import ContextVar, Token
15+
from typing import Mapping
16+
17+
18+
_CTX_KEYS: frozenset[str] = frozenset(
19+
{
20+
"activity_id",
21+
"parent_activity_id",
22+
"user_id",
23+
"agent_id",
24+
"conversation_id",
25+
"endpoint",
26+
"model",
27+
}
28+
)
29+
30+
_ctx: ContextVar[Mapping[str, str]] = ContextVar("log_ctx", default={})
31+
32+
33+
def bind(**kwargs: object) -> Token:
34+
"""Overlay the given keys onto the current context.
35+
36+
Returns a ``Token`` so the caller can ``reset`` in a ``finally`` block.
37+
Keys outside :data:`_CTX_KEYS` are silently dropped (so a typo can't
38+
stamp a stray field name onto every record), as are ``None`` values
39+
(a missing attribute is more useful than the literal string ``"None"``).
40+
"""
41+
overlay = {
42+
k: str(v)
43+
for k, v in kwargs.items()
44+
if k in _CTX_KEYS and v is not None
45+
}
46+
new = {**_ctx.get(), **overlay}
47+
return _ctx.set(new)
48+
49+
50+
def reset(token: Token) -> None:
51+
"""Restore the context to the snapshot captured by the matching ``bind``."""
52+
_ctx.reset(token)
53+
54+
55+
def snapshot() -> Mapping[str, str]:
56+
"""Return the current context dict. Treat as read-only; use :func:`bind`."""
57+
return _ctx.get()

application/core/logging_config.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,36 @@
22
import os
33
from logging.config import dictConfig
44

5+
from application.core.log_context import snapshot as _ctx_snapshot
6+
7+
8+
# Loggers with ``propagate=False`` don't share root's handlers, so the
9+
# context filter has to be installed on their handlers directly.
10+
_NON_PROPAGATING_LOGGERS: tuple[str, ...] = (
11+
"uvicorn",
12+
"uvicorn.access",
13+
"uvicorn.error",
14+
"celery.app.trace",
15+
"celery.worker.strategy",
16+
"gunicorn.error",
17+
"gunicorn.access",
18+
)
19+
20+
21+
class _ContextFilter(logging.Filter):
22+
"""Stamp the current ``log_context`` snapshot onto every ``LogRecord``.
23+
24+
Must be installed on **handlers**, not loggers: Python skips logger-level
25+
filters when a child logger's record propagates up. The ``hasattr`` guard
26+
keeps an explicit ``logger.info(..., extra={...})`` from being overwritten.
27+
"""
28+
29+
def filter(self, record: logging.LogRecord) -> bool:
30+
for key, value in _ctx_snapshot().items():
31+
if not hasattr(record, key):
32+
setattr(record, key, value)
33+
return True
34+
535

636
def _otlp_logs_enabled() -> bool:
737
"""Return True when the user has opted in to OTLP log export.
@@ -60,3 +90,23 @@ def setup_logging() -> None:
6090
for handler in preserved_handlers:
6191
if handler not in root.handlers:
6292
root.addHandler(handler)
93+
94+
_install_context_filter()
95+
96+
97+
def _install_context_filter() -> None:
98+
"""Attach :class:`_ContextFilter` to root's handlers + every handler on
99+
the known non-propagating loggers. Skipping handlers that already carry
100+
one keeps repeat ``setup_logging`` calls from stacking filters.
101+
"""
102+
103+
def _has_ctx_filter(handler: logging.Handler) -> bool:
104+
return any(isinstance(f, _ContextFilter) for f in handler.filters)
105+
106+
for handler in logging.getLogger().handlers:
107+
if not _has_ctx_filter(handler):
108+
handler.addFilter(_ContextFilter())
109+
for name in _NON_PROPAGATING_LOGGERS:
110+
for handler in logging.getLogger(name).handlers:
111+
if not _has_ctx_filter(handler):
112+
handler.addFilter(_ContextFilter())

application/llm/anthropic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212

1313
class AnthropicLLM(BaseLLM):
14+
provider_name = "anthropic"
1415

1516
def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
1617

application/llm/base.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
from abc import ABC, abstractmethod
3+
from typing import ClassVar
34

45
from application.cache import gen_cache, stream_cache
56

@@ -10,6 +11,10 @@
1011

1112

1213
class BaseLLM(ABC):
14+
# Stamped onto the ``llm_stream_start`` event so dashboards can group
15+
# calls by vendor. Subclasses override.
16+
provider_name: ClassVar[str] = "unknown"
17+
1318
def __init__(
1419
self,
1520
decoded_token=None,
@@ -206,6 +211,21 @@ def gen(self, model, messages, stream=False, tools=None, *args, **kwargs):
206211
)
207212

208213
def gen_stream(self, model, messages, stream=True, tools=None, *args, **kwargs):
214+
# Attachments arrive as ``_usage_attachments`` from ``Agent._llm_gen``;
215+
# the ``stream_token_usage`` decorator pops that key, but the log
216+
# fires before the decorator runs so it's still in ``kwargs`` here.
217+
logging.info(
218+
"llm_stream_start",
219+
extra={
220+
"model": model,
221+
"provider": self.provider_name,
222+
"message_count": len(messages) if messages is not None else 0,
223+
"has_attachments": bool(
224+
kwargs.get("_usage_attachments") or kwargs.get("attachments")
225+
),
226+
"has_tools": bool(tools),
227+
},
228+
)
209229
decorators = [stream_cache, stream_token_usage]
210230
return self._execute_with_fallback(
211231
"_raw_gen_stream",

application/llm/docsgpt_provider.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
DOCSGPT_MODEL = "docsgpt"
77

88
class DocsGPTAPILLM(OpenAILLM):
9+
provider_name = "docsgpt"
10+
911
def __init__(self, api_key=None, user_api_key=None, base_url=None, *args, **kwargs):
1012
super().__init__(
1113
api_key=DOCSGPT_API_KEY,

0 commit comments

Comments
 (0)