From f864566faed6be7a1ed54cd23c0f44033021100c Mon Sep 17 00:00:00 2001
From: Anton Pirker
Date: Wed, 9 Jul 2025 16:43:48 +0200
Subject: [PATCH 1/4] Updated recording of token usage

---
 sentry_sdk/ai/monitoring.py                | 42 ++++++----
 sentry_sdk/integrations/anthropic.py       | 15 +++-
 sentry_sdk/integrations/cohere.py          | 10 +--
 sentry_sdk/integrations/huggingface_hub.py | 10 ++-
 sentry_sdk/integrations/langchain.py       | 10 +--
 sentry_sdk/integrations/openai.py          | 91 ++++++++++++++--------
 tests/integrations/openai/test_openai.py   | 67 ++++++++++++----
 7 files changed, 169 insertions(+), 76 deletions(-)

diff --git a/sentry_sdk/ai/monitoring.py b/sentry_sdk/ai/monitoring.py
index ed33acd0f1..07fbb0574e 100644
--- a/sentry_sdk/ai/monitoring.py
+++ b/sentry_sdk/ai/monitoring.py
@@ -96,21 +96,37 @@ async def async_wrapped(*args, **kwargs):
 
 
 def record_token_usage(
-    span, prompt_tokens=None, completion_tokens=None, total_tokens=None
+    span,
+    input_tokens=None,
+    input_tokens_cached=None,
+    output_tokens=None,
+    output_tokens_reasoning=None,
+    total_tokens=None,
 ):
-    # type: (Span, Optional[int], Optional[int], Optional[int]) -> None
+    # type: (Span, Optional[int], Optional[int], Optional[int], Optional[int], Optional[int]) -> None
+
+    # TODO: move pipeline name elsewhere
     ai_pipeline_name = get_ai_pipeline_name()
     if ai_pipeline_name:
         span.set_data(SPANDATA.AI_PIPELINE_NAME, ai_pipeline_name)
-    if prompt_tokens is not None:
-        span.set_measurement("ai_prompt_tokens_used", value=prompt_tokens)
-    if completion_tokens is not None:
-        span.set_measurement("ai_completion_tokens_used", value=completion_tokens)
-    if (
-        total_tokens is None
-        and prompt_tokens is not None
-        and completion_tokens is not None
-    ):
-        total_tokens = prompt_tokens + completion_tokens
+
+    if input_tokens is not None:
+        span.set_data(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS, input_tokens)
+
+    if input_tokens_cached is not None:
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_INPUT_TOKENS_CACHED,
+            input_tokens_cached,
+        )
+
+    if output_tokens is not None:
+        span.set_data(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens)
+
+    if output_tokens_reasoning is not None:
+        span.set_data(
+            SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS_REASONING,
+            output_tokens_reasoning,
+        )
+
     if total_tokens is not None:
-        span.set_measurement("ai_total_tokens_used", total_tokens)
+        span.set_data(SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, total_tokens)
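For orientation, a minimal sketch of the reworked helper's call shape. The token values are made up and the span is a test double; the `gen_ai.usage.input_tokens` / `gen_ai.usage.output_tokens` / `gen_ai.usage.total_tokens` key names are the ones the updated tests later in this series assert on, while the exact string spellings behind the cached/reasoning constants are not visible in this diff:

```python
from unittest import mock

from sentry_sdk.ai.monitoring import record_token_usage

span = mock.MagicMock()  # stands in for a real Sentry span

record_token_usage(
    span,
    input_tokens=20,
    input_tokens_cached=5,      # e.g. prompt-cache hits (illustrative value)
    output_tokens=10,
    output_tokens_reasoning=4,  # e.g. reasoning-model "thinking" tokens (illustrative value)
    total_tokens=30,
)

# Counts now land in span data instead of the old ai_*_tokens_used measurements.
span.set_data.assert_any_call("gen_ai.usage.input_tokens", 20)
span.set_data.assert_any_call("gen_ai.usage.total_tokens", 30)
```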
diff --git a/sentry_sdk/integrations/anthropic.py b/sentry_sdk/integrations/anthropic.py
index 76a3bb9f13..1e1f9112a1 100644
--- a/sentry_sdk/integrations/anthropic.py
+++ b/sentry_sdk/integrations/anthropic.py
@@ -65,7 +65,13 @@ def _calculate_token_usage(result, span):
         output_tokens = usage.output_tokens
 
     total_tokens = input_tokens + output_tokens
-    record_token_usage(span, input_tokens, output_tokens, total_tokens)
+
+    record_token_usage(
+        span,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        total_tokens=total_tokens,
+    )
 
 
 def _get_responses(content):
@@ -126,7 +132,12 @@ def _add_ai_data_to_span(
                 [{"type": "text", "text": complete_message}],
             )
         total_tokens = input_tokens + output_tokens
-        record_token_usage(span, input_tokens, output_tokens, total_tokens)
+        record_token_usage(
+            span,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            total_tokens=total_tokens,
+        )
 
         span.set_data(SPANDATA.AI_STREAMING, True)

diff --git a/sentry_sdk/integrations/cohere.py b/sentry_sdk/integrations/cohere.py
index 433b285bf0..57ffdb908a 100644
--- a/sentry_sdk/integrations/cohere.py
+++ b/sentry_sdk/integrations/cohere.py
@@ -116,14 +116,14 @@ def collect_chat_response_fields(span, res, include_pii):
         if hasattr(res.meta, "billed_units"):
             record_token_usage(
                 span,
-                prompt_tokens=res.meta.billed_units.input_tokens,
-                completion_tokens=res.meta.billed_units.output_tokens,
+                input_tokens=res.meta.billed_units.input_tokens,
+                output_tokens=res.meta.billed_units.output_tokens,
             )
         elif hasattr(res.meta, "tokens"):
             record_token_usage(
                 span,
-                prompt_tokens=res.meta.tokens.input_tokens,
-                completion_tokens=res.meta.tokens.output_tokens,
+                input_tokens=res.meta.tokens.input_tokens,
+                output_tokens=res.meta.tokens.output_tokens,
             )
 
         if hasattr(res.meta, "warnings"):
@@ -262,7 +262,7 @@ def new_embed(*args, **kwargs):
             ):
                 record_token_usage(
                     span,
-                    prompt_tokens=res.meta.billed_units.input_tokens,
+                    input_tokens=res.meta.billed_units.input_tokens,
                     total_tokens=res.meta.billed_units.input_tokens,
                 )
             return res

diff --git a/sentry_sdk/integrations/huggingface_hub.py b/sentry_sdk/integrations/huggingface_hub.py
index dfac77e996..2dfcb5925a 100644
--- a/sentry_sdk/integrations/huggingface_hub.py
+++ b/sentry_sdk/integrations/huggingface_hub.py
@@ -111,7 +111,10 @@ def new_text_generation(*args, **kwargs):
                     [res.generated_text],
                 )
             if res.details is not None and res.details.generated_tokens > 0:
-                record_token_usage(span, total_tokens=res.details.generated_tokens)
+                record_token_usage(
+                    span,
+                    total_tokens=res.details.generated_tokens,
+                )
             span.__exit__(None, None, None)
             return res
 
@@ -145,7 +148,10 @@ def new_details_iterator():
                             span, SPANDATA.AI_RESPONSES, "".join(data_buf)
                         )
                     if tokens_used > 0:
-                        record_token_usage(span, total_tokens=tokens_used)
+                        record_token_usage(
+                            span,
+                            total_tokens=tokens_used,
+                        )
                     span.__exit__(None, None, None)
 
             return new_details_iterator()

diff --git a/sentry_sdk/integrations/langchain.py b/sentry_sdk/integrations/langchain.py
index 0b8bbd8049..a55cae2f3c 100644
--- a/sentry_sdk/integrations/langchain.py
+++ b/sentry_sdk/integrations/langchain.py
@@ -278,15 +278,15 @@ def on_llm_end(self, response, *, run_id, **kwargs):
 
             if token_usage:
                 record_token_usage(
                     span_data.span,
-                    token_usage.get("prompt_tokens"),
-                    token_usage.get("completion_tokens"),
-                    token_usage.get("total_tokens"),
+                    input_tokens=token_usage.get("prompt_tokens"),
+                    output_tokens=token_usage.get("completion_tokens"),
+                    total_tokens=token_usage.get("total_tokens"),
                 )
             else:
                 record_token_usage(
                     span_data.span,
-                    span_data.num_prompt_tokens,
-                    span_data.num_completion_tokens,
+                    input_tokens=span_data.num_prompt_tokens,
+                    output_tokens=span_data.num_completion_tokens,
                 )
 
             self._exit_span(span_data, run_id)
diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index e95753f6e1..569e04a211 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -70,48 +70,75 @@ def _capture_exception(exc):
     sentry_sdk.capture_event(event, hint=hint)
 
 
-def _calculate_chat_completion_usage(
+def _get_usage(usage, names):
+    # type: (Any, List[str]) -> int
+    for name in names:
+        if hasattr(usage, name) and isinstance(getattr(usage, name), int):
+            return getattr(usage, name)
+    return 0
+
+
+def _calculate_token_usage(
     messages, response, span, streaming_message_responses, count_tokens
 ):
     # type: (Iterable[ChatCompletionMessageParam], Any, Span, Optional[List[str]], Callable[..., Any]) -> None
-    completion_tokens = 0  # type: Optional[int]
-    prompt_tokens = 0  # type: Optional[int]
+    input_tokens = 0  # type: Optional[int]
+    input_tokens_cached = 0  # type: Optional[int]
+    output_tokens = 0  # type: Optional[int]
+    output_tokens_reasoning = 0  # type: Optional[int]
     total_tokens = 0  # type: Optional[int]
+
     if hasattr(response, "usage"):
-        if hasattr(response.usage, "completion_tokens") and isinstance(
-            response.usage.completion_tokens, int
-        ):
-            completion_tokens = response.usage.completion_tokens
-        if hasattr(response.usage, "prompt_tokens") and isinstance(
-            response.usage.prompt_tokens, int
-        ):
-            prompt_tokens = response.usage.prompt_tokens
-        if hasattr(response.usage, "total_tokens") and isinstance(
-            response.usage.total_tokens, int
-        ):
-            total_tokens = response.usage.total_tokens
+        input_tokens = _get_usage(response.usage, ["input_tokens", "prompt_tokens"])
+        if hasattr(response.usage, "input_tokens_details"):
+            input_tokens_cached = _get_usage(
+                response.usage.input_tokens_details, ["cached_tokens"]
+            )
 
-    if prompt_tokens == 0:
+        output_tokens = _get_usage(
+            response.usage, ["output_tokens", "completion_tokens"]
+        )
+        if hasattr(response.usage, "output_tokens_details"):
+            output_tokens_reasoning = _get_usage(
+                response.usage.output_tokens_details, ["reasoning_tokens"]
+            )
+
+        total_tokens = _get_usage(response.usage, ["total_tokens"])
+
+    # Manually count tokens
+    # TODO: check for responses API
+    if input_tokens == 0:
         for message in messages:
             if "content" in message:
-                prompt_tokens += count_tokens(message["content"])
+                input_tokens += count_tokens(message["content"])
 
-    if completion_tokens == 0:
+    # TODO: check for responses API
+    if output_tokens == 0:
         if streaming_message_responses is not None:
             for message in streaming_message_responses:
-                completion_tokens += count_tokens(message)
+                output_tokens += count_tokens(message)
         elif hasattr(response, "choices"):
             for choice in response.choices:
                 if hasattr(choice, "message"):
-                    completion_tokens += count_tokens(choice.message)
-
-    if prompt_tokens == 0:
-        prompt_tokens = None
-    if completion_tokens == 0:
-        completion_tokens = None
-    if total_tokens == 0:
-        total_tokens = None
-    record_token_usage(span, prompt_tokens, completion_tokens, total_tokens)
+                    output_tokens += count_tokens(choice.message)
+
+    # Do not set token data if it is 0
+    input_tokens = None if input_tokens == 0 else input_tokens
+    input_tokens_cached = None if input_tokens_cached == 0 else input_tokens_cached
+    output_tokens = None if output_tokens == 0 else output_tokens
+    output_tokens_reasoning = (
+        None if output_tokens_reasoning == 0 else output_tokens_reasoning
+    )
+    total_tokens = None if total_tokens == 0 else total_tokens
+
+    record_token_usage(
+        span,
+        input_tokens=input_tokens,
+        input_tokens_cached=input_tokens_cached,
+        output_tokens=output_tokens,
+        output_tokens_reasoning=output_tokens_reasoning,
+        total_tokens=total_tokens,
+    )
 
 
 def _new_chat_completion_common(f, *args, **kwargs):
@@ -158,9 +185,7 @@ def _new_chat_completion_common(f, *args, **kwargs):
                     SPANDATA.AI_RESPONSES,
                     list(map(lambda x: x.message, res.choices)),
                 )
-            _calculate_chat_completion_usage(
-                messages, res, span, None, integration.count_tokens
-            )
+            _calculate_token_usage(messages, res, span, None, integration.count_tokens)
             span.__exit__(None, None, None)
         elif hasattr(res, "_iterator"):
             data_buf: list[list[str]] = []  # one for each choice
@@ -191,7 +216,7 @@ def new_iterator():
                             set_data_normalized(
                                 span, SPANDATA.AI_RESPONSES, all_responses
                             )
-                        _calculate_chat_completion_usage(
+                        _calculate_token_usage(
                             messages,
                             res,
                             span,
@@ -224,7 +249,7 @@ async def new_iterator_async():
                             set_data_normalized(
                                 span, SPANDATA.AI_RESPONSES, all_responses
                             )
-                        _calculate_chat_completion_usage(
+                        _calculate_token_usage(
                             messages,
                             res,
                             span,
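To make the new helper concrete: `_get_usage` probes the usage object for the first attribute in `names` that exists and is an `int`, so one code path covers both Chat Completions-style usage (`prompt_tokens`/`completion_tokens`) and Responses-API-style usage (`input_tokens`/`output_tokens`). A self-contained sketch, with `SimpleNamespace` stand-ins for the real OpenAI usage classes (the definition is copied from the hunk above to keep the demo runnable):

```python
from types import SimpleNamespace


def _get_usage(usage, names):
    # Return the first listed attribute that exists and is an int, else 0.
    for name in names:
        if hasattr(usage, name) and isinstance(getattr(usage, name), int):
            return getattr(usage, name)
    return 0


# Chat Completions-style usage exposes prompt_tokens/completion_tokens.
chat_usage = SimpleNamespace(prompt_tokens=20, completion_tokens=10, total_tokens=30)
# Responses-API-style usage exposes input_tokens/output_tokens.
responses_usage = SimpleNamespace(input_tokens=20, output_tokens=10, total_tokens=30)

assert _get_usage(chat_usage, ["input_tokens", "prompt_tokens"]) == 20
assert _get_usage(responses_usage, ["input_tokens", "prompt_tokens"]) == 20
assert _get_usage(chat_usage, ["output_tokens", "completion_tokens"]) == 10
assert _get_usage(SimpleNamespace(), ["total_tokens"]) == 0  # nothing found: falls back to 0
```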
diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py
index 3fdc138f39..c2360717e2 100644
--- a/tests/integrations/openai/test_openai.py
+++ b/tests/integrations/openai/test_openai.py
@@ -10,7 +10,7 @@
 from sentry_sdk.consts import SPANDATA
 from sentry_sdk.integrations.openai import (
     OpenAIIntegration,
-    _calculate_chat_completion_usage,
+    _calculate_token_usage,
 )
 
 from unittest import mock  # python 3.3 and above
@@ -743,7 +743,7 @@ async def test_span_origin_embeddings_async(sentry_init, capture_events):
     assert event["spans"][0]["origin"] == "auto.ai.openai"
 
 
-def test_calculate_chat_completion_usage_a():
+def test_calculate_token_usage_a():
     span = mock.MagicMock()
 
     def count_tokens(msg):
@@ -760,13 +760,20 @@ def count_tokens(msg):
     with mock.patch(
         "sentry_sdk.integrations.openai.record_token_usage"
     ) as mock_record_token_usage:
-        _calculate_chat_completion_usage(
+        _calculate_token_usage(
             messages, response, span, streaming_message_responses, count_tokens
         )
-        mock_record_token_usage.assert_called_once_with(span, 20, 10, 30)
+        mock_record_token_usage.assert_called_once_with(
+            span,
+            input_tokens=20,
+            input_tokens_cached=None,
+            output_tokens=10,
+            output_tokens_reasoning=None,
+            total_tokens=30,
+        )
 
 
-def test_calculate_chat_completion_usage_b():
+def test_calculate_token_usage_b():
     span = mock.MagicMock()
 
     def count_tokens(msg):
@@ -786,13 +793,20 @@ def count_tokens(msg):
     with mock.patch(
         "sentry_sdk.integrations.openai.record_token_usage"
     ) as mock_record_token_usage:
-        _calculate_chat_completion_usage(
+        _calculate_token_usage(
             messages, response, span, streaming_message_responses, count_tokens
         )
-        mock_record_token_usage.assert_called_once_with(span, 11, 10, 10)
+        mock_record_token_usage.assert_called_once_with(
+            span,
+            input_tokens=11,
+            input_tokens_cached=None,
+            output_tokens=10,
+            output_tokens_reasoning=None,
+            total_tokens=10,
+        )
 
 
-def test_calculate_chat_completion_usage_c():
+def test_calculate_token_usage_c():
     span = mock.MagicMock()
 
     def count_tokens(msg):
@@ -812,13 +826,20 @@ def count_tokens(msg):
     with mock.patch(
         "sentry_sdk.integrations.openai.record_token_usage"
     ) as mock_record_token_usage:
-        _calculate_chat_completion_usage(
+        _calculate_token_usage(
             messages, response, span, streaming_message_responses, count_tokens
         )
-        mock_record_token_usage.assert_called_once_with(span, 20, 11, 20)
+        mock_record_token_usage.assert_called_once_with(
+            span,
+            input_tokens=20,
+            input_tokens_cached=None,
+            output_tokens=11,
+            output_tokens_reasoning=None,
+            total_tokens=20,
+        )
 
 
-def test_calculate_chat_completion_usage_d():
+def test_calculate_token_usage_d():
     span = mock.MagicMock()
 
     def count_tokens(msg):
@@ -839,13 +860,20 @@ def count_tokens(msg):
     with mock.patch(
         "sentry_sdk.integrations.openai.record_token_usage"
     ) as mock_record_token_usage:
-        _calculate_chat_completion_usage(
+        _calculate_token_usage(
             messages, response, span, streaming_message_responses, count_tokens
         )
-        mock_record_token_usage.assert_called_once_with(span, 20, None, 20)
+        mock_record_token_usage.assert_called_once_with(
+            span,
+            input_tokens=20,
+            input_tokens_cached=None,
+            output_tokens=None,
+            output_tokens_reasoning=None,
+            total_tokens=20,
+        )
 
 
-def test_calculate_chat_completion_usage_e():
+def test_calculate_token_usage_e():
     span = mock.MagicMock()
 
     def count_tokens(msg):
@@ -858,7 +886,14 @@ def count_tokens(msg):
     with mock.patch(
         "sentry_sdk.integrations.openai.record_token_usage"
     ) as mock_record_token_usage:
-        _calculate_chat_completion_usage(
+        _calculate_token_usage(
             messages, response, span, streaming_message_responses, count_tokens
         )
-        mock_record_token_usage.assert_called_once_with(span, None, None, None)
+        mock_record_token_usage.assert_called_once_with(
+            span,
+            input_tokens=None,
+            input_tokens_cached=None,
+            output_tokens=None,
+            output_tokens_reasoning=None,
+            total_tokens=None,
+        )
From 1fe97c903d9916256b500a7ff09cad93e177292a Mon Sep 17 00:00:00 2001
From: Anton Pirker
Date: Thu, 10 Jul 2025 09:10:54 +0200
Subject: [PATCH 2/4] updated tests

---
 sentry_sdk/ai/monitoring.py              |  3 +++
 sentry_sdk/integrations/openai.py        | 14 +++++++----
 tests/integrations/openai/test_openai.py | 32 ++++++++++++------------
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/sentry_sdk/ai/monitoring.py b/sentry_sdk/ai/monitoring.py
index 07fbb0574e..461fd6af85 100644
--- a/sentry_sdk/ai/monitoring.py
+++ b/sentry_sdk/ai/monitoring.py
@@ -128,5 +128,8 @@ def record_token_usage(
             output_tokens_reasoning,
         )
 
+    if total_tokens is None and input_tokens is not None and output_tokens is not None:
+        total_tokens = input_tokens + output_tokens
+
     if total_tokens is not None:
         span.set_data(SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS, total_tokens)
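The three lines added above give `record_token_usage` a fallback: when the caller supplies input and output counts but no total, the total is derived as their sum. A quick illustration with a mock span (hypothetical values; the key name matches what the updated tests assert):

```python
from unittest import mock

from sentry_sdk.ai.monitoring import record_token_usage

span = mock.MagicMock()

# total_tokens omitted: the helper now derives it as input + output.
record_token_usage(span, input_tokens=20, output_tokens=10)

# Assuming SPANDATA.GEN_AI_USAGE_TOTAL_TOKENS == "gen_ai.usage.total_tokens",
# the derived total of 30 is recorded on the span.
span.set_data.assert_any_call("gen_ai.usage.total_tokens", 30)
```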
diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index 569e04a211..34c660b9fb 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -366,22 +366,26 @@ def _new_embeddings_create_common(f, *args, **kwargs):
 
         response = yield f, args, kwargs
 
-        prompt_tokens = 0
+        input_tokens = 0
         total_tokens = 0
+
         if hasattr(response, "usage"):
             if hasattr(response.usage, "prompt_tokens") and isinstance(
                 response.usage.prompt_tokens, int
             ):
-                prompt_tokens = response.usage.prompt_tokens
+                input_tokens = response.usage.prompt_tokens
             if hasattr(response.usage, "total_tokens") and isinstance(
                 response.usage.total_tokens, int
             ):
                 total_tokens = response.usage.total_tokens
 
-        if prompt_tokens == 0:
-            prompt_tokens = integration.count_tokens(kwargs["input"] or "")
+        if input_tokens == 0:
+            input_tokens = integration.count_tokens(kwargs["input"] or "")
 
-        record_token_usage(span, prompt_tokens, None, total_tokens or prompt_tokens)
+        record_token_usage(
+            span,
+            input_tokens=input_tokens,
+            total_tokens=total_tokens or input_tokens,
+        )
 
         return response
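For the embeddings path just updated: when the response carries no usable `usage`, the input count falls back to the integration's `count_tokens`, and the total falls back to the input count. A simplified, self-contained mirror of that logic (the fake response and whitespace counter are illustrative only, not Sentry's API):

```python
from types import SimpleNamespace


def tokens_for_embeddings(response, kwargs, count_tokens):
    # Mirrors the logic in _new_embeddings_create_common above (simplified).
    input_tokens = 0
    total_tokens = 0
    if hasattr(response, "usage"):
        if isinstance(getattr(response.usage, "prompt_tokens", None), int):
            input_tokens = response.usage.prompt_tokens
        if isinstance(getattr(response.usage, "total_tokens", None), int):
            total_tokens = response.usage.total_tokens
    if input_tokens == 0:
        input_tokens = count_tokens(kwargs["input"] or "")
    # The total falls back to the input count when the API reported nothing.
    return input_tokens, total_tokens or input_tokens


# Response without usage data: the naive whitespace counter takes over.
no_usage = SimpleNamespace()
counter = lambda text: len(text.split())
assert tokens_for_embeddings(no_usage, {"input": "three words here"}, counter) == (3, 3)
```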
diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py
index c2360717e2..ac6d9f4c29 100644
--- a/tests/integrations/openai/test_openai.py
+++ b/tests/integrations/openai/test_openai.py
@@ -90,9 +90,9 @@ def test_nonstreaming_chat_completion(
         assert SPANDATA.AI_INPUT_MESSAGES not in span["data"]
         assert SPANDATA.AI_RESPONSES not in span["data"]
 
-    assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10
-    assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20
-    assert span["measurements"]["ai_total_tokens_used"]["value"] == 30
+    assert span["data"]["gen_ai.usage.output_tokens"] == 10
+    assert span["data"]["gen_ai.usage.input_tokens"] == 20
+    assert span["data"]["gen_ai.usage.total_tokens"] == 30
 
 
 @pytest.mark.asyncio
@@ -132,9 +132,9 @@ async def test_nonstreaming_chat_completion_async(
         assert SPANDATA.AI_INPUT_MESSAGES not in span["data"]
         assert SPANDATA.AI_RESPONSES not in span["data"]
 
-    assert span["measurements"]["ai_completion_tokens_used"]["value"] == 10
-    assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20
-    assert span["measurements"]["ai_total_tokens_used"]["value"] == 30
+    assert span["data"]["gen_ai.usage.output_tokens"] == 10
+    assert span["data"]["gen_ai.usage.input_tokens"] == 20
+    assert span["data"]["gen_ai.usage.total_tokens"] == 30
 
 
 def tiktoken_encoding_if_installed():
@@ -228,9 +228,9 @@ def test_streaming_chat_completion(
     try:
         import tiktoken  # type: ignore # noqa # pylint: disable=unused-import
 
-        assert span["measurements"]["ai_completion_tokens_used"]["value"] == 2
-        assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 1
-        assert span["measurements"]["ai_total_tokens_used"]["value"] == 3
+        assert span["data"]["gen_ai.usage.output_tokens"] == 2
+        assert span["data"]["gen_ai.usage.input_tokens"] == 1
+        assert span["data"]["gen_ai.usage.total_tokens"] == 3
     except ImportError:
         pass  # if tiktoken is not installed, we can't guarantee token usage will be calculated properly
@@ -324,9 +324,9 @@ async def test_streaming_chat_completion_async(
     try:
         import tiktoken  # type: ignore # noqa # pylint: disable=unused-import
 
-        assert span["measurements"]["ai_completion_tokens_used"]["value"] == 2
-        assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 1
-        assert span["measurements"]["ai_total_tokens_used"]["value"] == 3
+        assert span["data"]["gen_ai.usage.output_tokens"] == 2
+        assert span["data"]["gen_ai.usage.input_tokens"] == 1
+        assert span["data"]["gen_ai.usage.total_tokens"] == 3
     except ImportError:
         pass  # if tiktoken is not installed, we can't guarantee token usage will be calculated properly
@@ -409,8 +409,8 @@ def test_embeddings_create(
     else:
         assert SPANDATA.AI_INPUT_MESSAGES not in span["data"]
 
-    assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20
-    assert span["measurements"]["ai_total_tokens_used"]["value"] == 30
+    assert span["data"]["gen_ai.usage.input_tokens"] == 20
+    assert span["data"]["gen_ai.usage.total_tokens"] == 30
 
 
 @pytest.mark.asyncio
@@ -457,8 +457,8 @@ async def test_embeddings_create_async(
     else:
         assert SPANDATA.AI_INPUT_MESSAGES not in span["data"]
 
-    assert span["measurements"]["ai_prompt_tokens_used"]["value"] == 20
-    assert span["measurements"]["ai_total_tokens_used"]["value"] == 30
+    assert span["data"]["gen_ai.usage.input_tokens"] == 20
+    assert span["data"]["gen_ai.usage.total_tokens"] == 30
 
 
 @pytest.mark.parametrize(

From e7ed400a5d73dd26d78f1271b1f1c21d06bb6d2e Mon Sep 17 00:00:00 2001
From: Anton Pirker
Date: Tue, 15 Jul 2025 14:30:35 +0200
Subject: [PATCH 3/4] better comments

---
 sentry_sdk/integrations/openai.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index 34c660b9fb..6116661551 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -106,13 +106,13 @@ def _calculate_token_usage(
         total_tokens = _get_usage(response.usage, ["total_tokens"])
 
     # Manually count tokens
-    # TODO: check for responses API
+    # TODO: when implementing responses API, check for responses API
     if input_tokens == 0:
         for message in messages:
             if "content" in message:
                 input_tokens += count_tokens(message["content"])
 
-    # TODO: check for responses API
+    # TODO: when implementing responses API, check for responses API
     if output_tokens == 0:
         if streaming_message_responses is not None:
             for message in streaming_message_responses:

From 520c3d725411b0702401d94aab9321a82264ca1b Mon Sep 17 00:00:00 2001
From: Anton Pirker
Date: Tue, 15 Jul 2025 16:46:56 +0200
Subject: [PATCH 4/4] Update sentry_sdk/integrations/openai.py

Co-authored-by: Ivana Kellyer
---
 sentry_sdk/integrations/openai.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py
index 6116661551..d906a8e0b2 100644
--- a/sentry_sdk/integrations/openai.py
+++ b/sentry_sdk/integrations/openai.py
@@ -123,13 +123,11 @@ def _calculate_token_usage(
                     output_tokens += count_tokens(choice.message)
 
     # Do not set token data if it is 0
-    input_tokens = None if input_tokens == 0 else input_tokens
-    input_tokens_cached = None if input_tokens_cached == 0 else input_tokens_cached
-    output_tokens = None if output_tokens == 0 else output_tokens
-    output_tokens_reasoning = (
-        None if output_tokens_reasoning == 0 else output_tokens_reasoning
-    )
-    total_tokens = None if total_tokens == 0 else total_tokens
+    input_tokens = input_tokens or None
+    input_tokens_cached = input_tokens_cached or None
+    output_tokens = output_tokens or None
+    output_tokens_reasoning = output_tokens_reasoning or None
+    total_tokens = total_tokens or None
 
     record_token_usage(
         span,
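One closing observation on this last commit: for the non-negative integer counters involved, `x or None` behaves exactly like `None if x == 0 else x`, since `0` is the only falsy value these variables can hold. A tiny check:

```python
# For non-negative int counters, `x or None` collapses 0 to None
# and leaves every positive count untouched.
for tokens in (0, 1, 20):
    normalized = tokens or None
    assert normalized == (None if tokens == 0 else tokens)

print(0 or None, 20 or None)  # -> None 20
```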