Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1024,8 +1024,10 @@ def _parse_usage_from_anthropic(self, usage: BetaUsage | BetaMessageDeltaUsage |
usage_details["input_token_count"] = usage.input_tokens
if usage.cache_creation_input_tokens is not None:
usage_details["anthropic.cache_creation_input_tokens"] = usage.cache_creation_input_tokens # type: ignore[typeddict-unknown-key]
usage_details["cache_creation_input_token_count"] = usage.cache_creation_input_tokens
if usage.cache_read_input_tokens is not None:
usage_details["anthropic.cache_read_input_tokens"] = usage.cache_read_input_tokens # type: ignore[typeddict-unknown-key]
usage_details["cache_read_input_token_count"] = usage.cache_read_input_tokens
return usage_details

def _parse_contents_from_anthropic(
Expand Down
21 changes: 21 additions & 0 deletions python/packages/anthropic/tests/test_anthropic_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2354,6 +2354,27 @@ def test_parse_usage_with_cache_tokens(mock_anthropic_client: MagicMock) -> None
assert result["input_token_count"] == 100
assert result["anthropic.cache_creation_input_tokens"] == 20
assert result["anthropic.cache_read_input_tokens"] == 30
assert result["cache_creation_input_token_count"] == 20
assert result["cache_read_input_token_count"] == 30


def test_parse_usage_preserves_zero_cache_tokens(mock_anthropic_client: MagicMock) -> None:
    """Zero-valued cache token counts are kept under both the provider and the mapped keys."""
    usage = MagicMock(
        input_tokens=100,
        output_tokens=50,
        cache_creation_input_tokens=0,
        cache_read_input_tokens=0,
    )
    client = create_test_anthropic_client(mock_anthropic_client)

    parsed = client._parse_usage_from_anthropic(usage)

    assert parsed is not None
    # A zero count must still be present in the result rather than being omitted.
    for key in (
        "anthropic.cache_creation_input_tokens",
        "cache_creation_input_token_count",
        "anthropic.cache_read_input_tokens",
        "cache_read_input_token_count",
    ):
        assert parsed[key] == 0


# Code Execution Result Tests
Expand Down
6 changes: 6 additions & 0 deletions python/packages/core/agent_framework/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,12 +400,18 @@ class UsageDetails(TypedDict, total=False, extra_items=int): # type: ignore[cal
input_token_count: The number of input tokens used.
output_token_count: The number of output tokens generated.
total_token_count: The total number of tokens (input + output).
cache_creation_input_token_count: The number of input tokens written to a provider-managed cache.
cache_read_input_token_count: The number of input tokens served from a provider-managed cache.
reasoning_output_token_count: The number of output tokens used for reasoning.

"""

input_token_count: int | None
output_token_count: int | None
total_token_count: int | None
cache_creation_input_token_count: int | None
cache_read_input_token_count: int | None
reasoning_output_token_count: int | None


def add_usage_details(usage1: UsageDetails | None, usage2: UsageDetails | None) -> UsageDetails:
Expand Down
44 changes: 30 additions & 14 deletions python/packages/core/agent_framework/observability.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ class OtelAttr(str, Enum):
# Usage attributes
INPUT_TOKENS = "gen_ai.usage.input_tokens"
OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
CACHE_CREATION_INPUT_TOKENS = "gen_ai.usage.cache_creation.input_tokens"
CACHE_READ_INPUT_TOKENS = "gen_ai.usage.cache_read.input_tokens"
REASONING_OUTPUT_TOKENS = "gen_ai.usage.reasoning.output_tokens"
# Tool attributes
TOOL_CALL_ID = "gen_ai.tool.call.id"
TOOL_DESCRIPTION = "gen_ai.tool.description"
Expand Down Expand Up @@ -327,6 +330,20 @@ def __str__(self) -> str:
"tool_calls": "tool_call",
"length": "length",
}
# Ordered (usage-details key, OTel attribute) pairs used to translate usage details
# into OTel GenAI usage attributes.
# Several keys may target the same attribute. _apply_usage_attributes walks this tuple
# in order and writes with setdefault, so for each attribute the first listed key that
# carries an int value wins and later aliases never overwrite it.
USAGE_DETAIL_TO_OTEL_ATTR: Final[tuple[tuple[str, OtelAttr], ...]] = (
# Keys declared as fields on UsageDetails; listed first so they take precedence.
("input_token_count", OtelAttr.INPUT_TOKENS),
("output_token_count", OtelAttr.OUTPUT_TOKENS),
("cache_creation_input_token_count", OtelAttr.CACHE_CREATION_INPUT_TOKENS),
("cache_read_input_token_count", OtelAttr.CACHE_READ_INPUT_TOKENS),
("reasoning_output_token_count", OtelAttr.REASONING_OUTPUT_TOKENS),
# Provider-specific / legacy keys; only used when the key above for the same
# attribute did not supply a value.
("anthropic.cache_creation_input_tokens", OtelAttr.CACHE_CREATION_INPUT_TOKENS),
("anthropic.cache_read_input_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS),
("openai.cached_input_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS),
("prompt/cached_tokens", OtelAttr.CACHE_READ_INPUT_TOKENS),
("openai.reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS),
("completion/reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS),
("reasoning_tokens", OtelAttr.REASONING_OUTPUT_TOKENS),
)


# region Telemetry utils
Expand Down Expand Up @@ -2350,12 +2367,16 @@ def _apply_accumulated_usage(attributes: dict[str, Any], captured_fields: set[st
accumulated = INNER_ACCUMULATED_USAGE.get()
if not accumulated:
return
input_tokens = accumulated.get("input_token_count")
if input_tokens:
attributes[OtelAttr.INPUT_TOKENS] = input_tokens
output_tokens = accumulated.get("output_token_count")
if output_tokens:
attributes[OtelAttr.OUTPUT_TOKENS] = output_tokens
_apply_usage_attributes(attributes, accumulated)


def _apply_usage_attributes(attributes: dict[str, Any], usage: Mapping[str, Any]) -> None:
    """Copy known usage counts from ``usage`` into ``attributes`` as OTel GenAI attributes.

    Walks ``USAGE_DETAIL_TO_OTEL_ATTR`` in order. A count is written only when it is a
    genuine ``int`` (zero included) and the target attribute is not already present in
    ``attributes``, so existing values and earlier mapping entries are never overwritten.

    Args:
        attributes: Attribute dictionary to update in place.
        usage: Usage details keyed by usage-detail name.
    """
    for source_key, target_attr in USAGE_DETAIL_TO_OTEL_ATTR:
        count = usage.get(source_key)
        # bool is a subclass of int and must be rejected explicitly; None and every
        # other non-int value already fail the isinstance(int) test.
        if isinstance(count, int) and not isinstance(count, bool):
            attributes.setdefault(target_attr, count)


def _get_response_attributes(
Expand All @@ -2378,12 +2399,7 @@ def _get_response_attributes(
if model := getattr(response, "model", None):
attributes[OtelAttr.RESPONSE_MODEL] = model
if capture_usage and (usage := response.usage_details):
input_tokens = usage.get("input_token_count")
if input_tokens:
attributes[OtelAttr.INPUT_TOKENS] = input_tokens
output_tokens = usage.get("output_token_count")
if output_tokens:
attributes[OtelAttr.OUTPUT_TOKENS] = output_tokens
_apply_usage_attributes(attributes, usage)
return attributes


Expand All @@ -2407,9 +2423,9 @@ def _capture_response(
"""Set the response for a given span."""
span.set_attributes(attributes)
attrs: dict[str, Any] = {k: v for k, v in attributes.items() if k in GEN_AI_METRIC_ATTRIBUTES}
if token_usage_histogram and (input_tokens := attributes.get(OtelAttr.INPUT_TOKENS)):
if token_usage_histogram and (input_tokens := attributes.get(OtelAttr.INPUT_TOKENS)) is not None:
token_usage_histogram.record(input_tokens, attributes={**attrs, OtelAttr.T_TYPE: OtelAttr.T_TYPE_INPUT})
if token_usage_histogram and (output_tokens := attributes.get(OtelAttr.OUTPUT_TOKENS)):
if token_usage_histogram and (output_tokens := attributes.get(OtelAttr.OUTPUT_TOKENS)) is not None:
token_usage_histogram.record(output_tokens, {**attrs, OtelAttr.T_TYPE: OtelAttr.T_TYPE_OUTPUT})
if operation_duration_histogram and duration is not None:
if OtelAttr.ERROR_TYPE in attributes:
Expand Down
100 changes: 97 additions & 3 deletions python/packages/core/tests/core/test_observability.py
Original file line number Diff line number Diff line change
Expand Up @@ -2154,6 +2154,58 @@ def test_get_response_attributes_with_usage():
assert result[OtelAttr.OUTPUT_TOKENS] == 50


def test_get_response_attributes_with_additional_usage():
    """Cache and reasoning usage details, including zero counts, become OTel attributes."""
    from unittest.mock import Mock

    from agent_framework.observability import OtelAttr, _get_response_attributes

    usage = {
        "input_token_count": 0,
        "output_token_count": 50,
        "cache_creation_input_token_count": 10,
        "cache_read_input_token_count": 0,
        "reasoning_output_token_count": 30,
    }
    response = Mock(
        response_id=None,
        finish_reason=None,
        raw_representation=None,
        usage_details=usage,
    )

    result = _get_response_attributes({}, response)

    expected = {
        OtelAttr.INPUT_TOKENS: 0,
        OtelAttr.OUTPUT_TOKENS: 50,
        OtelAttr.CACHE_CREATION_INPUT_TOKENS: 10,
        OtelAttr.CACHE_READ_INPUT_TOKENS: 0,
        OtelAttr.REASONING_OUTPUT_TOKENS: 30,
    }
    for attr, count in expected.items():
        assert result[attr] == count


def test_get_response_attributes_maps_legacy_usage_keys():
    """Legacy provider-specific usage keys are translated to the standard OTel attributes."""
    from unittest.mock import Mock

    from agent_framework.observability import OtelAttr, _get_response_attributes

    legacy_usage = {
        "anthropic.cache_creation_input_tokens": 12,
        "openai.cached_input_tokens": 0,
        "completion/reasoning_tokens": 34,
    }
    response = Mock(
        response_id=None,
        finish_reason=None,
        raw_representation=None,
        usage_details=legacy_usage,
    )

    result = _get_response_attributes({}, response)

    expected = {
        OtelAttr.CACHE_CREATION_INPUT_TOKENS: 12,
        OtelAttr.CACHE_READ_INPUT_TOKENS: 0,
        OtelAttr.REASONING_OUTPUT_TOKENS: 34,
    }
    for attr, count in expected.items():
        assert result[attr] == count


def test_get_response_attributes_capture_usage_false():
"""Test _get_response_attributes skips usage when capture_usage is False."""
from unittest.mock import Mock
Expand All @@ -2164,13 +2216,22 @@ def test_get_response_attributes_capture_usage_false():
response.response_id = None
response.finish_reason = None
response.raw_representation = None
response.usage_details = {"input_token_count": 100, "output_token_count": 50}
response.usage_details = {
"input_token_count": 100,
"output_token_count": 50,
"cache_creation_input_token_count": 10,
"cache_read_input_token_count": 20,
"reasoning_output_token_count": 30,
}

attrs = {}
result = _get_response_attributes(attrs, response, capture_usage=False)

assert OtelAttr.INPUT_TOKENS not in result
assert OtelAttr.OUTPUT_TOKENS not in result
assert OtelAttr.CACHE_CREATION_INPUT_TOKENS not in result
assert OtelAttr.CACHE_READ_INPUT_TOKENS not in result
assert OtelAttr.REASONING_OUTPUT_TOKENS not in result


def test_get_response_attributes_capture_response_id_false():
Expand Down Expand Up @@ -2933,6 +2994,23 @@ def test_capture_response(span_exporter: InMemorySpanExporter):
assert spans[0].attributes.get(OtelAttr.OUTPUT_TOKENS) == 50


def test_capture_response_records_zero_token_usage():
    """Zero input and output token counts are still recorded on the usage histogram."""
    from agent_framework.observability import OtelAttr, _capture_response

    zero_usage = dict.fromkeys((OtelAttr.INPUT_TOKENS, OtelAttr.OUTPUT_TOKENS), 0)
    fake_span = Mock()
    histogram = Mock()

    _capture_response(span=fake_span, attributes=zero_usage, token_usage_histogram=histogram)

    fake_span.set_attributes.assert_called_once_with(zero_usage)
    # One record call for the input count and one for the output count.
    assert histogram.record.call_count == 2


async def test_layer_ordering_span_sequence_with_function_calling(span_exporter: InMemorySpanExporter):
"""Test that with correct layer ordering, spans appear in the expected sequence.

Expand Down Expand Up @@ -3937,11 +4015,21 @@ class _InstrumentedAgent(AgentTelemetryLayer, RawAgent):
Content.from_function_call(call_id="call_1", name="get_weather", arguments='{"city": "Seattle"}')
],
),
usage_details=UsageDetails(input_token_count=2239, output_token_count=192),
usage_details=UsageDetails(
input_token_count=2239,
output_token_count=192,
cache_read_input_token_count=100,
reasoning_output_token_count=25,
),
),
ChatResponse(
messages=Message(role="assistant", contents=["The weather in Seattle is sunny."]),
usage_details=UsageDetails(input_token_count=2569, output_token_count=99),
usage_details=UsageDetails(
input_token_count=2569,
output_token_count=99,
cache_read_input_token_count=200,
reasoning_output_token_count=0,
),
),
]

Expand All @@ -3965,12 +4053,18 @@ class _InstrumentedAgent(AgentTelemetryLayer, RawAgent):
# Individual chat spans retain their own usage
assert chat_spans[0].attributes.get(OtelAttr.INPUT_TOKENS) == 2239
assert chat_spans[0].attributes.get(OtelAttr.OUTPUT_TOKENS) == 192
assert chat_spans[0].attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 100
assert chat_spans[0].attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 25
assert chat_spans[1].attributes.get(OtelAttr.INPUT_TOKENS) == 2569
assert chat_spans[1].attributes.get(OtelAttr.OUTPUT_TOKENS) == 99
assert chat_spans[1].attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 200
assert chat_spans[1].attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 0

# The invoke_agent span must report the aggregate across all LLM round-trips
assert agent_span.attributes.get(OtelAttr.INPUT_TOKENS) == 2239 + 2569
assert agent_span.attributes.get(OtelAttr.OUTPUT_TOKENS) == 192 + 99
assert agent_span.attributes.get(OtelAttr.CACHE_READ_INPUT_TOKENS) == 100 + 200
assert agent_span.attributes.get(OtelAttr.REASONING_OUTPUT_TOKENS) == 25


@pytest.mark.parametrize("enable_sensitive_data", [False], indirect=True)
Expand Down
14 changes: 10 additions & 4 deletions python/packages/openai/agent_framework_openai/_chat_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2979,10 +2979,16 @@ def _parse_usage_from_openai(self, usage: ResponseUsage) -> UsageDetails | None:
output_token_count=usage.output_tokens,
total_token_count=usage.total_tokens,
)
if usage.input_tokens_details and usage.input_tokens_details.cached_tokens:
details["openai.cached_input_tokens"] = usage.input_tokens_details.cached_tokens # type: ignore[typeddict-unknown-key]
if usage.output_tokens_details and usage.output_tokens_details.reasoning_tokens:
details["openai.reasoning_tokens"] = usage.output_tokens_details.reasoning_tokens # type: ignore[typeddict-unknown-key]
if usage.input_tokens_details:
cached_tokens = cast("int | None", getattr(usage.input_tokens_details, "cached_tokens", None))
if cached_tokens is not None:
details["openai.cached_input_tokens"] = cached_tokens # type: ignore[typeddict-unknown-key]
details["cache_read_input_token_count"] = cached_tokens
if usage.output_tokens_details:
reasoning_tokens = cast("int | None", getattr(usage.output_tokens_details, "reasoning_tokens", None))
if reasoning_tokens is not None:
details["openai.reasoning_tokens"] = reasoning_tokens # type: ignore[typeddict-unknown-key]
details["reasoning_output_token_count"] = reasoning_tokens
return details

def _get_metadata_from_response(self, output: Any) -> dict[str, Any]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -765,15 +765,17 @@ def _parse_usage_from_openai(self, usage: CompletionUsage) -> UsageDetails:
details["completion/accepted_prediction_tokens"] = tokens # type: ignore[typeddict-unknown-key]
if tokens := usage.completion_tokens_details.audio_tokens:
details["completion/audio_tokens"] = tokens # type: ignore[typeddict-unknown-key]
if tokens := usage.completion_tokens_details.reasoning_tokens:
if (tokens := usage.completion_tokens_details.reasoning_tokens) is not None:
details["completion/reasoning_tokens"] = tokens # type: ignore[typeddict-unknown-key]
details["reasoning_output_token_count"] = tokens
if tokens := usage.completion_tokens_details.rejected_prediction_tokens:
details["completion/rejected_prediction_tokens"] = tokens # type: ignore[typeddict-unknown-key]
if usage.prompt_tokens_details:
if tokens := usage.prompt_tokens_details.audio_tokens:
details["prompt/audio_tokens"] = tokens # type: ignore[typeddict-unknown-key]
if tokens := usage.prompt_tokens_details.cached_tokens:
if (tokens := usage.prompt_tokens_details.cached_tokens) is not None:
details["prompt/cached_tokens"] = tokens # type: ignore[typeddict-unknown-key]
details["cache_read_input_token_count"] = tokens
return details

def _parse_text_from_openai(self, choice: Choice | ChunkChoice) -> Content | None:
Expand Down
44 changes: 44 additions & 0 deletions python/packages/openai/tests/openai/test_openai_chat_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3301,6 +3301,7 @@ def test_usage_details_with_cached_tokens() -> None:
assert details is not None
assert details["input_token_count"] == 200
assert details["openai.cached_input_tokens"] == 25
assert details["cache_read_input_token_count"] == 25


def test_usage_details_with_reasoning_tokens() -> None:
Expand All @@ -3319,6 +3320,49 @@ def test_usage_details_with_reasoning_tokens() -> None:
assert details is not None
assert details["output_token_count"] == 80
assert details["openai.reasoning_tokens"] == 30
assert details["reasoning_output_token_count"] == 30


def test_usage_details_with_zero_cached_and_reasoning_tokens() -> None:
    """Zero cached/reasoning counts are reported under both the provider and mapped keys."""
    usage = MagicMock(
        input_tokens=150,
        output_tokens=80,
        total_tokens=230,
        input_tokens_details=MagicMock(cached_tokens=0),
        output_tokens_details=MagicMock(reasoning_tokens=0),
    )
    client = OpenAIChatClient(model="test-model", api_key="test-key")

    details = client._parse_usage_from_openai(usage)  # type: ignore

    assert details is not None
    for key in (
        "openai.cached_input_tokens",
        "cache_read_input_token_count",
        "openai.reasoning_tokens",
        "reasoning_output_token_count",
    ):
        assert details[key] == 0


def test_usage_details_omits_missing_cached_and_reasoning_tokens() -> None:
    """Cached/reasoning counts reported as None produce no entries in the usage details."""
    usage = MagicMock(
        input_tokens=150,
        output_tokens=80,
        total_tokens=230,
        input_tokens_details=MagicMock(cached_tokens=None),
        output_tokens_details=MagicMock(reasoning_tokens=None),
    )
    client = OpenAIChatClient(model="test-model", api_key="test-key")

    details = client._parse_usage_from_openai(usage)  # type: ignore

    assert details is not None
    for key in (
        "openai.cached_input_tokens",
        "cache_read_input_token_count",
        "openai.reasoning_tokens",
        "reasoning_output_token_count",
    ):
        assert key not in details


def test_get_metadata_from_response() -> None:
Expand Down
Loading
Loading