Commit 5d3de38

feat(telemetry): added timeToFirstByteMs into spans and metrics
1 parent 776fd93 · commit 5d3de38

File tree

9 files changed (+76, -21 lines)


src/strands/event_loop/event_loop.py

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ async def _handle_model_execution(
                 message = recover_message_on_max_tokens_reached(message)
 
                 if model_invoke_span:
-                    tracer.end_model_invoke_span(model_invoke_span, message, usage, stop_reason)
+                    tracer.end_model_invoke_span(model_invoke_span, message, usage, metrics, stop_reason)
                 break  # Success! Break out of retry loop
 
             except Exception as e:

src/strands/event_loop/streaming.py

Lines changed: 21 additions & 5 deletions
@@ -2,6 +2,7 @@
 
 import json
 import logging
+import time
 from typing import Any, AsyncGenerator, AsyncIterable, Optional
 
 from ..models.model import Model
@@ -267,31 +268,38 @@ def handle_redact_content(event: RedactContentEvent, state: dict[str, Any]) -> None:
         state["message"]["content"] = [{"text": event["redactAssistantContentMessage"]}]
 
 
-def extract_usage_metrics(event: MetadataEvent) -> tuple[Usage, Metrics]:
+def extract_usage_metrics(event: MetadataEvent, time_to_first_byte_ms: int | None = None) -> tuple[Usage, Metrics]:
     """Extracts usage metrics from the metadata chunk.
 
     Args:
         event: metadata.
+        time_to_first_byte_ms: Time to the first byte from the model, in milliseconds.
 
     Returns:
         The extracted usage metrics and latency.
     """
     usage = Usage(**event["usage"])
     metrics = Metrics(**event["metrics"])
+    if time_to_first_byte_ms is not None:
+        metrics["timeToFirstByteMs"] = time_to_first_byte_ms
 
     return usage, metrics
 
 
-async def process_stream(chunks: AsyncIterable[StreamEvent]) -> AsyncGenerator[TypedEvent, None]:
+async def process_stream(
+    chunks: AsyncIterable[StreamEvent], start_time: float | None = None
+) -> AsyncGenerator[TypedEvent, None]:
     """Processes the response stream from the API, constructing the final message and extracting usage metrics.
 
     Args:
         chunks: The chunks of the response stream from the model.
+        start_time: Time when the model request was initiated.
 
     Yields:
         The reason for stopping, the constructed message, and the usage metrics.
     """
     stop_reason: StopReason = "end_turn"
+    first_byte_time = None
 
     state: dict[str, Any] = {
         "message": {"role": "assistant", "content": []},
@@ -303,10 +311,14 @@ async def process_stream(chunks: AsyncIterable[StreamEvent]) -> AsyncGenerator[TypedEvent, None]:
     state["content"] = state["message"]["content"]
 
     usage: Usage = Usage(inputTokens=0, outputTokens=0, totalTokens=0)
-    metrics: Metrics = Metrics(latencyMs=0)
+    metrics: Metrics = Metrics(latencyMs=0, timeToFirstByteMs=0)
 
     async for chunk in chunks:
+        # Track first byte time when we get first content
+        if first_byte_time is None and ("contentBlockDelta" in chunk or "contentBlockStart" in chunk):
+            first_byte_time = time.time()
         yield ModelStreamChunkEvent(chunk=chunk)
+
         if "messageStart" in chunk:
             state["message"] = handle_message_start(chunk["messageStart"], state["message"])
         elif "contentBlockStart" in chunk:
@@ -319,7 +331,10 @@ async def process_stream(chunks: AsyncIterable[StreamEvent]) -> AsyncGenerator[TypedEvent, None]:
         elif "messageStop" in chunk:
             stop_reason = handle_message_stop(chunk["messageStop"])
         elif "metadata" in chunk:
-            usage, metrics = extract_usage_metrics(chunk["metadata"])
+            time_to_first_byte_ms = (
+                int(1000 * (first_byte_time - start_time)) if (start_time and first_byte_time) else None
+            )
+            usage, metrics = extract_usage_metrics(chunk["metadata"], time_to_first_byte_ms)
         elif "redactContent" in chunk:
             handle_redact_content(chunk["redactContent"], state)
 
@@ -346,7 +361,8 @@ async def stream_messages(
     logger.debug("model=<%s> | streaming messages", model)
 
     messages = remove_blank_messages_content_text(messages)
+    start_time = time.time()
     chunks = model.stream(messages, tool_specs if tool_specs else None, system_prompt)
 
-    async for event in process_stream(chunks):
+    async for event in process_stream(chunks, start_time):
         yield event
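
Taken together, the streaming changes bracket the stream with two timestamps: stream_messages() stamps start_time just before model.stream() is called, and process_stream() stamps first_byte_time on the first content-bearing chunk; their difference becomes timeToFirstByteMs. Below is a minimal, self-contained sketch of that pattern (measure_ttfb and fake_stream are illustrative names, not part of this repo):

import asyncio
import time
from collections.abc import AsyncIterable


async def measure_ttfb(chunks: AsyncIterable[dict]) -> int | None:
    start_time = time.time()  # stamped before the stream is consumed
    first_byte_time = None
    async for chunk in chunks:
        # The first content-bearing chunk marks the "first byte".
        if first_byte_time is None and ("contentBlockDelta" in chunk or "contentBlockStart" in chunk):
            first_byte_time = time.time()
    return int(1000 * (first_byte_time - start_time)) if first_byte_time else None


async def _demo() -> None:
    async def fake_stream():
        await asyncio.sleep(0.05)  # simulated model latency before the first token
        yield {"contentBlockDelta": {"delta": {"text": "hello"}}}
        yield {"messageStop": {"stopReason": "end_turn"}}

    print(await measure_ttfb(fake_stream()))  # roughly 50


asyncio.run(_demo())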

src/strands/telemetry/metrics.py

Lines changed: 6 additions & 1 deletion
@@ -286,6 +286,8 @@ def update_metrics(self, metrics: Metrics) -> None:
             metrics: The metrics data to add to the accumulated totals.
         """
         self._metrics_client.event_loop_latency.record(metrics["latencyMs"])
+        if "timeToFirstByteMs" in metrics:
+            self._metrics_client.model_time_to_first_token.record(metrics["timeToFirstByteMs"])
         self.accumulated_metrics["latencyMs"] += metrics["latencyMs"]
 
     def get_summary(self) -> Dict[str, Any]:
@@ -448,7 +450,7 @@ class MetricsClient:
     event_loop_output_tokens: Histogram
     event_loop_cache_read_input_tokens: Histogram
     event_loop_cache_write_input_tokens: Histogram
-
+    model_time_to_first_token: Histogram
     tool_call_count: Counter
     tool_success_count: Counter
     tool_error_count: Counter
@@ -507,3 +509,6 @@ def create_instruments(self) -> None:
         self.event_loop_cache_write_input_tokens = self.meter.create_histogram(
            name=constants.STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS, unit="token"
         )
+        self.model_time_to_first_token = self.meter.create_histogram(
+            name=constants.STRANDS_MODEL_TIME_TO_FIRST_TOKEN, unit="ms"
+        )
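
The new instrument is an ordinary OpenTelemetry histogram. As a point of reference, here is a minimal sketch of the same create-and-record flow against the public OTel API, outside the MetricsClient wrapper (the meter name and values are illustrative; without a configured MeterProvider the calls are no-ops):

from opentelemetry import metrics

meter = metrics.get_meter("strands.telemetry")

# Mirrors create_instruments(): a histogram recorded in milliseconds.
model_time_to_first_token = meter.create_histogram(
    name="strands.model.time_to_first_token", unit="ms"
)

event_metrics = {"latencyMs": 250, "timeToFirstByteMs": 42}

# Mirrors update_metrics(): guard on key presence, since the field is optional.
if "timeToFirstByteMs" in event_metrics:
    model_time_to_first_token.record(event_metrics["timeToFirstByteMs"])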

src/strands/telemetry/metrics_constants.py

Lines changed: 1 addition & 0 deletions
@@ -15,3 +15,4 @@
 STRANDS_EVENT_LOOP_OUTPUT_TOKENS = "strands.event_loop.output.tokens"
 STRANDS_EVENT_LOOP_CACHE_READ_INPUT_TOKENS = "strands.event_loop.cache_read.input.tokens"
 STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS = "strands.event_loop.cache_write.input.tokens"
+STRANDS_MODEL_TIME_TO_FIRST_TOKEN = "strands.model.time_to_first_token"

src/strands/telemetry/tracer.py

Lines changed: 11 additions & 2 deletions
@@ -16,7 +16,7 @@
 
 from ..agent.agent_result import AgentResult
 from ..types.content import ContentBlock, Message, Messages
-from ..types.streaming import StopReason, Usage
+from ..types.streaming import Metrics, StopReason, Usage
 from ..types.tools import ToolResult, ToolUse
 from ..types.traces import Attributes, AttributeValue
 
@@ -277,14 +277,21 @@ def start_model_invoke_span(
         return span
 
     def end_model_invoke_span(
-        self, span: Span, message: Message, usage: Usage, stop_reason: StopReason, error: Optional[Exception] = None
+        self,
+        span: Span,
+        message: Message,
+        usage: Usage,
+        metrics: Metrics,
+        stop_reason: StopReason,
+        error: Optional[Exception] = None,
     ) -> None:
         """End a model invocation span with results and metrics.
 
         Args:
             span: The span to end.
             message: The message response from the model.
             usage: Token usage information from the model call.
+            metrics: Metrics from the model call.
            stop_reason (StopReason): The reason the model stopped generating.
            error: Optional exception if the model call failed.
        """
@@ -296,6 +303,8 @@ def end_model_invoke_span(
             "gen_ai.usage.total_tokens": usage["totalTokens"],
             "gen_ai.usage.cache_read_input_tokens": usage.get("cacheReadInputTokens", 0),
             "gen_ai.usage.cache_write_input_tokens": usage.get("cacheWriteInputTokens", 0),
+            "gen_ai.server.time_to_first_token": metrics.get("timeToFirstByteMs", 0),
+            "gen_ai.server.request.duration": metrics.get("latencyMs", 0),
         }
 
         if self.use_latest_genai_conventions:
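
The two added attributes echo the OpenTelemetry GenAI semantic-convention names. A rough standalone sketch of how they land on a span via the public OTel API, rather than through this Tracer wrapper (the span name and metric values are illustrative):

from opentelemetry import trace

otel_tracer = trace.get_tracer("strands.telemetry")
model_metrics = {"latencyMs": 250, "timeToFirstByteMs": 42}

with otel_tracer.start_as_current_span("Model invoke") as span:
    # .get(..., 0) matches end_model_invoke_span: a missing metric still
    # emits the attribute (as 0) instead of omitting it.
    span.set_attribute("gen_ai.server.time_to_first_token", model_metrics.get("timeToFirstByteMs", 0))
    span.set_attribute("gen_ai.server.request.duration", model_metrics.get("latencyMs", 0))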

src/strands/types/event_loop.py

Lines changed: 4 additions & 2 deletions
@@ -23,14 +23,16 @@ class Usage(TypedDict, total=False):
     cacheWriteInputTokens: int
 
 
-class Metrics(TypedDict):
+class Metrics(TypedDict, total=False):
     """Performance metrics for model interactions.
 
     Attributes:
         latencyMs (int): Latency of the model request in milliseconds.
+        timeToFirstByteMs (int): Time to first byte of the model response in milliseconds.
     """
 
-    latencyMs: int
+    latencyMs: Required[int]
+    timeToFirstByteMs: int
 
 
 StopReason = Literal[
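
The switch to total=False plus Required[...] keeps latencyMs mandatory while letting timeToFirstByteMs be absent. A minimal sketch of that combination (Required lives in typing from Python 3.11; use typing_extensions on older versions):

from typing import Required, TypedDict


class Metrics(TypedDict, total=False):
    latencyMs: Required[int]  # still mandatory despite total=False
    timeToFirstByteMs: int    # optional: may be absent entirely


m: Metrics = {"latencyMs": 250}  # valid without the optional key
m["timeToFirstByteMs"] = 42      # added only when it was measured

# Consumers branch on presence instead of assuming the key exists,
# as update_metrics() does above.
ttfb = m["timeToFirstByteMs"] if "timeToFirstByteMs" in m else None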

tests/strands/event_loop/test_streaming.py

Lines changed: 2 additions & 2 deletions
@@ -491,7 +491,7 @@ def test_extract_usage_metrics_with_cache_tokens():
                     "content": [],
                 },
                 {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0},
-                {"latencyMs": 0},
+                {"latencyMs": 0, "timeToFirstByteMs": 0},
             ),
         },
     ],
@@ -781,7 +781,7 @@ async def test_stream_messages(agenerator, alist):
             "end_turn",
             {"role": "assistant", "content": [{"text": "test"}]},
             {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0},
-            {"latencyMs": 0},
+            {"latencyMs": 0, "timeToFirstByteMs": 0},
         )
     },
 ]

tests/strands/telemetry/test_metrics.py

Lines changed: 17 additions & 4 deletions
@@ -109,6 +109,18 @@ def metrics(request):
     return Metrics(**params)
 
 
+@pytest.fixture
+def metrics_with_ttfb(request):
+    params = {
+        "latencyMs": 1,
+        "timeToFirstByteMs": 10,
+    }
+    if hasattr(request, "param"):
+        params.update(request.param)
+
+    return Metrics(**params)
+
+
 @pytest.mark.parametrize("end_time", [None, 1])
 @unittest.mock.patch.object(strands.telemetry.metrics.time, "time")
 def test_trace_end(mock_time, end_time, trace):
@@ -132,8 +144,8 @@ def mock_get_meter_provider():
     mock_create_counter = mock.MagicMock()
     mock_meter.create_counter.return_value = mock_create_counter
 
-    mock_create_histogram = mock.MagicMock()
-    mock_meter.create_histogram.return_value = mock_create_histogram
+    # Create separate mock objects for each histogram call
+    mock_meter.create_histogram.side_effect = lambda *args, **kwargs: mock.MagicMock()
     meter_provider_mock.get_meter.return_value = mock_meter
 
     mock_get_meter_provider.return_value = meter_provider_mock
@@ -326,9 +338,9 @@ def test_event_loop_metrics_update_usage(usage, event_loop_metrics, mock_get_meter_provider):
     metrics_client.event_loop_cache_write_input_tokens.record.assert_called()
 
 
-def test_event_loop_metrics_update_metrics(metrics, event_loop_metrics, mock_get_meter_provider):
+def test_event_loop_metrics_update_metrics(metrics_with_ttfb, event_loop_metrics, mock_get_meter_provider):
     for _ in range(3):
-        event_loop_metrics.update_metrics(metrics)
+        event_loop_metrics.update_metrics(metrics_with_ttfb)
 
     tru_metrics = event_loop_metrics.accumulated_metrics
     exp_metrics = Metrics(
@@ -338,6 +350,7 @@ def test_event_loop_metrics_update_metrics(metrics, event_loop_metrics, mock_get_meter_provider):
     assert tru_metrics == exp_metrics
     mock_get_meter_provider.return_value.get_meter.assert_called()
     event_loop_metrics._metrics_client.event_loop_latency.record.assert_called_with(1)
+    event_loop_metrics._metrics_client.model_time_to_first_token.record.assert_any_call(10)
 
 
 def test_event_loop_metrics_get_summary(trace, tool, event_loop_metrics, mock_get_meter_provider):
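
The side_effect change above matters because the old return_value handed every instrument the same mock object, so a record() call on the latency histogram was indistinguishable from one on the new TTFB histogram. A small self-contained sketch of the difference:

from unittest import mock

meter = mock.MagicMock()

# return_value: every create_histogram() call yields the SAME mock.
meter.create_histogram.return_value = mock.MagicMock()
assert meter.create_histogram("latency") is meter.create_histogram("ttfb")

# side_effect: a FRESH mock per call, so per-instrument assertions such as
# model_time_to_first_token.record.assert_any_call(10) stay independent.
meter.create_histogram.side_effect = lambda *args, **kwargs: mock.MagicMock()
assert meter.create_histogram("latency") is not meter.create_histogram("ttfb")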

tests/strands/telemetry/test_tracer.py

Lines changed: 13 additions & 4 deletions
@@ -11,7 +11,7 @@
 
 from strands.telemetry.tracer import JSONEncoder, Tracer, get_tracer, serialize
 from strands.types.content import ContentBlock
-from strands.types.streaming import StopReason, Usage
+from strands.types.streaming import Metrics, StopReason, Usage
 
 
 @pytest.fixture(autouse=True)
@@ -205,9 +205,10 @@ def test_end_model_invoke_span(mock_span):
     tracer = Tracer()
     message = {"role": "assistant", "content": [{"text": "Response"}]}
     usage = Usage(inputTokens=10, outputTokens=20, totalTokens=30)
+    metrics = Metrics(latencyMs=20, timeToFirstByteMs=10)
     stop_reason: StopReason = "end_turn"
 
-    tracer.end_model_invoke_span(mock_span, message, usage, stop_reason)
+    tracer.end_model_invoke_span(mock_span, message, usage, metrics, stop_reason)
 
     mock_span.set_attribute.assert_any_call("gen_ai.usage.prompt_tokens", 10)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 10)
@@ -216,6 +217,8 @@ def test_end_model_invoke_span(mock_span):
     mock_span.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 30)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_read_input_tokens", 0)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_write_input_tokens", 0)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.request.duration", 20)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.time_to_first_token", 10)
     mock_span.add_event.assert_called_with(
         "gen_ai.choice",
         attributes={"message": json.dumps(message["content"]), "finish_reason": "end_turn"},
@@ -231,9 +234,10 @@ def test_end_model_invoke_span_latest_conventions(mock_span):
     tracer.use_latest_genai_conventions = True
     message = {"role": "assistant", "content": [{"text": "Response"}]}
     usage = Usage(inputTokens=10, outputTokens=20, totalTokens=30)
+    metrics = Metrics(latencyMs=20, timeToFirstByteMs=10)
     stop_reason: StopReason = "end_turn"
 
-    tracer.end_model_invoke_span(mock_span, message, usage, stop_reason)
+    tracer.end_model_invoke_span(mock_span, message, usage, metrics, stop_reason)
 
     mock_span.set_attribute.assert_any_call("gen_ai.usage.prompt_tokens", 10)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 10)
@@ -242,6 +246,8 @@ def test_end_model_invoke_span_latest_conventions(mock_span):
     mock_span.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 30)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_read_input_tokens", 0)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_write_input_tokens", 0)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.time_to_first_token", 10)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.request.duration", 20)
     mock_span.add_event.assert_called_with(
         "gen_ai.client.inference.operation.details",
         attributes={
@@ -766,8 +772,9 @@ def test_end_model_invoke_span_with_cache_metrics(mock_span):
         cacheWriteInputTokens=3,
     )
     stop_reason: StopReason = "end_turn"
+    metrics = Metrics(latencyMs=10, timeToFirstByteMs=5)
 
-    tracer.end_model_invoke_span(mock_span, message, usage, stop_reason)
+    tracer.end_model_invoke_span(mock_span, message, usage, metrics, stop_reason)
 
     mock_span.set_attribute.assert_any_call("gen_ai.usage.prompt_tokens", 10)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 10)
@@ -776,6 +783,8 @@ def test_end_model_invoke_span_with_cache_metrics(mock_span):
     mock_span.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 30)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_read_input_tokens", 5)
     mock_span.set_attribute.assert_any_call("gen_ai.usage.cache_write_input_tokens", 3)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.request.duration", 10)
+    mock_span.set_attribute.assert_any_call("gen_ai.server.time_to_first_token", 5)
     mock_span.set_status.assert_called_once_with(StatusCode.OK)
     mock_span.end.assert_called_once()
