Skip to content

Commit 68f3cb9

Browse files
authored
Start reporting 'tracing::error!' events to OTEL (tensorzero#3030)
* Start reporting 'tracing::error!' events to OTEL. For now, we only include the human-readable message; in the future, we might add a key for the serialized error JSON. * Fix clippy. * Fix formatting.
1 parent 4ebc537 commit 68f3cb9

File tree

3 files changed

+150
-5
lines changed

3 files changed

+150
-5
lines changed

tensorzero-core/src/endpoints/inference.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,10 @@ pub async fn inference(
239239
&clickhouse_connection_info,
240240
)
241241
.await?;
242-
tracing::Span::current().record("episode_id", episode_id.to_string());
242+
// Record the episode id if we didn't already have one
243+
if params.episode_id.is_none() {
244+
tracing::Span::current().record("episode_id", episode_id.to_string());
245+
}
243246

244247
let (function, function_name) = find_function(&params, &config)?;
245248
let mut candidate_variants: BTreeMap<String, Arc<VariantInfo>> =

tensorzero-core/src/observability.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,13 @@ pub fn build_opentelemetry_layer<T: SpanExporter + 'static>(
9393
// We only expose spans that explicitly contain field prefixed with "http." or "otel."
9494
// For example, `#[instrument(fields(otel.name = "my_otel_name"))]` will be exported
9595
let filter = filter::filter_fn(|metadata| {
96-
metadata.fields().iter().any(|field| {
97-
field.name().starts_with("http.") || field.name().starts_with("otel.")
98-
})
96+
if metadata.is_event() {
97+
matches!(metadata.level(), &tracing::Level::ERROR)
98+
} else {
99+
metadata.fields().iter().any(|field| {
100+
field.name().starts_with("http.") || field.name().starts_with("otel.")
101+
})
102+
}
99103
});
100104

101105
reload_handle

tensorzero-core/tests/e2e/otel.rs

Lines changed: 139 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::{
33
sync::{Arc, Mutex},
44
};
55

6-
use opentelemetry::{KeyValue, SpanId, Value};
6+
use opentelemetry::{trace::Status, KeyValue, SpanId, Value};
77
use opentelemetry_sdk::{
88
error::OTelSdkResult,
99
trace::{SpanData, SpanExporter},
@@ -15,6 +15,7 @@ use tensorzero::{
1515
use tensorzero_core::inference::types::TextKind;
1616
use tensorzero_core::observability::build_opentelemetry_layer;
1717
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
18+
use uuid::Uuid;
1819

1920
use crate::providers::common::make_embedded_gateway_no_config;
2021

@@ -50,6 +51,7 @@ impl CapturingOtelExporter {
5051
}
5152
}
5253

54+
#[derive(Debug)]
5355
pub struct SpanMap {
5456
pub root_spans: Vec<SpanData>,
5557
pub span_children: HashMap<SpanId, Vec<SpanData>>,
@@ -135,6 +137,7 @@ pub async fn test_capture_simple_inference_spans() {
135137
// Since we're using the embedded gateway, the root span will be `function_inference`
136138
// (we won't have a top-level HTTP span)
137139
assert_eq!(root_span.name, "function_inference");
140+
assert_eq!(root_span.status, Status::Unset);
138141
let root_attr_map = attrs_to_map(&root_span.attributes);
139142
assert_eq!(root_attr_map["model_name"], "dummy::good".into());
140143
assert_eq!(
@@ -154,6 +157,7 @@ pub async fn test_capture_simple_inference_spans() {
154157
};
155158

156159
assert_eq!(variant_span.name, "variant_inference");
160+
assert_eq!(variant_span.status, Status::Unset);
157161
let variant_attr_map = attrs_to_map(&variant_span.attributes);
158162
assert_eq!(
159163
variant_attr_map["function_name"],
@@ -168,6 +172,7 @@ pub async fn test_capture_simple_inference_spans() {
168172
};
169173

170174
assert_eq!(model_span.name, "model_inference");
175+
assert_eq!(model_span.status, Status::Unset);
171176
let model_attr_map = attrs_to_map(&model_span.attributes);
172177
assert_eq!(model_attr_map["model_name"], "dummy::good".into());
173178
assert_eq!(model_attr_map["stream"], false.into());
@@ -177,6 +182,7 @@ pub async fn test_capture_simple_inference_spans() {
177182
panic!("Expected one child span: {model_children:#?}");
178183
};
179184
assert_eq!(model_provider_span.name, "model_provider_inference");
185+
assert_eq!(model_provider_span.status, Status::Unset);
180186
let model_provider_attr_map = attrs_to_map(&model_provider_span.attributes);
181187
assert_eq!(model_provider_attr_map["provider_name"], "dummy".into());
182188
assert_eq!(
@@ -199,3 +205,135 @@ pub async fn test_capture_simple_inference_spans() {
199205

200206
assert_eq!(num_spans, 4);
201207
}
208+
209+
#[tokio::test]
210+
pub async fn test_capture_model_error() {
211+
let episode_uuid = Uuid::now_v7();
212+
let exporter = install_capturing_otel_exporter();
213+
214+
let client = make_embedded_gateway_no_config().await;
215+
let _err = client
216+
.inference(ClientInferenceParams {
217+
episode_id: Some(episode_uuid),
218+
model_name: Some("openai::missing-model-name".to_string()),
219+
input: ClientInput {
220+
system: None,
221+
messages: vec![ClientInputMessage {
222+
role: Role::User,
223+
content: vec![ClientInputMessageContent::Text(TextKind::Text {
224+
text: "What is your name?".to_string(),
225+
})],
226+
}],
227+
},
228+
..Default::default()
229+
})
230+
.await
231+
.unwrap_err();
232+
233+
let all_spans = exporter.take_spans();
234+
let num_spans = all_spans.len();
235+
let spans = build_span_map(all_spans);
236+
237+
let [root_span] = spans.root_spans.as_slice() else {
238+
panic!("Expected one root span: {:#?}", spans.root_spans);
239+
};
240+
// Since we're using the embedded gateway, the root span will be `function_inference`
241+
// (we won't have a top-level HTTP span)
242+
assert_eq!(root_span.name, "function_inference");
243+
assert_eq!(
244+
root_span.status,
245+
Status::Error {
246+
description: "".into()
247+
}
248+
);
249+
let root_attr_map = attrs_to_map(&root_span.attributes);
250+
assert_eq!(
251+
root_attr_map["model_name"],
252+
"openai::missing-model-name".into()
253+
);
254+
assert_eq!(root_attr_map["episode_id"], episode_uuid.to_string().into());
255+
assert_eq!(root_attr_map.get("function_name"), None);
256+
assert_eq!(root_attr_map.get("variant_name"), None);
257+
258+
let root_children = &spans.span_children[&root_span.span_context.span_id()];
259+
let [variant_span] = root_children.as_slice() else {
260+
panic!("Expected one child span: {root_children:#?}");
261+
};
262+
263+
assert_eq!(variant_span.name, "variant_inference");
264+
assert_eq!(variant_span.status, Status::Unset);
265+
let variant_attr_map = attrs_to_map(&variant_span.attributes);
266+
assert_eq!(
267+
variant_attr_map["function_name"],
268+
"tensorzero::default".into()
269+
);
270+
assert_eq!(
271+
variant_attr_map["variant_name"],
272+
"openai::missing-model-name".into()
273+
);
274+
assert_eq!(variant_attr_map["stream"], false.into());
275+
276+
let variant_children = &spans.span_children[&variant_span.span_context.span_id()];
277+
let [model_span] = variant_children.as_slice() else {
278+
panic!("Expected one child span: {variant_children:#?}");
279+
};
280+
281+
assert_eq!(model_span.name, "model_inference");
282+
assert_eq!(
283+
model_span.status,
284+
Status::Error {
285+
description: "".into()
286+
}
287+
);
288+
let model_attr_map = attrs_to_map(&model_span.attributes);
289+
assert_eq!(
290+
model_attr_map["model_name"],
291+
"openai::missing-model-name".into()
292+
);
293+
assert_eq!(model_attr_map["stream"], false.into());
294+
295+
let model_children = &spans.span_children[&model_span.span_context.span_id()];
296+
let [model_provider_span] = model_children.as_slice() else {
297+
panic!("Expected one child span: {model_children:#?}");
298+
};
299+
assert_eq!(model_provider_span.name, "model_provider_inference");
300+
assert_eq!(
301+
model_provider_span.status,
302+
Status::Error {
303+
description: "".into()
304+
}
305+
);
306+
assert_eq!(
307+
model_provider_span.events.len(),
308+
1,
309+
"Unexpected number of events: {model_provider_span:#?}",
310+
);
311+
assert!(
312+
model_provider_span.events[0]
313+
.name
314+
.starts_with("Error from openai server:"),
315+
"Unexpected span event: {:?}",
316+
model_provider_span.events[0]
317+
);
318+
let model_provider_attr_map = attrs_to_map(&model_provider_span.attributes);
319+
assert_eq!(model_provider_attr_map["provider_name"], "openai".into());
320+
assert_eq!(
321+
model_provider_attr_map["gen_ai.operation.name"],
322+
"chat".into()
323+
);
324+
assert_eq!(model_provider_attr_map["gen_ai.system"], "openai".into());
325+
assert_eq!(
326+
model_provider_attr_map["gen_ai.request.model"],
327+
"missing-model-name".into()
328+
);
329+
assert_eq!(model_attr_map["stream"], false.into());
330+
331+
assert_eq!(
332+
spans
333+
.span_children
334+
.get(&model_provider_span.span_context.span_id()),
335+
None
336+
);
337+
338+
assert_eq!(num_spans, 4);
339+
}

0 commit comments

Comments (0)