Skip to content

Commit 5f10d61

Browse files
[refactor]: Refactor Frontend Trace OpenTelemetry Implementation (#7390)
Co-authored-by: Iman Tabrizian <[email protected]>
1 parent 52bb23f commit 5f10d61

File tree

2 files changed

+92
-135
lines changed

2 files changed

+92
-135
lines changed

src/tracer.cc

Lines changed: 56 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -335,13 +335,23 @@ TraceManager::SampleTrace(const TraceStartOptions& start_options)
335335
std::chrono::duration_cast<std::chrono::nanoseconds>(
336336
std::chrono::steady_clock::now().time_since_epoch())
337337
.count();
338-
ts->otel_context_ = start_options.propagated_context;
339-
opentelemetry::nostd::shared_ptr<otel_trace_api::Span> root_span;
340-
root_span = ts->StartSpan(
341-
"InferRequest", steady_timestamp_ns, otel_trace_api::kSpanKey);
338+
if (ts->span_stacks_.find(ts->trace_id_) == ts->span_stacks_.end()) {
339+
std::unique_ptr<
340+
std::stack<opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>>
341+
st(new std::stack<
342+
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>());
343+
ts->span_stacks_.emplace(ts->trace_id_, std::move(st));
344+
}
345+
auto active_span =
346+
otel_trace_api::GetSpan(start_options.propagated_context);
347+
if (active_span->GetContext().IsValid()) {
348+
ts->span_stacks_[ts->trace_id_]->emplace(active_span);
349+
}
342350
// Storing "InferRequest" span as a root span
343351
// to keep it alive for the duration of the request.
344-
ts->otel_context_ = ts->otel_context_.SetValue(kRootSpan, root_span);
352+
ts->root_span_ =
353+
ts->StartSpan("InferRequest", steady_timestamp_ns, ts->trace_id_);
354+
ts->span_stacks_[ts->trace_id_]->emplace(ts->root_span_);
345355
#else
346356
LOG_ERROR << "Unsupported trace mode: "
347357
<< TraceManager::InferenceTraceModeString(ts->setting_->mode_);
@@ -358,7 +368,7 @@ TraceManager::Trace::~Trace()
358368
setting_->WriteTrace(streams_);
359369
} else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) {
360370
#ifndef _WIN32
361-
EndSpan(kRootSpan);
371+
EndSpan(trace_id_);
362372
#else
363373
LOG_ERROR << "Unsupported trace mode: "
364374
<< TraceManager::InferenceTraceModeString(setting_->mode_);
@@ -390,7 +400,8 @@ TraceManager::Trace::CaptureTimestamp(
390400
<< "{\"name\":\"" << name << "\",\"ns\":" << timestamp_ns << "}]}";
391401
} else if (setting_->mode_ == TRACE_MODE_OPENTELEMETRY) {
392402
#ifndef _WIN32
393-
AddEvent(kRootSpan, name, timestamp_ns);
403+
root_span_->AddEvent(
404+
name, time_offset_ + std::chrono::nanoseconds{timestamp_ns});
394405
#else
395406
LOG_ERROR << "Unsupported trace mode: "
396407
<< TraceManager::InferenceTraceModeString(setting_->mode_);
@@ -501,15 +512,15 @@ TraceManager::ProcessOpenTelemetryParameters(
501512

502513
void
503514
TraceManager::Trace::StartSpan(
504-
std::string span_key, TRITONSERVER_InferenceTrace* trace,
515+
TRITONSERVER_InferenceTrace* trace,
505516
TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
506517
uint64_t trace_id)
507518
{
508519
uint64_t parent_id;
509520
LOG_TRITONSERVER_ERROR(
510521
TRITONSERVER_InferenceTraceParentId(trace, &parent_id),
511522
"getting trace parent id");
512-
std::string parent_span_key = "";
523+
auto span_parent_id = parent_id;
513524

514525
// Currently, only 2 types of sub-spans are supported:
515526
// request span and compute span. Compute span is a leaf span
@@ -521,16 +532,9 @@ TraceManager::Trace::StartSpan(
521532
// If parent_id > 0, then this is a child trace, spawned from
522533
// the ensemble's main request. For this instance, the parent
523534
// span is the ensemble's request span.
524-
if (parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) {
525-
parent_span_key = kRootSpan;
526-
} else if (activity == TRITONSERVER_TRACE_REQUEST_START) {
527-
// [FIXME] For BLS requests parent span for children's request spans
528-
// should be parent model's compute span. Currently,
529-
// this won't work, since parent's compute span will be created
530-
// only after children's spans are created.
531-
parent_span_key = kRequestSpan + std::to_string(parent_id);
532-
} else if (activity == TRITONSERVER_TRACE_COMPUTE_START) {
533-
parent_span_key = kRequestSpan + std::to_string(trace_id);
535+
if ((parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START) ||
536+
(activity == TRITONSERVER_TRACE_COMPUTE_START)) {
537+
span_parent_id = trace_id;
534538
}
535539

536540
std::string display_name = "compute";
@@ -542,7 +546,7 @@ TraceManager::Trace::StartSpan(
542546
display_name = model_name;
543547
}
544548

545-
auto span = StartSpan(display_name, timestamp_ns, parent_span_key);
549+
auto span = StartSpan(display_name, timestamp_ns, span_parent_id);
546550

547551
if (activity == TRITONSERVER_TRACE_REQUEST_START) {
548552
int64_t model_version;
@@ -564,14 +568,13 @@ TraceManager::Trace::StartSpan(
564568
PrepareTraceContext(span, &buffer);
565569
TRITONSERVER_InferenceTraceSetContext(trace, buffer.Contents().c_str());
566570
}
567-
568-
otel_context_ = otel_context_.SetValue(span_key, span);
571+
span_stacks_[trace_id]->emplace(span);
569572
}
570573

571574
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>
572575
TraceManager::Trace::StartSpan(
573576
std::string display_name, const uint64_t& raw_timestamp_ns,
574-
std::string parent_span_key)
577+
uint64_t trace_id)
575578
{
576579
otel_trace_api::StartSpanOptions options;
577580
options.kind = otel_trace_api::SpanKind::kServer;
@@ -580,45 +583,37 @@ TraceManager::Trace::StartSpan(
580583
options.start_steady_time =
581584
otel_common::SteadyTimestamp{std::chrono::nanoseconds{raw_timestamp_ns}};
582585

583-
// If the new span is a child span, we need to retrieve its parent from
584-
// the context and provide it through StartSpanOptions to the child span
585-
if (!parent_span_key.empty() && otel_context_.HasKey(parent_span_key)) {
586-
auto parent_span = opentelemetry::nostd::get<
587-
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
588-
otel_context_.GetValue(parent_span_key));
589-
options.parent = parent_span->GetContext();
586+
// If the new span is a child span, we need to retrieve its parent and
587+
// provide it through StartSpanOptions to the child span
588+
if (span_stacks_.find(trace_id) != span_stacks_.end() &&
589+
!span_stacks_[trace_id]->empty()) {
590+
options.parent = span_stacks_[trace_id]->top()->GetContext();
590591
}
591592
auto provider = opentelemetry::trace::Provider::GetTracerProvider();
592593
return provider->GetTracer(kTritonTracer)->StartSpan(display_name, options);
593594
}
594595

595596
void
596-
TraceManager::Trace::EndSpan(std::string span_key)
597+
TraceManager::Trace::EndSpan(uint64_t trace_id)
597598
{
598599
auto timestamp_ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
599600
std::chrono::steady_clock::now().time_since_epoch())
600601
.count();
601-
EndSpan(span_key, timestamp_ns);
602+
EndSpan(timestamp_ns, trace_id);
602603
}
603604

604605

605606
void
606607
TraceManager::Trace::EndSpan(
607-
std::string span_key, const uint64_t& raw_timestamp_ns)
608+
const uint64_t& raw_timestamp_ns, uint64_t trace_id)
608609
{
609-
if (otel_context_.HasKey(span_key)) {
610-
auto span = opentelemetry::nostd::get<
611-
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
612-
otel_context_.GetValue(span_key));
613-
614-
if (span == nullptr) {
615-
return;
616-
}
617-
610+
if (span_stacks_.find(trace_id) != span_stacks_.end() &&
611+
!span_stacks_[trace_id]->empty()) {
618612
otel_trace_api::EndSpanOptions end_options;
619613
end_options.end_steady_time = otel_common::SteadyTimestamp{
620614
std::chrono::nanoseconds{raw_timestamp_ns}};
621-
span->End(end_options);
615+
span_stacks_[trace_id]->top()->End(end_options);
616+
span_stacks_[trace_id]->pop();
622617
}
623618
}
624619

@@ -630,79 +625,46 @@ TraceManager::Trace::ReportToOpenTelemetry(
630625
uint64_t id;
631626
LOG_TRITONSERVER_ERROR(
632627
TRITONSERVER_InferenceTraceId(trace, &id), "getting trace id");
633-
634-
auto current_span_key = GetSpanKeyForActivity(activity, id);
635-
if (current_span_key.empty()) {
636-
return;
628+
if (span_stacks_.find(id) == span_stacks_.end()) {
629+
std::unique_ptr<
630+
std::stack<opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>>
631+
st(new std::stack<
632+
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>());
633+
span_stacks_.emplace(id, std::move(st));
637634
}
638635

639-
AddEvent(current_span_key, trace, activity, timestamp_ns, id);
640-
}
641-
642-
std::string
643-
TraceManager::Trace::GetSpanKeyForActivity(
644-
TRITONSERVER_InferenceTraceActivity activity, uint64_t trace_id)
645-
{
646-
std::string span_name;
647-
switch (activity) {
648-
case TRITONSERVER_TRACE_REQUEST_START:
649-
case TRITONSERVER_TRACE_QUEUE_START:
650-
case TRITONSERVER_TRACE_REQUEST_END: {
651-
span_name = kRequestSpan + std::to_string(trace_id);
652-
break;
653-
}
654-
655-
case TRITONSERVER_TRACE_COMPUTE_START:
656-
case TRITONSERVER_TRACE_COMPUTE_INPUT_END:
657-
case TRITONSERVER_TRACE_COMPUTE_OUTPUT_START:
658-
case TRITONSERVER_TRACE_COMPUTE_END: {
659-
span_name = kComputeSpan + std::to_string(trace_id);
660-
break;
661-
}
662-
case TRITONSERVER_TRACE_TENSOR_QUEUE_INPUT:
663-
case TRITONSERVER_TRACE_TENSOR_BACKEND_INPUT:
664-
case TRITONSERVER_TRACE_TENSOR_BACKEND_OUTPUT:
665-
default: {
666-
LOG_ERROR << "Unsupported activity: "
667-
<< TRITONSERVER_InferenceTraceActivityString(activity);
668-
span_name = "";
669-
break;
670-
}
671-
}
672-
673-
return span_name;
636+
AddEvent(trace, activity, timestamp_ns, id);
674637
}
675638

676639
void
677640
TraceManager::Trace::AddEvent(
678-
std::string span_key, TRITONSERVER_InferenceTrace* trace,
641+
TRITONSERVER_InferenceTrace* trace,
679642
TRITONSERVER_InferenceTraceActivity activity, uint64_t timestamp_ns,
680-
uint64_t id)
643+
uint64_t trace_id)
681644
{
682645
if (activity == TRITONSERVER_TRACE_REQUEST_START ||
683646
activity == TRITONSERVER_TRACE_COMPUTE_START) {
684-
StartSpan(span_key, trace, activity, timestamp_ns, id);
647+
StartSpan(trace, activity, timestamp_ns, trace_id);
685648
}
686649

687650
AddEvent(
688-
span_key, TRITONSERVER_InferenceTraceActivityString(activity),
689-
timestamp_ns);
651+
TRITONSERVER_InferenceTraceActivityString(activity), timestamp_ns,
652+
trace_id);
690653

691654
if (activity == TRITONSERVER_TRACE_REQUEST_END ||
692655
activity == TRITONSERVER_TRACE_COMPUTE_END) {
693-
EndSpan(span_key, timestamp_ns);
656+
EndSpan(timestamp_ns, trace_id);
694657
}
695658
}
696659

697660
void
698661
TraceManager::Trace::AddEvent(
699-
std::string span_key, std::string event, uint64_t timestamp)
662+
const std::string& event, uint64_t timestamp, uint64_t trace_id)
700663
{
701-
if (otel_context_.HasKey(span_key)) {
702-
auto span = opentelemetry::nostd::get<
703-
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
704-
otel_context_.GetValue(span_key));
705-
span->AddEvent(event, time_offset_ + std::chrono::nanoseconds{timestamp});
664+
if (span_stacks_.find(trace_id) != span_stacks_.end() &&
665+
!span_stacks_[trace_id]->empty()) {
666+
span_stacks_[trace_id]->top()->AddEvent(
667+
event, time_offset_ + std::chrono::nanoseconds{timestamp});
706668
}
707669
}
708670

0 commit comments

Comments
 (0)