Skip to content

Commit a07169e

Browse files
indrajit96, Kevin Wang, krishung5
authored
feat: Metrics API enhanced for Failure counters (#377)
Enhances the existing metrics API to report failure reasons and failure counts for inference requests. --------- Co-authored-by: Kevin Wang <[email protected]> Co-authored-by: krishung5 <[email protected]>
1 parent 627eb5a commit a07169e

File tree

10 files changed

+147
-50
lines changed

10 files changed

+147
-50
lines changed

src/backend_model_instance.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "backend_config.h"
3737
#include "backend_model.h"
3838
#include "cuda_utils.h"
39+
#include "infer_stats.h"
3940
#include "metrics.h"
4041
#include "model_config.pb.h"
4142
#include "numa_utils.h"
@@ -558,7 +559,8 @@ TritonModelInstance::PrepareRequestsOrRespond(
558559
// If any errors occurred, respond with error for each request.
559560
if (!status.IsOk()) {
560561
for (auto& r : requests) {
561-
InferenceRequest::RespondIfError(r, status, true /* release_requests */);
562+
InferenceRequest::RespondIfError(
563+
r, status, true /* release_requests */, FailureReason::OTHER);
562564
}
563565
// Log a single error for batch of requests for better visibility
564566
LOG_STATUS_ERROR(status, "Requests failed pre-execution checks");
@@ -685,7 +687,16 @@ TritonModelInstance::Execute(
685687
for (TRITONBACKEND_Request* tr : triton_requests) {
686688
std::unique_ptr<InferenceRequest> ur(
687689
reinterpret_cast<InferenceRequest*>(tr));
688-
InferenceRequest::RespondIfError(ur, status, true /* release_requests */);
690+
// NOTE: If a backend both returns an error in
691+
// TRITONBACKEND_ModelInstanceExecute and reports an error with
692+
// TRITONBACKEND_ModelInstanceReportStatistics, this can result in double
693+
// counting of the failure metric for the same request. However, it is
694+
// currently not expected for this to be a common case, as the return
695+
// value of TRITONBACKEND_ModelInstanceExecute is used to express
696+
// ownership of the request rather than success of an inference request.
697+
// See tritonbackend.h for more details on this.
698+
InferenceRequest::RespondIfError(
699+
ur, status, true /* release_requests */, FailureReason::BACKEND);
689700
}
690701

691702
TRITONSERVER_ErrorDelete(err);

src/dynamic_batch_scheduler.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,12 @@ IsStaleState(Payload::State payload_state)
5050
void
5151
FinishSkippedRequests(
5252
std::vector<std::deque<std::unique_ptr<InferenceRequest>>>&& requests,
53-
const Status& response_status)
53+
const Status& response_status, FailureReason reason)
5454
{
5555
for (auto& queue : requests) {
5656
for (auto& request : queue) {
57-
InferenceRequest::RespondIfError(request, response_status, true);
57+
InferenceRequest::RespondIfError(
58+
request, response_status, true /* release_requests */, reason);
5859
}
5960
}
6061
}
@@ -69,8 +70,10 @@ FinishRejectedCancelledRequests(
6970
const static Status rejected_status =
7071
Status(Status::Code::UNAVAILABLE, "Request timeout expired");
7172
const static Status cancelled_status = Status(Status::Code::CANCELLED);
72-
FinishSkippedRequests(std::move(rejected_requests), rejected_status);
73-
FinishSkippedRequests(std::move(cancelled_requests), cancelled_status);
73+
FinishSkippedRequests(
74+
std::move(rejected_requests), rejected_status, FailureReason::REJECTED);
75+
FinishSkippedRequests(
76+
std::move(cancelled_requests), cancelled_status, FailureReason::CANCELED);
7477
}
7578

7679
DynamicBatchScheduler::DynamicBatchScheduler(

src/ensemble_scheduler/ensemble_scheduler.cc

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -81,23 +81,26 @@ class RequestTracker {
8181
std::lock_guard<std::mutex> lk(mtx_);
8282
inflight_request_counter_--;
8383
if (inflight_request_counter_ == 0) {
84+
if (request_ != nullptr) {
8485
#ifdef TRITON_ENABLE_STATS
85-
const auto& infer_stats = context_stats_aggregator_.ImmutableInferStats();
86-
request_->ReportStatisticsWithDuration(
87-
metric_reporter_, status_.IsOk(), compute_start_ns_,
88-
infer_stats.compute_input_duration_ns_,
89-
infer_stats.compute_infer_duration_ns_,
90-
infer_stats.compute_output_duration_ns_);
91-
if (status_.IsOk()) {
92-
stats_aggregator_->UpdateInferBatchStatsWithDuration(
93-
metric_reporter_, std::max(1U, request_->BatchSize()),
86+
const auto& infer_stats =
87+
context_stats_aggregator_.ImmutableInferStats();
88+
request_->ReportStatisticsWithDuration(
89+
metric_reporter_, status_.IsOk(), compute_start_ns_,
9490
infer_stats.compute_input_duration_ns_,
9591
infer_stats.compute_infer_duration_ns_,
9692
infer_stats.compute_output_duration_ns_);
97-
}
93+
if (status_.IsOk()) {
94+
stats_aggregator_->UpdateInferBatchStatsWithDuration(
95+
metric_reporter_, std::max(1U, request_->BatchSize()),
96+
infer_stats.compute_input_duration_ns_,
97+
infer_stats.compute_infer_duration_ns_,
98+
infer_stats.compute_output_duration_ns_);
99+
}
98100
#endif
99-
InferenceRequest::Release(
100-
std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL);
101+
InferenceRequest::Release(
102+
std::move(request_), TRITONSERVER_REQUEST_RELEASE_ALL);
103+
}
101104
}
102105
return (inflight_request_counter_ == 0);
103106
}
@@ -1136,7 +1139,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr<InferenceResponse>&& response)
11361139
"more "
11371140
"ensemble steps can be made");
11381141
InferenceRequest::RespondIfError(
1139-
request_tracker_->Request(), ensemble_status_);
1142+
request_tracker_->Request(), ensemble_status_,
1143+
false /* release_requests */, FailureReason::OTHER);
11401144
} else {
11411145
request_tracker_->Request()->ResponseFactory()->SendFlags(
11421146
TRITONSERVER_RESPONSE_COMPLETE_FINAL);
@@ -1149,7 +1153,8 @@ EnsembleContext::FinishEnsemble(std::unique_ptr<InferenceResponse>&& response)
11491153
ensemble_status_);
11501154
} else {
11511155
InferenceRequest::RespondIfError(
1152-
request_tracker_->Request(), ensemble_status_);
1156+
request_tracker_->Request(), ensemble_status_,
1157+
false /* release_requests */, FailureReason::OTHER);
11531158
}
11541159
}
11551160

src/infer_request.cc

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -421,10 +421,25 @@ InferenceRequest::Run(std::unique_ptr<InferenceRequest>& request)
421421
return status;
422422
}
423423

424+
FailureReason
425+
stringToFailureReason(const std::string& error_type)
426+
{
427+
if (error_type == "REJECTED") {
428+
return FailureReason::REJECTED;
429+
}
430+
if (error_type == "CANCELED") {
431+
return FailureReason::CANCELED;
432+
}
433+
if (error_type == "BACKEND") {
434+
return FailureReason::BACKEND;
435+
}
436+
return FailureReason::OTHER;
437+
}
438+
424439
void
425440
InferenceRequest::RespondIfError(
426441
std::unique_ptr<InferenceRequest>& request, const Status& status,
427-
const bool release_request)
442+
const bool release_request, FailureReason reason)
428443
{
429444
if (status.IsOk()) {
430445
return;
@@ -442,7 +457,10 @@ InferenceRequest::RespondIfError(
442457
InferenceResponse::SendWithStatus(
443458
std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL, status),
444459
(request->LogRequest() + "failed to send error response").c_str());
445-
460+
#ifdef TRITON_ENABLE_STATS
461+
request->ReportErrorStatistics(
462+
request->model_raw_->MetricReporter().get(), reason);
463+
#endif
446464
// If releasing the request then invoke the release callback which
447465
// gives ownership to the callback. So can't access 'request' after
448466
// this point.
@@ -452,20 +470,6 @@ InferenceRequest::RespondIfError(
452470
}
453471
}
454472

455-
void
456-
InferenceRequest::RespondIfError(
457-
std::vector<std::unique_ptr<InferenceRequest>>& requests,
458-
const Status& status, const bool release_requests)
459-
{
460-
if (status.IsOk()) {
461-
return;
462-
}
463-
464-
for (auto& request : requests) {
465-
RespondIfError(request, status, release_requests);
466-
}
467-
}
468-
469473
Status
470474
InferenceRequest::Release(
471475
std::unique_ptr<InferenceRequest>&& request, const uint32_t release_flags)
@@ -1371,6 +1375,21 @@ InferenceRequest::ValidateBytesInputs(
13711375
}
13721376

13731377
#ifdef TRITON_ENABLE_STATS
1378+
1379+
void
1380+
InferenceRequest::ReportErrorStatistics(
1381+
MetricModelReporter* metric_reporter, FailureReason reason)
1382+
{
1383+
INFER_STATS_DECL_TIMESTAMP(request_end_ns);
1384+
model_raw_->MutableStatsAggregator()->UpdateFailure(
1385+
metric_reporter, request_start_ns_, request_end_ns, reason);
1386+
if (secondary_stats_aggregator_ != nullptr) {
1387+
secondary_stats_aggregator_->UpdateFailure(
1388+
nullptr /* metric_reporter */, request_start_ns_, request_end_ns,
1389+
reason);
1390+
}
1391+
}
1392+
13741393
void
13751394
InferenceRequest::ReportStatistics(
13761395
MetricModelReporter* metric_reporter, bool success,
@@ -1407,10 +1426,12 @@ InferenceRequest::ReportStatistics(
14071426
}
14081427
} else {
14091428
model_raw_->MutableStatsAggregator()->UpdateFailure(
1410-
metric_reporter, request_start_ns_, request_end_ns);
1429+
metric_reporter, request_start_ns_, request_end_ns,
1430+
FailureReason::BACKEND);
14111431
if (secondary_stats_aggregator_ != nullptr) {
14121432
secondary_stats_aggregator_->UpdateFailure(
1413-
nullptr /* metric_reporter */, request_start_ns_, request_end_ns);
1433+
nullptr /* metric_reporter */, request_start_ns_, request_end_ns,
1434+
FailureReason::BACKEND);
14141435
}
14151436
}
14161437
}
@@ -1443,10 +1464,12 @@ InferenceRequest::ReportStatisticsWithDuration(
14431464
}
14441465
} else {
14451466
model_raw_->MutableStatsAggregator()->UpdateFailure(
1446-
metric_reporter, request_start_ns_, request_end_ns);
1467+
metric_reporter, request_start_ns_, request_end_ns,
1468+
FailureReason::OTHER);
14471469
if (secondary_stats_aggregator_ != nullptr) {
14481470
secondary_stats_aggregator_->UpdateFailure(
1449-
nullptr /* metric_reporter */, request_start_ns_, request_end_ns);
1471+
nullptr /* metric_reporter */, request_start_ns_, request_end_ns,
1472+
FailureReason::OTHER);
14501473
}
14511474
}
14521475
}
@@ -1850,5 +1873,4 @@ operator!=(
18501873
{
18511874
return !(lhs == rhs);
18521875
}
1853-
18541876
}} // namespace triton::core

src/infer_request.h

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,8 @@ class InferenceRequest {
590590
// 'release_request' is true 'request' is returned as nullptr.
591591
static void RespondIfError(
592592
std::unique_ptr<InferenceRequest>& request, const Status& status,
593-
const bool release_request = false);
593+
const bool release_request = false,
594+
FailureReason reason = FailureReason::OTHER);
594595

595596
// Send an error response to a set of 'requests'. If 'status' is
596597
// Success then no responses are sent and the requests are not
@@ -603,7 +604,8 @@ class InferenceRequest {
603604
// returned with all nullptrs.
604605
static void RespondIfError(
605606
std::vector<std::unique_ptr<InferenceRequest>>& requests,
606-
const Status& status, const bool release_requests = false);
607+
const Status& status, const bool release_requests = false,
608+
FailureReason reason = FailureReason::OTHER);
607609

608610
// Release the request. Call the release callback and transfer
609611
// ownership of the request to the callback. On return 'request' is
@@ -673,6 +675,16 @@ class InferenceRequest {
673675
const uint64_t compute_start_ns, const uint64_t compute_input_end_ns,
674676
const uint64_t compute_output_start_ns, const uint64_t compute_end_ns);
675677

678+
// Report the error statistics to stats collectors associated with the
679+
// request.
680+
// FIXME: A separate function may not be necessary here, but is being used
681+
// cautiously in case of unforeseen issues such as possibly capturing a trace
682+
// twice. This should be revisited and better tested to see if the
683+
// ReportStatistics function can be used as-is for the newly captured failure
684+
// cases.
685+
void ReportErrorStatistics(
686+
MetricModelReporter* metric_reporter, FailureReason reason);
687+
676688
// Report the statistics to stats collectors associated with the request.
677689
// Duration and timestamps provide two granularities for stats collectors.
678690
void ReportStatisticsWithDuration(

src/infer_stats.cc

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,28 @@ namespace triton { namespace core {
3636

3737
#ifdef TRITON_ENABLE_STATS
3838

39+
// This function converts FailureReason enum values to std::string
40+
std::string
41+
failureReasonToString(FailureReason reason)
42+
{
43+
switch (reason) {
44+
case FailureReason::REJECTED:
45+
return "REJECTED";
46+
case FailureReason::CANCELED:
47+
return "CANCELED";
48+
case FailureReason::BACKEND:
49+
return "BACKEND";
50+
case FailureReason::OTHER:
51+
return "OTHER";
52+
default:
53+
return "OTHER";
54+
}
55+
}
56+
3957
void
4058
InferenceStatsAggregator::UpdateFailure(
4159
MetricModelReporter* metric_reporter, const uint64_t request_start_ns,
42-
const uint64_t request_end_ns)
60+
const uint64_t request_end_ns, FailureReason reason)
4361
{
4462
std::lock_guard<std::mutex> lock(mu_);
4563

@@ -48,7 +66,8 @@ InferenceStatsAggregator::UpdateFailure(
4866

4967
#ifdef TRITON_ENABLE_METRICS
5068
if (metric_reporter != nullptr) {
51-
metric_reporter->IncrementCounter("inf_failure", 1);
69+
std::string reason_str = failureReasonToString(reason);
70+
metric_reporter->IncrementCounter("inf_failure_" + reason_str, 1);
5271
}
5372
#endif // TRITON_ENABLE_METRICS
5473
}

src/infer_stats.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939

4040
namespace triton { namespace core {
4141

42+
// Define the FailureReason enum within the triton::core namespace
43+
enum class FailureReason { REJECTED, CANCELED, BACKEND, OTHER };
44+
4245
class MetricModelReporter;
4346

4447

@@ -136,7 +139,7 @@ class InferenceStatsAggregator {
136139
// Add durations to Infer stats for a failed inference request.
137140
void UpdateFailure(
138141
MetricModelReporter* metric_reporter, const uint64_t request_start_ns,
139-
const uint64_t request_end_ns);
142+
const uint64_t request_end_ns, FailureReason reason);
140143

141144
// Add durations to infer stats for a successful inference request.
142145
void UpdateSuccess(

src/metric_model_reporter.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#ifdef TRITON_ENABLE_METRICS
3030

3131
#include "constants.h"
32+
#include "infer_stats.h"
3233
#include "triton/common/logging.h"
3334

3435
// Global config group has 'name' of empty string.
@@ -101,6 +102,13 @@ MetricReporterConfig::ParseQuantiles(std::string options)
101102
//
102103
// MetricModelReporter
103104
//
105+
const std::map<FailureReason, std::string>
106+
MetricModelReporter::failure_reasons_map = {
107+
{FailureReason::REJECTED, "REJECTED"},
108+
{FailureReason::CANCELED, "CANCELED"},
109+
{FailureReason::BACKEND, "BACKEND"},
110+
{FailureReason::OTHER, "OTHER"}};
111+
104112
Status
105113
MetricModelReporter::Create(
106114
const ModelIdentifier& model_id, const int64_t model_version,
@@ -189,7 +197,6 @@ MetricModelReporter::InitializeCounters(
189197
{
190198
// Always setup these counters, regardless of config
191199
counter_families_["inf_success"] = &Metrics::FamilyInferenceSuccess();
192-
counter_families_["inf_failure"] = &Metrics::FamilyInferenceFailure();
193200
counter_families_["inf_count"] = &Metrics::FamilyInferenceCount();
194201
counter_families_["inf_exec_count"] =
195202
&Metrics::FamilyInferenceExecutionCount();
@@ -227,6 +234,15 @@ MetricModelReporter::InitializeCounters(
227234
counters_[name] = CreateMetric<prometheus::Counter>(*family_ptr, labels);
228235
}
229236
}
237+
238+
// Initialize failure metrics with reasons
239+
for (const auto& reason_pair : failure_reasons_map) {
240+
std::map<std::string, std::string> extended_labels = labels;
241+
extended_labels["reason"] = reason_pair.second;
242+
counters_["inf_failure_" + reason_pair.second] =
243+
CreateMetric<prometheus::Counter>(
244+
Metrics::FamilyInferenceFailure(), extended_labels);
245+
}
230246
}
231247

232248
void

src/metric_model_reporter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ class MetricModelReporter {
9494
// Lookup summary metric by name, and observe the value if it exists.
9595
void ObserveSummary(const std::string& name, double value);
9696

97+
static const std::map<FailureReason, std::string> failure_reasons_map;
98+
9799
private:
98100
MetricModelReporter(
99101
const ModelIdentifier& model_id, const int64_t model_version,

0 commit comments

Comments
 (0)