@@ -107,6 +107,14 @@ EnsembleDurations
107107GetTotalEnsembleDurations (const ServerSideStats& stats)
108108{
109109 EnsembleDurations result;
110+ // Calculate avg cache hit latency and cache miss latency for ensemble model
111+ // in case top level response caching is enabled.
112+ const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count ;
113+ const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count ;
114+ result.total_cache_hit_time_avg_us +=
115+ AverageDurationInUs (stats.cache_hit_time_ns , ensemble_cache_hit_cnt);
116+ result.total_cache_miss_time_avg_us +=
117+ AverageDurationInUs (stats.cache_miss_time_ns , ensemble_cache_miss_cnt);
110118 for (const auto & model_stats : stats.composing_models_stat ) {
111119 if (model_stats.second .composing_models_stat .empty ()) {
112120 // Cache hit count covers cache hits, not related to compute times
@@ -238,7 +246,6 @@ ReportServerSideStats(
238246 if (parser->ResponseCacheEnabled ()) {
239247 const uint64_t overhead_avg_us = GetOverheadDuration (
240248 cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
241-
242249 std::cout << " (overhead " << overhead_avg_us << " usec + "
243250 << " queue " << queue_avg_us << " usec + "
244251 << " cache hit/miss " << combined_cache_compute_avg_us
@@ -283,12 +290,18 @@ ReportServerSideStats(
283290 const uint64_t overhead_avg_us = GetOverheadDuration (
284291 cumm_avg_us, ensemble_times.total_queue_time_avg_us ,
285292 ensemble_times.total_combined_cache_compute_time_avg_us );
286- std::cout << " (overhead " << overhead_avg_us << " usec + "
287- << " queue " << ensemble_times.total_queue_time_avg_us
288- << " usec + "
289- << " cache hit/miss "
290- << ensemble_times.total_combined_cache_compute_time_avg_us
291- << " usec)" << std::endl;
 293+ // FIXME - Refactor these calculations for the case where ensemble top-level
 294+ // response caching is enabled
295+ if (!parser->TopLevelResponseCachingEnabled ()) {
296+ std::cout << " (overhead " << overhead_avg_us << " usec + "
297+ << " queue " << ensemble_times.total_queue_time_avg_us
298+ << " usec + "
299+ << " cache hit/miss "
300+ << ensemble_times.total_combined_cache_compute_time_avg_us
301+ << " usec)" << std::endl;
302+ } else {
303+ std::cout << std::endl;
304+ }
292305 std::cout << ident << ident << " Average Cache Hit Latency: "
293306 << ensemble_times.total_cache_hit_time_avg_us << " usec"
294307 << std::endl;
@@ -1550,6 +1563,21 @@ InferenceProfiler::DetermineStatsModelVersion(
15501563 return cb::Error::Success;
15511564}
15521565
1566+ // Only for unit-testing
1567+ #ifndef DOCTEST_CONFIG_DISABLE
1568+ cb::Error
1569+ InferenceProfiler::SetTopLevelResponseCaching (
1570+ bool enable_top_level_response_caching)
1571+ {
1572+ parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
1573+ if (parser_ == nullptr ) {
1574+ return cb::Error (" Failed to initialize ModelParser" );
1575+ }
1576+ parser_->SetTopLevelResponseCaching (enable_top_level_response_caching);
1577+ return cb::Error::Success;
1578+ }
1579+ #endif
1580+
15531581cb::Error
15541582InferenceProfiler::SummarizeServerStats (
15551583 const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1605,8 +1633,20 @@ InferenceProfiler::SummarizeServerStatsHelper(
16051633
16061634 const auto & end_itr = end_status.find (this_id);
16071635 if (end_itr == end_status.end ()) {
1608- return cb::Error (
1609- " missing statistics for requested model" , pa::GENERIC_ERROR);
 1636+ // For ensemble models with top-level response caching enabled, the
 1637+ // composing models' statistics are unavailable on a cache hit, because the
 1638+ // scheduler returns the cached response and the composing models are not
 1639+ // executed. This is a valid scenario and should not be reported as an error.
1640+ bool stats_not_found_and_invalid =
1641+ model_version == -1 && !parser_->TopLevelResponseCachingEnabled ();
1642+ if (stats_not_found_and_invalid) {
1643+ return cb::Error (
1644+ " missing statistics for requested model" , pa::GENERIC_ERROR);
1645+ } else {
1646+ // Setting server stats 0 for composing model in case of ensemble request
1647+ // cache hit since the composing model will not be executed
1648+ server_stats->Reset ();
1649+ }
16101650 } else {
16111651 uint64_t start_infer_cnt = 0 ;
16121652 uint64_t start_exec_cnt = 0 ;
0 commit comments