
Commit fddba6d

lkomalinnshah1 authored and committed
Changes to support Ensemble Top Level Response Caching (#560)
1 parent c822917 · commit fddba6d

File tree: 5 files changed, +130 −10 lines


src/c++/perf_analyzer/inference_profiler.cc

Lines changed: 49 additions & 9 deletions
@@ -107,6 +107,14 @@ EnsembleDurations
 GetTotalEnsembleDurations(const ServerSideStats& stats)
 {
   EnsembleDurations result;
+  // Calculate avg cache hit latency and cache miss latency for ensemble model
+  // in case top level response caching is enabled.
+  const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
+  const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
+  result.total_cache_hit_time_avg_us +=
+      AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
+  result.total_cache_miss_time_avg_us +=
+      AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
   for (const auto& model_stats : stats.composing_models_stat) {
     if (model_stats.second.composing_models_stat.empty()) {
       // Cache hit count covers cache hits, not related to compute times
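For readers unfamiliar with the helper used above: the two averages are simply the accumulated hit/miss time divided by the corresponding count, converted from nanoseconds to microseconds. A minimal sketch of the assumed behavior of AverageDurationInUs follows (the real helper is defined elsewhere in inference_profiler.cc; the zero-count guard and integer return type here are assumptions):

#include <cstdint>

// Sketch only: assumed semantics of the averaging helper called above.
// Returns the average duration in microseconds, or 0 when no requests of
// that kind (e.g. no ensemble cache hits) were recorded.
static uint64_t
AverageDurationInUs(const uint64_t total_time_ns, const uint64_t count)
{
  if (count == 0) {
    return 0;
  }
  return total_time_ns / (count * 1000);
}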
@@ -238,7 +246,6 @@ ReportServerSideStats(
     if (parser->ResponseCacheEnabled()) {
       const uint64_t overhead_avg_us = GetOverheadDuration(
           cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
-
       std::cout << " (overhead " << overhead_avg_us << " usec + "
                 << "queue " << queue_avg_us << " usec + "
                 << "cache hit/miss " << combined_cache_compute_avg_us
@@ -283,12 +290,18 @@ ReportServerSideStats(
       const uint64_t overhead_avg_us = GetOverheadDuration(
           cumm_avg_us, ensemble_times.total_queue_time_avg_us,
           ensemble_times.total_combined_cache_compute_time_avg_us);
-      std::cout << " (overhead " << overhead_avg_us << " usec + "
-                << "queue " << ensemble_times.total_queue_time_avg_us
-                << " usec + "
-                << "cache hit/miss "
-                << ensemble_times.total_combined_cache_compute_time_avg_us
-                << " usec)" << std::endl;
+      // FIXME - Refactor these calculations when ensemble top level response
+      // caching is enabled
+      if (!parser->TopLevelResponseCachingEnabled()) {
+        std::cout << " (overhead " << overhead_avg_us << " usec + "
+                  << "queue " << ensemble_times.total_queue_time_avg_us
+                  << " usec + "
+                  << "cache hit/miss "
+                  << ensemble_times.total_combined_cache_compute_time_avg_us
+                  << " usec)" << std::endl;
+      } else {
+        std::cout << std::endl;
+      }
       std::cout << ident << ident << " Average Cache Hit Latency: "
                 << ensemble_times.total_cache_hit_time_avg_us << " usec"
                 << std::endl;
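The overhead value printed in this block is derived from the other averages rather than measured directly. A minimal sketch of the assumed relationship implemented by GetOverheadDuration (the actual helper is defined elsewhere in this file; the clamp to zero is an assumption):

#include <cstdint>

// Sketch only: overhead is whatever part of the cumulative average is not
// explained by queueing or cache/compute time, clamped at zero so rounding
// of the per-component averages cannot produce an underflow.
static uint64_t
GetOverheadDuration(
    const uint64_t total_avg_us, const uint64_t queue_avg_us,
    const uint64_t cache_compute_avg_us)
{
  const uint64_t accounted_us = queue_avg_us + cache_compute_avg_us;
  return (total_avg_us > accounted_us) ? (total_avg_us - accounted_us) : 0;
}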
@@ -1550,6 +1563,21 @@ InferenceProfiler::DetermineStatsModelVersion(
   return cb::Error::Success;
 }
 
+// Only for unit-testing
+#ifndef DOCTEST_CONFIG_DISABLE
+cb::Error
+InferenceProfiler::SetTopLevelResponseCaching(
+    bool enable_top_level_response_caching)
+{
+  parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
+  if (parser_ == nullptr) {
+    return cb::Error("Failed to initialize ModelParser");
+  }
+  parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
+  return cb::Error::Success;
+}
+#endif
+
 cb::Error
 InferenceProfiler::SummarizeServerStats(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1605,8 +1633,20 @@ InferenceProfiler::SummarizeServerStatsHelper(
 
   const auto& end_itr = end_status.find(this_id);
   if (end_itr == end_status.end()) {
-    return cb::Error(
-        "missing statistics for requested model", pa::GENERIC_ERROR);
+    // In case of ensemble models, if top level response caching is enabled,
+    // the composing models' statistics are unavailable on a cache hit,
+    // because the scheduler returns the cached response and the composing
+    // models are not executed. This is a valid scenario, not an error.
+    bool stats_not_found_and_invalid =
+        model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
+    if (stats_not_found_and_invalid) {
+      return cb::Error(
+          "missing statistics for requested model", pa::GENERIC_ERROR);
+    } else {
+      // Set server stats to 0 for the composing model on an ensemble request
+      // cache hit, since the composing model will not be executed.
+      server_stats->Reset();
+    }
   } else {
     uint64_t start_infer_cnt = 0;
     uint64_t start_exec_cnt = 0;
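Restated outside the diff for clarity, the branch above only treats missing end-of-measurement statistics as fatal when the model version could not be resolved and top level response caching is off. The helper name below is illustrative, not part of the commit:

#include <cstdint>

// Sketch: with top level response caching enabled, a composing model may
// legitimately have no statistics because the ensemble request was answered
// from the response cache, so the profiler zeroes its stats instead of
// failing.
static bool
MissingStatsIsError(
    const int64_t model_version, const bool top_level_caching_enabled)
{
  return model_version == -1 && !top_level_caching_enabled;
}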

src/c++/perf_analyzer/inference_profiler.h

Lines changed: 29 additions & 0 deletions
@@ -52,6 +52,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
+class ModelParser;
 #endif
 
 /// Constant parameters that determine the whether stopping criteria has met
@@ -119,6 +120,28 @@ struct ServerSideStats {
   uint64_t cache_miss_time_ns;
 
   std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
+  // This function sets composing model server stats to 0 in case of a cache hit
+  // when top level response cache is enabled, since composing models are not
+  // executed and do not have any stats
+  void Reset()
+  {
+    inference_count = 0;
+    execution_count = 0;
+    success_count = 0;
+    queue_count = 0;
+    compute_input_count = 0;
+    compute_infer_count = 0;
+    compute_output_count = 0;
+    cumm_time_ns = 0;
+    queue_time_ns = 0;
+    compute_input_time_ns = 0;
+    compute_infer_time_ns = 0;
+    compute_output_time_ns = 0;
+    cache_hit_count = 0;
+    cache_hit_time_ns = 0;
+    cache_miss_count = 0;
+    cache_miss_time_ns = 0;
+  }
 };
 
 /// Holds the statistics recorded at the client side.
@@ -553,12 +576,17 @@ class InferenceProfiler {
   /// measurement
   /// \param end_stats The stats for all models at the end of the measurement
   /// \param model_version The determined model version
+
   cb::Error DetermineStatsModelVersion(
       const cb::ModelIdentifier& model_identifier,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
       int64_t* model_version);
 
+#ifndef DOCTEST_CONFIG_DISABLE
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching);
+#endif
+
   /// \param start_status The model status at the start of the measurement.
   /// \param end_status The model status at the end of the measurement.
   /// \param server_stats Returns the summary that the fields recorded by server
@@ -761,6 +789,7 @@ class InferenceProfiler {
 #ifndef DOCTEST_CONFIG_DISABLE
   friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
+  friend ModelParser;
 
  public:
   InferenceProfiler() = default;

src/c++/perf_analyzer/model_parser.cc

Lines changed: 4 additions & 0 deletions
@@ -169,6 +169,10 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
+  if (cache_itr != config.MemberEnd()) {
+    top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
+  }
+
   return cb::Error::Success;
 }
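For context, cache_itr is the iterator obtained earlier in InitTriton from the model configuration JSON returned by the server. A small self-contained sketch of the lookup pattern, assuming the iterator comes from config.FindMember("response_cache") and using a representative, hypothetical config document:

#include <rapidjson/document.h>

// Sketch only: a model config in which both response_cache_enabled_ and the
// new top_level_response_caching_enabled_ flag would end up true for the
// requested (top level) model. The surrounding fields are illustrative.
static const char* kExampleConfig = R"({
  "name": "ensemble_model",
  "platform": "ensemble",
  "response_cache": { "enable": true }
})";

void
ExampleResponseCacheLookup()
{
  rapidjson::Document config;
  config.Parse(kExampleConfig);
  const auto cache_itr = config.FindMember("response_cache");
  if (cache_itr != config.MemberEnd()) {
    // Mirrors the parsing above: read the "enable" boolean from the
    // response_cache block.
    const bool enabled = cache_itr->value["enable"].GetBool();
    (void)enabled;
  }
}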

src/c++/perf_analyzer/model_parser.h

Lines changed: 22 additions & 1 deletion
@@ -35,6 +35,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class TestModelParser;
 class MockModelParser;
+class InferenceProfiler;
 #endif
 
 struct ModelTensor {
@@ -73,7 +74,8 @@ class ModelParser {
         outputs_(std::make_shared<ModelTensorMap>()),
         composing_models_map_(std::make_shared<ComposingModelMap>()),
         scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
-        response_cache_enabled_(false)
+        response_cache_enabled_(false),
+        top_level_response_caching_enabled_(false)
   {
   }
@@ -151,6 +153,22 @@ class ModelParser {
   /// model
   bool ResponseCacheEnabled() const { return response_cache_enabled_; }
 
+  /// Returns whether or not top level response caching is enabled for this model
+  /// \return the truth value of whether top level response caching is enabled
+  /// for this model
+  bool TopLevelResponseCachingEnabled() const
+  {
+    return top_level_response_caching_enabled_;
+  }
+
+  /// Only for testing
+#ifndef DOCTEST_CONFIG_DISABLE
+  void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    top_level_response_caching_enabled_ = enable_top_level_response_caching;
+  }
+#endif
+
   /// Get the details about the model inputs.
   /// \return The map with tensor_name and the tensor details
   /// stored as key-value pair.
@@ -169,6 +187,7 @@ class ModelParser {
     return composing_models_map_;
   }
 
+
  protected:
   ModelSchedulerType scheduler_type_;
   bool is_decoupled_;
@@ -220,10 +239,12 @@ class ModelParser {
   std::string model_signature_name_;
   size_t max_batch_size_;
   bool response_cache_enabled_;
+  bool top_level_response_caching_enabled_;
 
 #ifndef DOCTEST_CONFIG_DISABLE
   friend TestModelParser;
   friend MockModelParser;
+  friend InferenceProfiler;
 
  public:
   ModelParser() = default;

src/c++/perf_analyzer/test_inference_profiler.cc

Lines changed: 26 additions & 0 deletions
@@ -160,8 +160,15 @@ class TestInferenceProfiler : public InferenceProfiler {
     return InferenceProfiler::DetermineStatsModelVersion(
         model_identifier, start_stats, end_stats, model_version);
   }
+
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    return InferenceProfiler::SetTopLevelResponseCaching(
+        enable_top_level_response_caching);
+  }
 };
 
+
 TEST_CASE("testing the ValidLatencyMeasurement function")
 {
   size_t valid_sequence_count{};
@@ -850,6 +857,25 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
     expect_exception = true;
   }
 
+  SUBCASE("One entry - version -1 - valid and in start")
+  {
+    model_identifier = {"ModelA", "-1"};
+    start_stats_map.insert({{"ModelA", "3"}, old_stats});
+    end_stats_map.insert({{"ModelA", "3"}, new_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(true);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+  }
+
+  SUBCASE("One entry - version -1 - not valid")
+  {
+    model_identifier = {"ModelA", "-1"};
+    end_stats_map.insert({{"ModelA", "3"}, old_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(false);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+    expect_exception = true;
+  }
 
   std::stringstream captured_cerr;
   std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());
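Not part of this commit, but a natural companion check: a minimal doctest-style sketch of how the new ServerSideStats::Reset() helper could be verified. The test name and the chosen field values are hypothetical.

TEST_CASE("ServerSideStats::Reset zeroes all counters")
{
  triton::perfanalyzer::ServerSideStats stats;
  stats.inference_count = 5;
  stats.execution_count = 4;
  stats.cache_hit_count = 3;
  stats.cumm_time_ns = 1000;

  // Reset() is what SummarizeServerStatsHelper relies on when a composing
  // model has no statistics because the ensemble request hit the cache.
  stats.Reset();

  CHECK(stats.inference_count == 0);
  CHECK(stats.execution_count == 0);
  CHECK(stats.cache_hit_count == 0);
  CHECK(stats.cumm_time_ns == 0);
}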
