
Commit c822917

ganeshku1 authored and nnshah1 committed
Revert "Changes to support Ensemble Top Level Response Caching (#560) (#642)"
This reverts commit cc6a3b2.
1 parent e46598b · commit c822917

File tree: 5 files changed, +10 -130 lines changed


src/c++/perf_analyzer/inference_profiler.cc

Lines changed: 9 additions & 49 deletions
@@ -107,14 +107,6 @@ EnsembleDurations
 GetTotalEnsembleDurations(const ServerSideStats& stats)
 {
   EnsembleDurations result;
-  // Calculate avg cache hit latency and cache miss latency for ensemble model
-  // in case top level response caching is enabled.
-  const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
-  const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
-  result.total_cache_hit_time_avg_us +=
-      AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
-  result.total_cache_miss_time_avg_us +=
-      AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
   for (const auto& model_stats : stats.composing_models_stat) {
     if (model_stats.second.composing_models_stat.empty()) {
       // Cache hit count covers cache hits, not related to compute times
@@ -246,6 +238,7 @@ ReportServerSideStats(
   if (parser->ResponseCacheEnabled()) {
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
+
     std::cout << " (overhead " << overhead_avg_us << " usec + "
               << "queue " << queue_avg_us << " usec + "
               << "cache hit/miss " << combined_cache_compute_avg_us
@@ -290,18 +283,12 @@ ReportServerSideStats(
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, ensemble_times.total_queue_time_avg_us,
         ensemble_times.total_combined_cache_compute_time_avg_us);
-    // FIXME - Refactor these calculations in case of ensemble top level
-    // response cache is enabled
-    if (!parser->TopLevelResponseCachingEnabled()) {
-      std::cout << " (overhead " << overhead_avg_us << " usec + "
-                << "queue " << ensemble_times.total_queue_time_avg_us
-                << " usec + "
-                << "cache hit/miss "
-                << ensemble_times.total_combined_cache_compute_time_avg_us
-                << " usec)" << std::endl;
-    } else {
-      std::cout << std::endl;
-    }
+    std::cout << " (overhead " << overhead_avg_us << " usec + "
+              << "queue " << ensemble_times.total_queue_time_avg_us
+              << " usec + "
+              << "cache hit/miss "
+              << ensemble_times.total_combined_cache_compute_time_avg_us
+              << " usec)" << std::endl;
     std::cout << ident << ident << " Average Cache Hit Latency: "
               << ensemble_times.total_cache_hit_time_avg_us << " usec"
               << std::endl;
@@ -1563,21 +1550,6 @@ InferenceProfiler::DetermineStatsModelVersion(
   return cb::Error::Success;
 }
 
-// Only for unit-testing
-#ifndef DOCTEST_CONFIG_DISABLE
-cb::Error
-InferenceProfiler::SetTopLevelResponseCaching(
-    bool enable_top_level_response_caching)
-{
-  parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
-  if (parser_ == nullptr) {
-    return cb::Error("Failed to initialize ModelParser");
-  }
-  parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
-  return cb::Error::Success;
-}
-#endif
-
 cb::Error
 InferenceProfiler::SummarizeServerStats(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1633,20 +1605,8 @@ InferenceProfiler::SummarizeServerStatsHelper(
 
   const auto& end_itr = end_status.find(this_id);
   if (end_itr == end_status.end()) {
-    // In case of ensemble models, if top level response caching is enabled,
-    // the composing models statistics are unavailable in case of a cache hit.
-    // This is due to the scheduler sends cache response and composing models do
-    // not get executed. It's a valid scenario and shouldn't throw error.
-    bool stats_not_found_and_invalid =
-        model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
-    if (stats_not_found_and_invalid) {
-      return cb::Error(
-          "missing statistics for requested model", pa::GENERIC_ERROR);
-    } else {
-      // Setting server stats 0 for composing model in case of ensemble request
-      // cache hit since the composing model will not be executed
-      server_stats->Reset();
-    }
+    return cb::Error(
+        "missing statistics for requested model", pa::GENERIC_ERROR);
   } else {
     uint64_t start_infer_cnt = 0;
     uint64_t start_exec_cnt = 0;
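For context on the first hunk: before this revert, GetTotalEnsembleDurations() also folded the ensemble's own cache hit/miss times into per-request averages. Below is a minimal, self-contained sketch of that averaging only; the AverageDurationInUs helper is a hypothetical stand-in (assumed to divide a nanosecond total by a count and report microseconds), not the perf_analyzer implementation.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the stats fields the removed code read.
struct CacheStats {
  uint64_t cache_hit_count;
  uint64_t cache_hit_time_ns;
  uint64_t cache_miss_count;
  uint64_t cache_miss_time_ns;
};

// Assumed behavior: average a nanosecond total over a count, in microseconds.
static uint64_t AverageDurationInUs(uint64_t total_ns, uint64_t count)
{
  return count == 0 ? 0 : (total_ns / count) / 1000;
}

int main()
{
  CacheStats stats{4, 8'000'000, 2, 10'000'000};  // 4 hits, 2 misses
  uint64_t total_cache_hit_time_avg_us = 0;
  uint64_t total_cache_miss_time_avg_us = 0;

  // Shape of the logic the revert removes: accumulate per-request averages.
  total_cache_hit_time_avg_us +=
      AverageDurationInUs(stats.cache_hit_time_ns, stats.cache_hit_count);
  total_cache_miss_time_avg_us +=
      AverageDurationInUs(stats.cache_miss_time_ns, stats.cache_miss_count);

  std::cout << "cache hit avg: " << total_cache_hit_time_avg_us << " usec, "
            << "cache miss avg: " << total_cache_miss_time_avg_us << " usec\n";
  return 0;
}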

src/c++/perf_analyzer/inference_profiler.h

Lines changed: 0 additions & 29 deletions
@@ -52,7 +52,6 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
-class ModelParser;
 #endif
 
 /// Constant parameters that determine the whether stopping criteria has met
@@ -120,28 +119,6 @@ struct ServerSideStats {
   uint64_t cache_miss_time_ns;
 
   std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
-  // This function sets composing model server stats to 0 in case of a cache hit
-  // when top level response cache is enabled, since composing models are not
-  // executed and do not have any stats
-  void Reset()
-  {
-    inference_count = 0;
-    execution_count = 0;
-    success_count = 0;
-    queue_count = 0;
-    compute_input_count = 0;
-    compute_infer_count = 0;
-    compute_output_count = 0;
-    cumm_time_ns = 0;
-    queue_time_ns = 0;
-    compute_input_time_ns = 0;
-    compute_infer_time_ns = 0;
-    compute_output_time_ns = 0;
-    cache_hit_count = 0;
-    cache_hit_time_ns = 0;
-    cache_miss_count = 0;
-    cache_miss_time_ns = 0;
-  }
 };
 
 /// Holds the statistics recorded at the client side.
@@ -576,17 +553,12 @@ class InferenceProfiler {
   /// measurement
   /// \param end_stats The stats for all models at the end of the measurement
   /// \param model_version The determined model version
-
   cb::Error DetermineStatsModelVersion(
       const cb::ModelIdentifier& model_identifier,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
      int64_t* model_version);
 
-#ifndef DOCTEST_CONFIG_DISABLE
-  cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching);
-#endif
-
   /// \param start_status The model status at the start of the measurement.
   /// \param end_status The model status at the end of the measurement.
   /// \param server_stats Returns the summary that the fields recorded by server
@@ -789,7 +761,6 @@ class InferenceProfiler {
 #ifndef DOCTEST_CONFIG_DISABLE
   friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
-  friend ModelParser;
 
  public:
   InferenceProfiler() = default;
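The Reset() helper deleted from ServerSideStats above zeroed every counter so that a composing model that never ran (because the ensemble's top-level cache answered the request) would report all-zero server-side stats. The sketch below shows the same idea on a simplified stand-in struct, using reassignment of a value-initialized aggregate to zero all fields in one statement; this is an illustration, not the project's struct or its actual replacement.

#include <cstdint>
#include <iostream>

// Simplified stand-in for ServerSideStats (only a few of the real fields).
struct MiniStats {
  uint64_t inference_count = 0;
  uint64_t execution_count = 0;
  uint64_t cache_hit_count = 0;
  uint64_t cache_hit_time_ns = 0;
};

int main()
{
  MiniStats stats{10, 10, 3, 1'500'000};

  // Equivalent of the removed Reset(): zero every field by reassigning a
  // value-initialized instance.
  stats = MiniStats{};

  std::cout << "inference_count=" << stats.inference_count
            << " cache_hit_count=" << stats.cache_hit_count << "\n";
  return 0;
}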

src/c++/perf_analyzer/model_parser.cc

Lines changed: 0 additions & 4 deletions
@@ -169,10 +169,6 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
-  if (cache_itr != config.MemberEnd()) {
-    top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
-  }
-
   return cb::Error::Success;
 }
 
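The removed block above read the same "enable" flag out of the model-config JSON through a RapidJSON member iterator into a second field, top_level_response_caching_enabled_. A minimal, self-contained sketch of that lookup pattern follows; the member name "response_cache" and the sample JSON string are assumptions for illustration, since the diff does not show which member cache_itr points at.

#include <iostream>
#include "rapidjson/document.h"

int main()
{
  // Assumed shape of the relevant part of the model config JSON.
  const char* json = R"({"response_cache": {"enable": true}})";

  rapidjson::Document config;
  config.Parse(json);

  bool response_cache_enabled = false;
  // Same lookup pattern as the removed code: find the member, then read its
  // "enable" boolean if it is present.
  auto cache_itr = config.FindMember("response_cache");
  if (cache_itr != config.MemberEnd()) {
    response_cache_enabled = cache_itr->value["enable"].GetBool();
  }

  std::cout << "response cache enabled: " << std::boolalpha
            << response_cache_enabled << "\n";
  return 0;
}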

src/c++/perf_analyzer/model_parser.h

Lines changed: 1 addition & 22 deletions
@@ -35,7 +35,6 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class TestModelParser;
 class MockModelParser;
-class InferenceProfiler;
 #endif
 
 struct ModelTensor {
@@ -74,8 +73,7 @@ class ModelParser {
         outputs_(std::make_shared<ModelTensorMap>()),
         composing_models_map_(std::make_shared<ComposingModelMap>()),
         scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
-        response_cache_enabled_(false),
-        top_level_response_caching_enabled_(false)
+        response_cache_enabled_(false)
   {
   }
 
@@ -153,22 +151,6 @@ class ModelParser {
   /// model
   bool ResponseCacheEnabled() const { return response_cache_enabled_; }
 
-  /// Returns whether or not top level request caching is enabled for this model
-  /// \return the truth value of whether top level request caching is enabled
-  /// for this model
-  bool TopLevelResponseCachingEnabled() const
-  {
-    return top_level_response_caching_enabled_;
-  }
-
-  /// Only for testing
-#ifndef DOCTEST_CONFIG_DISABLE
-  void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
-  {
-    top_level_response_caching_enabled_ = enable_top_level_response_caching;
-  }
-#endif
-
   /// Get the details about the model inputs.
   /// \return The map with tensor_name and the tensor details
   /// stored as key-value pair.
@@ -187,7 +169,6 @@ class ModelParser {
     return composing_models_map_;
   }
 
-
 protected:
   ModelSchedulerType scheduler_type_;
   bool is_decoupled_;
@@ -239,12 +220,10 @@ class ModelParser {
   std::string model_signature_name_;
   size_t max_batch_size_;
   bool response_cache_enabled_;
-  bool top_level_response_caching_enabled_;
 
 #ifndef DOCTEST_CONFIG_DISABLE
   friend TestModelParser;
   friend MockModelParser;
-  friend InferenceProfiler;
 
  public:
   ModelParser() = default;
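Both headers touched by this revert use the same pattern for test-only access: the extra member (a setter or a friend declaration) is compiled in only when doctest is active, i.e. inside #ifndef DOCTEST_CONFIG_DISABLE. Below is a minimal, generic sketch of that pattern on a made-up Widget class rather than ModelParser; it assumes the production build defines DOCTEST_CONFIG_DISABLE so the hook disappears there.

// Build the production binary with -DDOCTEST_CONFIG_DISABLE to compile the
// test-only hook out; unit-test builds leave it defined in.
#include <iostream>

class Widget {
 public:
  bool CacheEnabled() const { return cache_enabled_; }

#ifndef DOCTEST_CONFIG_DISABLE
  // Test-only hook: lets a unit test force the flag without parsing a config.
  void SetCacheEnabledForTest(bool enabled) { cache_enabled_ = enabled; }
#endif

 private:
  bool cache_enabled_ = false;
};

int main()
{
  Widget w;
#ifndef DOCTEST_CONFIG_DISABLE
  w.SetCacheEnabledForTest(true);  // only available in test builds
#endif
  std::cout << std::boolalpha << w.CacheEnabled() << "\n";
  return 0;
}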

src/c++/perf_analyzer/test_inference_profiler.cc

Lines changed: 0 additions & 26 deletions
@@ -160,15 +160,8 @@ class TestInferenceProfiler : public InferenceProfiler {
     return InferenceProfiler::DetermineStatsModelVersion(
         model_identifier, start_stats, end_stats, model_version);
   }
-
-  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
-  {
-    return InferenceProfiler::SetTopLevelResponseCaching(
-        enable_top_level_response_caching);
-  }
 };
 
-
 TEST_CASE("testing the ValidLatencyMeasurement function")
 {
   size_t valid_sequence_count{};
@@ -857,25 +850,6 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
     expect_exception = true;
   }
 
-  SUBCASE("One entry - version -1 - valid and in start")
-  {
-    model_identifier = {"ModelA", "-1"};
-    start_stats_map.insert({{"ModelA", "3"}, old_stats});
-    end_stats_map.insert({{"ModelA", "3"}, new_stats});
-    cb::Error status = tip.SetTopLevelResponseCaching(true);
-    CHECK(status.IsOk());
-    expected_model_version = -1;
-  }
-
-  SUBCASE("One entry - version -1 - not valid")
-  {
-    model_identifier = {"ModelA", "-1"};
-    end_stats_map.insert({{"ModelA", "3"}, old_stats});
-    cb::Error status = tip.SetTopLevelResponseCaching(false);
-    CHECK(status.IsOk());
-    expected_model_version = -1;
-    expect_exception = true;
-  }
 
   std::stringstream captured_cerr;
   std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());
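The deleted SUBCASEs above exercised DetermineStatsModelVersion() through the test-only caching hook. For readers unfamiliar with the doctest structure used here, the following is a minimal, standalone sketch of the TEST_CASE/SUBCASE pattern (shared setup, per-subcase variations, shared checks); the include path and the toy logic are assumptions, not perf_analyzer code.

// A minimal doctest example; assumes doctest is installed and the header is
// reachable as "doctest/doctest.h" (some projects vendor it as "doctest.h").
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "doctest/doctest.h"

#include <map>
#include <string>

// Toy stand-in: resolve version -1 to the single version present in stats.
static int DetermineVersion(
    const std::map<std::string, int>& stats, int requested_version)
{
  if (requested_version != -1) {
    return requested_version;
  }
  return stats.empty() ? -1 : stats.begin()->second;
}

TEST_CASE("determine version: toy example of the SUBCASE pattern")
{
  // Shared setup runs once per SUBCASE.
  std::map<std::string, int> stats_map;
  int requested_version = -1;
  int expected_version = -1;

  SUBCASE("one entry - version resolved from stats")
  {
    stats_map.insert({"ModelA", 3});
    expected_version = 3;
  }

  SUBCASE("no entries - version stays unresolved")
  {
    expected_version = -1;
  }

  // Shared check runs after each SUBCASE's setup.
  CHECK(DetermineVersion(stats_map, requested_version) == expected_version);
}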

0 commit comments
