
Commit c822917

ganeshku1 authored and nnshah1 committed
Revert "Changes to support Ensemble Top Level Response Caching (#560) (#642)"
This reverts commit cc6a3b2.
1 parent e46598b · commit c822917

File tree: 5 files changed, +10 -130 lines changed


src/c++/perf_analyzer/inference_profiler.cc

Lines changed: 9 additions & 49 deletions
@@ -107,14 +107,6 @@ EnsembleDurations
 GetTotalEnsembleDurations(const ServerSideStats& stats)
 {
   EnsembleDurations result;
-  // Calculate avg cache hit latency and cache miss latency for ensemble model
-  // in case top level response caching is enabled.
-  const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
-  const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
-  result.total_cache_hit_time_avg_us +=
-      AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
-  result.total_cache_miss_time_avg_us +=
-      AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
   for (const auto& model_stats : stats.composing_models_stat) {
     if (model_stats.second.composing_models_stat.empty()) {
       // Cache hit count covers cache hits, not related to compute times
@@ -246,6 +238,7 @@ ReportServerSideStats(
   if (parser->ResponseCacheEnabled()) {
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
+
     std::cout << " (overhead " << overhead_avg_us << " usec + "
               << "queue " << queue_avg_us << " usec + "
               << "cache hit/miss " << combined_cache_compute_avg_us
@@ -290,18 +283,12 @@ ReportServerSideStats(
     const uint64_t overhead_avg_us = GetOverheadDuration(
         cumm_avg_us, ensemble_times.total_queue_time_avg_us,
         ensemble_times.total_combined_cache_compute_time_avg_us);
-    // FIXME - Refactor these calculations in case of ensemble top level
-    // response cache is enabled
-    if (!parser->TopLevelResponseCachingEnabled()) {
-      std::cout << " (overhead " << overhead_avg_us << " usec + "
-                << "queue " << ensemble_times.total_queue_time_avg_us
-                << " usec + "
-                << "cache hit/miss "
-                << ensemble_times.total_combined_cache_compute_time_avg_us
-                << " usec)" << std::endl;
-    } else {
-      std::cout << std::endl;
-    }
+    std::cout << " (overhead " << overhead_avg_us << " usec + "
+              << "queue " << ensemble_times.total_queue_time_avg_us
+              << " usec + "
+              << "cache hit/miss "
+              << ensemble_times.total_combined_cache_compute_time_avg_us
+              << " usec)" << std::endl;
     std::cout << ident << ident << " Average Cache Hit Latency: "
               << ensemble_times.total_cache_hit_time_avg_us << " usec"
               << std::endl;
@@ -1563,21 +1550,6 @@ InferenceProfiler::DetermineStatsModelVersion(
   return cb::Error::Success;
 }
 
-// Only for unit-testing
-#ifndef DOCTEST_CONFIG_DISABLE
-cb::Error
-InferenceProfiler::SetTopLevelResponseCaching(
-    bool enable_top_level_response_caching)
-{
-  parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
-  if (parser_ == nullptr) {
-    return cb::Error("Failed to initialize ModelParser");
-  }
-  parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
-  return cb::Error::Success;
-}
-#endif
-
 cb::Error
 InferenceProfiler::SummarizeServerStats(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1633,20 +1605,8 @@ InferenceProfiler::SummarizeServerStatsHelper(
 
   const auto& end_itr = end_status.find(this_id);
   if (end_itr == end_status.end()) {
-    // In case of ensemble models, if top level response caching is enabled,
-    // the composing models statistics are unavailable in case of a cache hit.
-    // This is due to the scheduler sends cache response and composing models do
-    // not get executed. It's a valid scenario and shouldn't throw error.
-    bool stats_not_found_and_invalid =
-        model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
-    if (stats_not_found_and_invalid) {
-      return cb::Error(
-          "missing statistics for requested model", pa::GENERIC_ERROR);
-    } else {
-      // Setting server stats 0 for composing model in case of ensemble request
-      // cache hit since the composing model will not be executed
-      server_stats->Reset();
-    }
+    return cb::Error(
+        "missing statistics for requested model", pa::GENERIC_ERROR);
   } else {
     uint64_t start_infer_cnt = 0;
     uint64_t start_exec_cnt = 0;
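For context on the first hunk: before this revert, GetTotalEnsembleDurations() also folded the ensemble's own cache hit/miss times into per-request averages. Below is a minimal, self-contained sketch of that averaging only; the AverageDurationInUs helper is a hypothetical stand-in (assumed to divide a nanosecond total by a count and report microseconds), not the perf_analyzer implementation.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for the stats fields the removed code read.
struct CacheStats {
  uint64_t cache_hit_count;
  uint64_t cache_hit_time_ns;
  uint64_t cache_miss_count;
  uint64_t cache_miss_time_ns;
};

// Assumed behavior: average a nanosecond total over a count, in microseconds.
static uint64_t AverageDurationInUs(uint64_t total_ns, uint64_t count)
{
  return count == 0 ? 0 : (total_ns / count) / 1000;
}

int main()
{
  CacheStats stats{4, 8'000'000, 2, 10'000'000};  // 4 hits, 2 misses
  uint64_t total_cache_hit_time_avg_us = 0;
  uint64_t total_cache_miss_time_avg_us = 0;

  // Shape of the logic the revert removes: accumulate per-request averages.
  total_cache_hit_time_avg_us +=
      AverageDurationInUs(stats.cache_hit_time_ns, stats.cache_hit_count);
  total_cache_miss_time_avg_us +=
      AverageDurationInUs(stats.cache_miss_time_ns, stats.cache_miss_count);

  std::cout << "cache hit avg: " << total_cache_hit_time_avg_us << " usec, "
            << "cache miss avg: " << total_cache_miss_time_avg_us << " usec\n";
  return 0;
}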

src/c++/perf_analyzer/inference_profiler.h

Lines changed: 0 additions & 29 deletions
@@ -52,7 +52,6 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
-class ModelParser;
 #endif
 
 /// Constant parameters that determine the whether stopping criteria has met
@@ -120,28 +119,6 @@ struct ServerSideStats {
   uint64_t cache_miss_time_ns;
 
   std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
-  // This function sets composing model server stats to 0 in case of a cache hit
-  // when top level response cache is enabled, since composing models are not
-  // executed and do not have any stats
-  void Reset()
-  {
-    inference_count = 0;
-    execution_count = 0;
-    success_count = 0;
-    queue_count = 0;
-    compute_input_count = 0;
-    compute_infer_count = 0;
-    compute_output_count = 0;
-    cumm_time_ns = 0;
-    queue_time_ns = 0;
-    compute_input_time_ns = 0;
-    compute_infer_time_ns = 0;
-    compute_output_time_ns = 0;
-    cache_hit_count = 0;
-    cache_hit_time_ns = 0;
-    cache_miss_count = 0;
-    cache_miss_time_ns = 0;
-  }
 };
 
 /// Holds the statistics recorded at the client side.
@@ -576,17 +553,12 @@ class InferenceProfiler {
   /// measurement
   /// \param end_stats The stats for all models at the end of the measurement
   /// \param model_version The determined model version
-
   cb::Error DetermineStatsModelVersion(
       const cb::ModelIdentifier& model_identifier,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
      int64_t* model_version);
 
-#ifndef DOCTEST_CONFIG_DISABLE
-  cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching);
-#endif
-
   /// \param start_status The model status at the start of the measurement.
   /// \param end_status The model status at the end of the measurement.
   /// \param server_stats Returns the summary that the fields recorded by server
@@ -789,7 +761,6 @@ class InferenceProfiler {
 #ifndef DOCTEST_CONFIG_DISABLE
   friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
-  friend ModelParser;
 
  public:
   InferenceProfiler() = default;
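The Reset() helper deleted from ServerSideStats above zeroed every counter so that a composing model that never ran (because the ensemble's top-level cache answered the request) would report all-zero server-side stats. The sketch below shows the same idea on a simplified stand-in struct, using reassignment of a value-initialized aggregate to zero all fields in one statement; this is an illustration, not the project's struct or its actual replacement.

#include <cstdint>
#include <iostream>

// Simplified stand-in for ServerSideStats (only a few of the real fields).
struct MiniStats {
  uint64_t inference_count = 0;
  uint64_t execution_count = 0;
  uint64_t cache_hit_count = 0;
  uint64_t cache_hit_time_ns = 0;
};

int main()
{
  MiniStats stats{10, 10, 3, 1'500'000};

  // Equivalent of the removed Reset(): zero every field by reassigning a
  // value-initialized instance.
  stats = MiniStats{};

  std::cout << "inference_count=" << stats.inference_count
            << " cache_hit_count=" << stats.cache_hit_count << "\n";
  return 0;
}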

src/c++/perf_analyzer/model_parser.cc

Lines changed: 0 additions & 4 deletions
@@ -169,10 +169,6 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
-  if (cache_itr != config.MemberEnd()) {
-    top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
-  }
-
   return cb::Error::Success;
 }
 
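The removed block above read the same "enable" flag out of the model-config JSON through a RapidJSON member iterator into a second field, top_level_response_caching_enabled_. A minimal, self-contained sketch of that lookup pattern follows; the member name "response_cache" and the sample JSON string are assumptions for illustration, since the diff does not show which member cache_itr points at.

#include <iostream>
#include "rapidjson/document.h"

int main()
{
  // Assumed shape of the relevant part of the model config JSON.
  const char* json = R"({"response_cache": {"enable": true}})";

  rapidjson::Document config;
  config.Parse(json);

  bool response_cache_enabled = false;
  // Same lookup pattern as the removed code: find the member, then read its
  // "enable" boolean if it is present.
  auto cache_itr = config.FindMember("response_cache");
  if (cache_itr != config.MemberEnd()) {
    response_cache_enabled = cache_itr->value["enable"].GetBool();
  }

  std::cout << "response cache enabled: " << std::boolalpha
            << response_cache_enabled << "\n";
  return 0;
}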

src/c++/perf_analyzer/model_parser.h

Lines changed: 1 addition & 22 deletions
@@ -35,7 +35,6 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class TestModelParser;
 class MockModelParser;
-class InferenceProfiler;
 #endif
 
 struct ModelTensor {
@@ -74,8 +73,7 @@ class ModelParser {
         outputs_(std::make_shared<ModelTensorMap>()),
         composing_models_map_(std::make_shared<ComposingModelMap>()),
         scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
-        response_cache_enabled_(false),
-        top_level_response_caching_enabled_(false)
+        response_cache_enabled_(false)
   {
   }
 
@@ -153,22 +151,6 @@ class ModelParser {
   /// model
   bool ResponseCacheEnabled() const { return response_cache_enabled_; }
 
-  /// Returns whether or not top level request caching is enabled for this model
-  /// \return the truth value of whether top level request caching is enabled
-  /// for this model
-  bool TopLevelResponseCachingEnabled() const
-  {
-    return top_level_response_caching_enabled_;
-  }
-
-  /// Only for testing
-#ifndef DOCTEST_CONFIG_DISABLE
-  void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
-  {
-    top_level_response_caching_enabled_ = enable_top_level_response_caching;
-  }
-#endif
-
   /// Get the details about the model inputs.
   /// \return The map with tensor_name and the tensor details
   /// stored as key-value pair.
@@ -187,7 +169,6 @@ class ModelParser {
     return composing_models_map_;
   }
 
-
 protected:
   ModelSchedulerType scheduler_type_;
   bool is_decoupled_;
@@ -239,12 +220,10 @@ class ModelParser {
   std::string model_signature_name_;
   size_t max_batch_size_;
   bool response_cache_enabled_;
-  bool top_level_response_caching_enabled_;
 
 #ifndef DOCTEST_CONFIG_DISABLE
   friend TestModelParser;
   friend MockModelParser;
-  friend InferenceProfiler;
 
  public:
   ModelParser() = default;
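Both headers touched by this revert use the same pattern for test-only access: the extra member (a setter or a friend declaration) is compiled in only when doctest is active, i.e. inside #ifndef DOCTEST_CONFIG_DISABLE. Below is a minimal, generic sketch of that pattern on a made-up Widget class rather than ModelParser; it assumes the production build defines DOCTEST_CONFIG_DISABLE so the hook disappears there.

// Build the production binary with -DDOCTEST_CONFIG_DISABLE to compile the
// test-only hook out; unit-test builds leave it defined in.
#include <iostream>

class Widget {
 public:
  bool CacheEnabled() const { return cache_enabled_; }

#ifndef DOCTEST_CONFIG_DISABLE
  // Test-only hook: lets a unit test force the flag without parsing a config.
  void SetCacheEnabledForTest(bool enabled) { cache_enabled_ = enabled; }
#endif

 private:
  bool cache_enabled_ = false;
};

int main()
{
  Widget w;
#ifndef DOCTEST_CONFIG_DISABLE
  w.SetCacheEnabledForTest(true);  // only available in test builds
#endif
  std::cout << std::boolalpha << w.CacheEnabled() << "\n";
  return 0;
}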

src/c++/perf_analyzer/test_inference_profiler.cc

Lines changed: 0 additions & 26 deletions
@@ -160,15 +160,8 @@ class TestInferenceProfiler : public InferenceProfiler {
     return InferenceProfiler::DetermineStatsModelVersion(
         model_identifier, start_stats, end_stats, model_version);
   }
-
-  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
-  {
-    return InferenceProfiler::SetTopLevelResponseCaching(
-        enable_top_level_response_caching);
-  }
 };
 
-
 TEST_CASE("testing the ValidLatencyMeasurement function")
 {
   size_t valid_sequence_count{};
@@ -857,25 +850,6 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
     expect_exception = true;
   }
 
-  SUBCASE("One entry - version -1 - valid and in start")
-  {
-    model_identifier = {"ModelA", "-1"};
-    start_stats_map.insert({{"ModelA", "3"}, old_stats});
-    end_stats_map.insert({{"ModelA", "3"}, new_stats});
-    cb::Error status = tip.SetTopLevelResponseCaching(true);
-    CHECK(status.IsOk());
-    expected_model_version = -1;
-  }
-
-  SUBCASE("One entry - version -1 - not valid")
-  {
-    model_identifier = {"ModelA", "-1"};
-    end_stats_map.insert({{"ModelA", "3"}, old_stats});
-    cb::Error status = tip.SetTopLevelResponseCaching(false);
-    CHECK(status.IsOk());
-    expected_model_version = -1;
-    expect_exception = true;
-  }
 
   std::stringstream captured_cerr;
   std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());
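The deleted SUBCASEs above exercised DetermineStatsModelVersion() through the test-only caching hook. For readers unfamiliar with the doctest structure used here, the following is a minimal, standalone sketch of the TEST_CASE/SUBCASE pattern (shared setup, per-subcase variations, shared checks); the include path and the toy logic are assumptions, not perf_analyzer code.

// A minimal doctest example; assumes doctest is installed and the header is
// reachable as "doctest/doctest.h" (some projects vendor it as "doctest.h").
#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
#include "doctest/doctest.h"

#include <map>
#include <string>

// Toy stand-in: resolve version -1 to the single version present in stats.
static int DetermineVersion(
    const std::map<std::string, int>& stats, int requested_version)
{
  if (requested_version != -1) {
    return requested_version;
  }
  return stats.empty() ? -1 : stats.begin()->second;
}

TEST_CASE("determine version: toy example of the SUBCASE pattern")
{
  // Shared setup runs once per SUBCASE.
  std::map<std::string, int> stats_map;
  int requested_version = -1;
  int expected_version = -1;

  SUBCASE("one entry - version resolved from stats")
  {
    stats_map.insert({"ModelA", 3});
    expected_version = 3;
  }

  SUBCASE("no entries - version stays unresolved")
  {
    expected_version = -1;
  }

  // Shared check runs after each SUBCASE's setup.
  CHECK(DetermineVersion(stats_map, requested_version) == expected_version);
}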

0 commit comments
