
Commit fddba6d

lkomalinnshah1 authored and committed
Changes to support Ensemble Top Level Response Caching (#560)
1 parent c822917 · commit fddba6d

File tree: 5 files changed, +130 −10 lines


src/c++/perf_analyzer/inference_profiler.cc

Lines changed: 49 additions & 9 deletions
@@ -107,6 +107,14 @@ EnsembleDurations
 GetTotalEnsembleDurations(const ServerSideStats& stats)
 {
   EnsembleDurations result;
+  // Calculate avg cache hit latency and cache miss latency for ensemble model
+  // in case top level response caching is enabled.
+  const uint64_t ensemble_cache_hit_cnt = stats.cache_hit_count;
+  const uint64_t ensemble_cache_miss_cnt = stats.cache_miss_count;
+  result.total_cache_hit_time_avg_us +=
+      AverageDurationInUs(stats.cache_hit_time_ns, ensemble_cache_hit_cnt);
+  result.total_cache_miss_time_avg_us +=
+      AverageDurationInUs(stats.cache_miss_time_ns, ensemble_cache_miss_cnt);
   for (const auto& model_stats : stats.composing_models_stat) {
     if (model_stats.second.composing_models_stat.empty()) {
       // Cache hit count covers cache hits, not related to compute times
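For readers unfamiliar with the helper used above: the two averages are simply the accumulated hit/miss time divided by the corresponding count, converted from nanoseconds to microseconds. A minimal sketch of the assumed behavior of AverageDurationInUs follows (the real helper is defined elsewhere in inference_profiler.cc; the zero-count guard and integer return type here are assumptions):

#include <cstdint>

// Sketch only: assumed semantics of the averaging helper called above.
// Returns the average duration in microseconds, or 0 when no requests of
// that kind (e.g. no ensemble cache hits) were recorded.
static uint64_t
AverageDurationInUs(const uint64_t total_time_ns, const uint64_t count)
{
  if (count == 0) {
    return 0;
  }
  return total_time_ns / (count * 1000);
}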
@@ -238,7 +246,6 @@ ReportServerSideStats(
     if (parser->ResponseCacheEnabled()) {
       const uint64_t overhead_avg_us = GetOverheadDuration(
           cumm_avg_us, queue_avg_us, combined_cache_compute_avg_us);
-
       std::cout << " (overhead " << overhead_avg_us << " usec + "
                 << "queue " << queue_avg_us << " usec + "
                 << "cache hit/miss " << combined_cache_compute_avg_us
@@ -283,12 +290,18 @@ ReportServerSideStats(
       const uint64_t overhead_avg_us = GetOverheadDuration(
           cumm_avg_us, ensemble_times.total_queue_time_avg_us,
           ensemble_times.total_combined_cache_compute_time_avg_us);
-      std::cout << " (overhead " << overhead_avg_us << " usec + "
-                << "queue " << ensemble_times.total_queue_time_avg_us
-                << " usec + "
-                << "cache hit/miss "
-                << ensemble_times.total_combined_cache_compute_time_avg_us
-                << " usec)" << std::endl;
+      // FIXME - Refactor these calculations when ensemble top level response
+      // caching is enabled
+      if (!parser->TopLevelResponseCachingEnabled()) {
+        std::cout << " (overhead " << overhead_avg_us << " usec + "
+                  << "queue " << ensemble_times.total_queue_time_avg_us
+                  << " usec + "
+                  << "cache hit/miss "
+                  << ensemble_times.total_combined_cache_compute_time_avg_us
+                  << " usec)" << std::endl;
+      } else {
+        std::cout << std::endl;
+      }
       std::cout << ident << ident << " Average Cache Hit Latency: "
                 << ensemble_times.total_cache_hit_time_avg_us << " usec"
                 << std::endl;
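The overhead value printed in this block is derived from the other averages rather than measured directly. A minimal sketch of the assumed relationship implemented by GetOverheadDuration (the actual helper is defined elsewhere in this file; the clamp to zero is an assumption):

#include <cstdint>

// Sketch only: overhead is whatever part of the cumulative average is not
// explained by queueing or cache/compute time, clamped at zero so rounding
// of the per-component averages cannot produce an underflow.
static uint64_t
GetOverheadDuration(
    const uint64_t total_avg_us, const uint64_t queue_avg_us,
    const uint64_t cache_compute_avg_us)
{
  const uint64_t accounted_us = queue_avg_us + cache_compute_avg_us;
  return (total_avg_us > accounted_us) ? (total_avg_us - accounted_us) : 0;
}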
@@ -1550,6 +1563,21 @@ InferenceProfiler::DetermineStatsModelVersion(
   return cb::Error::Success;
 }
 
+// Only for unit-testing
+#ifndef DOCTEST_CONFIG_DISABLE
+cb::Error
+InferenceProfiler::SetTopLevelResponseCaching(
+    bool enable_top_level_response_caching)
+{
+  parser_ = std::make_shared<ModelParser>(cb::BackendKind::TRITON);
+  if (parser_ == nullptr) {
+    return cb::Error("Failed to initialize ModelParser");
+  }
+  parser_->SetTopLevelResponseCaching(enable_top_level_response_caching);
+  return cb::Error::Success;
+}
+#endif
+
 cb::Error
 InferenceProfiler::SummarizeServerStats(
     const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_status,
@@ -1605,8 +1633,20 @@ InferenceProfiler::SummarizeServerStatsHelper(
 
   const auto& end_itr = end_status.find(this_id);
   if (end_itr == end_status.end()) {
-    return cb::Error(
-        "missing statistics for requested model", pa::GENERIC_ERROR);
+    // In case of ensemble models, if top level response caching is enabled,
+    // the composing models' statistics are unavailable on a cache hit,
+    // because the scheduler returns the cached response and the composing
+    // models are not executed. This is a valid scenario, not an error.
+    bool stats_not_found_and_invalid =
+        model_version == -1 && !parser_->TopLevelResponseCachingEnabled();
+    if (stats_not_found_and_invalid) {
+      return cb::Error(
+          "missing statistics for requested model", pa::GENERIC_ERROR);
+    } else {
+      // Set server stats to 0 for the composing model on an ensemble request
+      // cache hit, since the composing model will not be executed.
+      server_stats->Reset();
+    }
   } else {
     uint64_t start_infer_cnt = 0;
     uint64_t start_exec_cnt = 0;
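Restated outside the diff for clarity, the branch above only treats missing end-of-measurement statistics as fatal when the model version could not be resolved and top level response caching is off. The helper name below is illustrative, not part of the commit:

#include <cstdint>

// Sketch: with top level response caching enabled, a composing model may
// legitimately have no statistics because the ensemble request was answered
// from the response cache, so the profiler zeroes its stats instead of
// failing.
static bool
MissingStatsIsError(
    const int64_t model_version, const bool top_level_caching_enabled)
{
  return model_version == -1 && !top_level_caching_enabled;
}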

src/c++/perf_analyzer/inference_profiler.h

Lines changed: 29 additions & 0 deletions
@@ -52,6 +52,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class NaggyMockInferenceProfiler;
 class TestInferenceProfiler;
+class ModelParser;
 #endif
 
 /// Constant parameters that determine the whether stopping criteria has met
@@ -119,6 +120,28 @@ struct ServerSideStats {
   uint64_t cache_miss_time_ns;
 
   std::map<cb::ModelIdentifier, ServerSideStats> composing_models_stat;
+  // This function sets composing model server stats to 0 in case of a cache hit
+  // when top level response cache is enabled, since composing models are not
+  // executed and do not have any stats
+  void Reset()
+  {
+    inference_count = 0;
+    execution_count = 0;
+    success_count = 0;
+    queue_count = 0;
+    compute_input_count = 0;
+    compute_infer_count = 0;
+    compute_output_count = 0;
+    cumm_time_ns = 0;
+    queue_time_ns = 0;
+    compute_input_time_ns = 0;
+    compute_infer_time_ns = 0;
+    compute_output_time_ns = 0;
+    cache_hit_count = 0;
+    cache_hit_time_ns = 0;
+    cache_miss_count = 0;
+    cache_miss_time_ns = 0;
+  }
 };
 
 /// Holds the statistics recorded at the client side.
@@ -553,12 +576,17 @@ class InferenceProfiler {
   /// measurement
   /// \param end_stats The stats for all models at the end of the measurement
   /// \param model_version The determined model version
+
   cb::Error DetermineStatsModelVersion(
       const cb::ModelIdentifier& model_identifier,
       const std::map<cb::ModelIdentifier, cb::ModelStatistics>& start_stats,
      const std::map<cb::ModelIdentifier, cb::ModelStatistics>& end_stats,
       int64_t* model_version);
 
+#ifndef DOCTEST_CONFIG_DISABLE
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_request_caching);
+#endif
+
   /// \param start_status The model status at the start of the measurement.
   /// \param end_status The model status at the end of the measurement.
   /// \param server_stats Returns the summary that the fields recorded by server
@@ -761,6 +789,7 @@ class InferenceProfiler {
 #ifndef DOCTEST_CONFIG_DISABLE
   friend NaggyMockInferenceProfiler;
   friend TestInferenceProfiler;
+  friend ModelParser;
 
  public:
   InferenceProfiler() = default;

src/c++/perf_analyzer/model_parser.cc

Lines changed: 4 additions & 0 deletions
@@ -169,6 +169,10 @@ ModelParser::InitTriton(
     response_cache_enabled_ = cache_itr->value["enable"].GetBool();
   }
 
+  if (cache_itr != config.MemberEnd()) {
+    top_level_response_caching_enabled_ = cache_itr->value["enable"].GetBool();
+  }
+
   return cb::Error::Success;
 }
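For context, cache_itr is the iterator obtained earlier in InitTriton from the model configuration JSON returned by the server. A small self-contained sketch of the lookup pattern, assuming the iterator comes from config.FindMember("response_cache") and using a representative, hypothetical config document:

#include <rapidjson/document.h>

// Sketch only: a model config in which both response_cache_enabled_ and the
// new top_level_response_caching_enabled_ flag would end up true for the
// requested (top level) model. The surrounding fields are illustrative.
static const char* kExampleConfig = R"({
  "name": "ensemble_model",
  "platform": "ensemble",
  "response_cache": { "enable": true }
})";

void
ExampleResponseCacheLookup()
{
  rapidjson::Document config;
  config.Parse(kExampleConfig);
  const auto cache_itr = config.FindMember("response_cache");
  if (cache_itr != config.MemberEnd()) {
    // Mirrors the parsing above: read the "enable" boolean from the
    // response_cache block.
    const bool enabled = cache_itr->value["enable"].GetBool();
    (void)enabled;
  }
}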

src/c++/perf_analyzer/model_parser.h

Lines changed: 22 additions & 1 deletion
@@ -35,6 +35,7 @@ namespace triton { namespace perfanalyzer {
 #ifndef DOCTEST_CONFIG_DISABLE
 class TestModelParser;
 class MockModelParser;
+class InferenceProfiler;
 #endif
 
 struct ModelTensor {
@@ -73,7 +74,8 @@ class ModelParser {
         outputs_(std::make_shared<ModelTensorMap>()),
         composing_models_map_(std::make_shared<ComposingModelMap>()),
         scheduler_type_(NONE), max_batch_size_(0), is_decoupled_(false),
-        response_cache_enabled_(false)
+        response_cache_enabled_(false),
+        top_level_response_caching_enabled_(false)
   {
   }
@@ -151,6 +153,22 @@ class ModelParser {
   /// model
   bool ResponseCacheEnabled() const { return response_cache_enabled_; }
 
+  /// Returns whether or not top level response caching is enabled for this model
+  /// \return the truth value of whether top level response caching is enabled
+  /// for this model
+  bool TopLevelResponseCachingEnabled() const
+  {
+    return top_level_response_caching_enabled_;
+  }
+
+  /// Only for testing
+#ifndef DOCTEST_CONFIG_DISABLE
+  void SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    top_level_response_caching_enabled_ = enable_top_level_response_caching;
+  }
+#endif
+
   /// Get the details about the model inputs.
   /// \return The map with tensor_name and the tensor details
   /// stored as key-value pair.
@@ -169,6 +187,7 @@ class ModelParser {
     return composing_models_map_;
   }
 
+
  protected:
   ModelSchedulerType scheduler_type_;
   bool is_decoupled_;
@@ -220,10 +239,12 @@ class ModelParser {
   std::string model_signature_name_;
   size_t max_batch_size_;
   bool response_cache_enabled_;
+  bool top_level_response_caching_enabled_;
 
 #ifndef DOCTEST_CONFIG_DISABLE
   friend TestModelParser;
   friend MockModelParser;
+  friend InferenceProfiler;
 
  public:
   ModelParser() = default;

src/c++/perf_analyzer/test_inference_profiler.cc

Lines changed: 26 additions & 0 deletions
@@ -160,8 +160,15 @@ class TestInferenceProfiler : public InferenceProfiler {
     return InferenceProfiler::DetermineStatsModelVersion(
         model_identifier, start_stats, end_stats, model_version);
   }
+
+  cb::Error SetTopLevelResponseCaching(bool enable_top_level_response_caching)
+  {
+    return InferenceProfiler::SetTopLevelResponseCaching(
+        enable_top_level_response_caching);
+  }
 };
 
+
 TEST_CASE("testing the ValidLatencyMeasurement function")
 {
   size_t valid_sequence_count{};
@@ -850,6 +857,25 @@ TEST_CASE("determine_stats_model_version: testing DetermineStatsModelVersion()")
     expect_exception = true;
   }
 
+  SUBCASE("One entry - version -1 - valid and in start")
+  {
+    model_identifier = {"ModelA", "-1"};
+    start_stats_map.insert({{"ModelA", "3"}, old_stats});
+    end_stats_map.insert({{"ModelA", "3"}, new_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(true);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+  }
+
+  SUBCASE("One entry - version -1 - not valid")
+  {
+    model_identifier = {"ModelA", "-1"};
+    end_stats_map.insert({{"ModelA", "3"}, old_stats});
+    cb::Error status = tip.SetTopLevelResponseCaching(false);
+    CHECK(status.IsOk());
+    expected_model_version = -1;
+    expect_exception = true;
+  }
 
   std::stringstream captured_cerr;
   std::streambuf* old = std::cerr.rdbuf(captured_cerr.rdbuf());
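Not part of this commit, but a natural companion check: a minimal doctest-style sketch of how the new ServerSideStats::Reset() helper could be verified. The test name and the chosen field values are hypothetical.

TEST_CASE("ServerSideStats::Reset zeroes all counters")
{
  triton::perfanalyzer::ServerSideStats stats;
  stats.inference_count = 5;
  stats.execution_count = 4;
  stats.cache_hit_count = 3;
  stats.cumm_time_ns = 1000;

  // Reset() is what SummarizeServerStatsHelper relies on when a composing
  // model has no statistics because the ensemble request hit the cache.
  stats.Reset();

  CHECK(stats.inference_count == 0);
  CHECK(stats.execution_count == 0);
  CHECK(stats.cache_hit_count == 0);
  CHECK(stats.cumm_time_ns == 0);
}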
