Commit 663ee99

Add CUDA cache cleaning flag to pytorch backend (#61)
* Add CUDA cache cleaning flag to pytorch backend
* Minor fixes and code formatting changes per review
* Add more notes to README
1 parent 858be50 commit 663ee99

File tree

2 files changed (+56, -3 lines)

README.md

Lines changed: 17 additions & 0 deletions
@@ -179,6 +179,23 @@ key: "ENABLE_WEIGHT_SHARING"
 }
 ```

+* `ENABLE_CACHE_CLEANING`: Boolean flag to enable CUDA cache cleaning after each model execution.
+  If not specified, cache cleaning is disabled. This flag has no effect if the model is on CPU.
+  Setting this flag to true will negatively impact performance due to the additional CUDA cache
+  cleaning operation after each model execution. Therefore, you should only use this flag if you
+  serve multiple models with Triton and encounter CUDA out-of-memory issues during model executions.
+
+  The section of the model config file specifying this parameter will look like:
+
+  ```
+  parameters: {
+    key: "ENABLE_CACHE_CLEANING"
+    value: {
+      string_value: "true"
+    }
+  }
+  ```
+
 * Additional Optimizations: Three additional boolean parameters are available to disable
   certain Torch optimizations that can sometimes cause latency regressions in models with
   complex execution modes and dynamic shapes. If not specified, all are enabled by default.
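
For context, a minimal config.pbtxt that uses this parameter might look like the sketch below; the model name, tensor names, shapes, and data types are illustrative placeholders rather than values taken from this commit:

```
name: "my_torchscript_model"   # hypothetical model name
backend: "pytorch"
max_batch_size: 8
input [
  {
    name: "INPUT__0"
    data_type: TYPE_FP32
    dims: [ 3, 224, 224 ]
  }
]
output [
  {
    name: "OUTPUT__0"
    data_type: TYPE_FP32
    dims: [ 1000 ]
  }
]
parameters: {
  key: "ENABLE_CACHE_CLEANING"
  value: {
    string_value: "true"
  }
}
```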

src/libtorch.cc

Lines changed: 39 additions & 3 deletions
@@ -98,6 +98,7 @@ class ModelState : public BackendModel {
   {
     return enable_nvfuser_pair_;
   }
+  bool EnabledCacheCleaning(){ return enable_cache_cleaning_; }

   bool EnabledWeightSharing() { return enable_weight_sharing_; }

@@ -114,6 +115,9 @@ class ModelState : public BackendModel {
   // Flag to indicate whether inference mode is enabled. Defaults to false.
   bool enable_inference_mode_;

+  // Flag to indicate whether cache cleaning after each run is enabled. Defaults to false.
+  bool enable_cache_cleaning_;
+
   // Flag to indicate whether weight sharing is enabled. Defaults to false.
   bool enable_weight_sharing_;

@@ -173,7 +177,8 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)

 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
     : BackendModel(triton_model), enable_optimized_execution_(true),
-      enable_inference_mode_(false), enable_weight_sharing_(false),
+      enable_inference_mode_(false), enable_cache_cleaning_(false),
+      enable_weight_sharing_(false),
       enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
       enable_jit_executor_pair_({false, true}),
@@ -298,6 +303,25 @@ ModelState::ParseParameters()
        " for model instance '" + Name() + "'")
           .c_str());

+  // If 'ENABLE_CACHE_CLEANING' is not present in 'parameters' then
+  // no update is made to 'enable_cache_cleaning_'.
+  err = ParseParameter(
+      params, "ENABLE_CACHE_CLEANING", &enable_cache_cleaning_);
+  if (err != nullptr) {
+    if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+      return err;
+    } else {
+      TRITONSERVER_ErrorDelete(err);
+    }
+  }
+
+  LOG_MESSAGE(
+      TRITONSERVER_LOG_INFO,
+      (std::string("Cache Cleaning is ") +
+       (enable_cache_cleaning_ ? "enabled" : "disabled") +
+       " for model instance '" + Name() + "'")
+          .c_str());
+
   // If 'INFERENCE_MODE' is not present in 'parameters' then no update is made
   // to 'enable_inference_mode_'.
   err = ParseParameter(params, "INFERENCE_MODE", &enable_inference_mode_);
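
The error handling above follows a deliberate pattern: a missing parameter is not an error (the default set in the constructor is kept), while any other failure, such as a value that is not a valid boolean, is propagated to the caller. A rough standalone sketch of that pattern, assuming a ParseParameter-style helper with the signature used in this file, could look like:

```
// Sketch only: the ParseParameter helper and its exact signature are assumed
// from the diff above; the TRITONSERVER_* calls are the Triton server C API.
TRITONSERVER_Error*
ParseOptionalBoolParameter(
    triton::common::TritonJson::Value& params, const std::string& key,
    bool* value)  // left at its current (default) value if the key is absent
{
  TRITONSERVER_Error* err = ParseParameter(params, key, value);
  if (err != nullptr) {
    if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
      return err;  // genuine failure, e.g. value present but not a boolean
    }
    TRITONSERVER_ErrorDelete(err);  // key absent: free the error, keep default
  }
  return nullptr;  // success
}
```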
@@ -453,6 +477,9 @@ class ModelInstanceState : public BackendModelInstance {
   void ProcessRequests(
       TRITONBACKEND_Request** requests, const uint32_t request_count);

+  // Clear CUDA cache
+  void ClearCache();
+
  private:
   ModelInstanceState(
       ModelState* model_state,
@@ -585,16 +612,21 @@ ModelInstanceState::ModelInstanceState(
   THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs());
 }

-ModelInstanceState::~ModelInstanceState()
+void ModelInstanceState::ClearCache()
 {
-  torch_model_.reset();
 #ifdef TRITON_ENABLE_GPU
   if (device_.is_cuda()) {
     c10::cuda::CUDACachingAllocator::emptyCache();
   }
 #endif  // TRITON_ENABLE_GPU
 }

+ModelInstanceState::~ModelInstanceState()
+{
+  torch_model_.reset();
+  ClearCache();
+}
+
 TRITONSERVER_Error*
 ModelInstanceState::ValidateBooleanSequenceControl(
     triton::common::TritonJson::Value& sequence_batching,
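
The effect of the emptyCache() call above can also be observed in isolation with a small libtorch program. The sketch below is not part of this commit and assumes a CUDA-enabled libtorch build; it allocates and frees a tensor, then returns the cached device memory to the driver:

```
// Standalone sketch, not from the commit: demonstrates what
// c10::cuda::CUDACachingAllocator::emptyCache() releases.
#include <iostream>
#include <torch/torch.h>
#include <c10/cuda/CUDACachingAllocator.h>

int main() {
  if (!torch::cuda::is_available()) {
    std::cout << "CUDA not available; nothing to demonstrate." << std::endl;
    return 0;
  }
  {
    // Allocate a ~256 MB CUDA tensor; the caching allocator reserves device memory.
    auto t = torch::ones({64, 1024, 1024}, torch::kCUDA);
  }  // The tensor is freed here, but the allocator keeps its blocks cached.

  // emptyCache() hands the unused cached blocks back to the CUDA driver so other
  // models or processes sharing the GPU can allocate them, at the cost of
  // re-allocation overhead on the next execution.
  c10::cuda::CUDACachingAllocator::emptyCache();
  return 0;
}
```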
@@ -2081,6 +2113,10 @@ TRITONBACKEND_ModelInstanceExecute(
   // specific request.
   instance_state->ProcessRequests(requests, request_count);

+  if(model_state->EnabledCacheCleaning()) {
+    instance_state->ClearCache();
+  }
+
   return nullptr;  // success
 }
