 #include "triton/backend/backend_model.h"
 #include "triton/backend/backend_model_instance.h"
 #include "triton/backend/backend_output_responder.h"
+#include "triton/common/nvtx.h"
 #include "triton/core/tritonbackend.h"
 
 #ifdef TRITON_PYTORCH_ENABLE_TORCHVISION
@@ -307,7 +308,6 @@ ModelState::ParseParameters()
       TRITONSERVER_ErrorDelete(err);
     }
   }
-
   LOG_MESSAGE(
       TRITONSERVER_LOG_INFO,
       (std::string("Inference Mode is ") +
@@ -926,6 +926,8 @@ ModelInstanceState::ProcessRequests(
        std::to_string(request_count) + " requests")
           .c_str());
 
+  NVTX_RANGE(nvtx_, "ProcessRequests " + Name());
+
   uint64_t exec_start_ns = 0;
   SET_TIMESTAMP(exec_start_ns);
 
@@ -1188,6 +1190,8 @@ ModelInstanceState::Execute(
     std::vector<torch::jit::IValue>* input_tensors,
     std::vector<torch::jit::IValue>* output_tensors)
 {
+  NVTX_RANGE(nvtx_, "Execute " + Name());
+
   torch::jit::IValue model_outputs_;
 
   try {
@@ -1758,6 +1762,8 @@ ModelInstanceState::ReadOutputTensors(
     TRITONBACKEND_Request** requests, const uint32_t request_count,
     std::vector<TRITONBACKEND_Response*>* responses, uint64_t* compute_end_ns)
 {
+  NVTX_RANGE(nvtx_, "ReadOutputTensors " + Name());
+
   BackendOutputResponder responder(
       requests, request_count, responses, model_state_->TritonMemoryManager(),
       model_state_->MaxBatchSize() > 0, model_state_->EnablePinnedInput(),
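Note: the NVTX_RANGE macro added at the top of ProcessRequests, Execute, and ReadOutputTensors comes from triton/common/nvtx.h. As a rough illustration of the pattern (not the actual implementation), an RAII-style NVTX range pushes a named range when the local object is constructed and pops it when the object goes out of scope, and compiles away to nothing when NVTX support is disabled. In the sketch below, the class name, include path, and TRITON_ENABLE_NVTX guard are assumptions; only nvtxRangePushA/nvtxRangePop are standard NVTX C API calls.

// Hypothetical sketch only -- the real macro lives in triton/common/nvtx.h
// and may differ. Include path and build-flag name are assumptions.
#include <string>
#include <nvToolsExt.h>  // NVTX C API: nvtxRangePushA / nvtxRangePop

class NvtxRangeSketch {
 public:
  // Open a named profiling range when the object is constructed.
  explicit NvtxRangeSketch(const std::string& label)
  {
    nvtxRangePushA(label.c_str());
  }
  // Close the range when the object leaves scope, so the range spans
  // exactly the enclosing block (e.g. ProcessRequests or Execute).
  ~NvtxRangeSketch() { nvtxRangePop(); }

  NvtxRangeSketch(const NvtxRangeSketch&) = delete;
  NvtxRangeSketch& operator=(const NvtxRangeSketch&) = delete;
};

// A macro in this style makes NVTX_RANGE(nvtx_, "Execute " + Name())
// declare a scoped local named nvtx_; when NVTX support is compiled out,
// the macro expands to nothing.
#ifdef TRITON_ENABLE_NVTX  // assumed flag name
#define NVTX_RANGE(VAR, LABEL) NvtxRangeSketch VAR(LABEL)
#else
#define NVTX_RANGE(VAR, LABEL)
#endif

Because the range is tied to the object's lifetime, no explicit "end" call is needed even on early returns or exceptions, which is why a single macro at the start of each instrumented function is enough to cover the whole call.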