This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit a5ef604

Merge pull request #299 from nvkevihu/cpp-profiler
[C++ Benchmark] Profiling to Tensorboard
2 parents 7831a76 + 8681b68 commit a5ef604

7 files changed: +193 additions, -64 deletions


tftrt/benchmarking-cpp/BUILD

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Description:
+# TensorFlow C++ inference example with TF-TRT model.
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "tf_protos_profiler_service",
+)
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    licenses = ["notice"],
+)
+
+tf_cc_binary(
+    name = "tftrt_benchmark_runner",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc/saved_model:loader",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
+        "//tensorflow/core/profiler/rpc/client:profiler_client",
+    ] + tf_protos_profiler_service(),
+)

tftrt/benchmarking-cpp/CMakeLists.txt

Lines changed: 11 additions & 11 deletions
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.13)
-project(TF_TRT_Benchmark_Runner)
+project(TFTRT_Benchmark_Runner)
 
 #-------------------------------------------------------------
 # Configuration
@@ -29,19 +29,19 @@ add_custom_target(tf_symlinks DEPENDS ${tf_framework_shared_lib} ${tf_shared_lib
 #-----------------------------------------------------------
 # Benchmark Runner Targets
 #-----------------------------------------------------------
-add_executable(tf_trt_benchmark_runner main.cc)
+add_executable(tftrt_benchmark_runner main.cc)
 
-target_link_libraries(tf_trt_benchmark_runner tensorflow_cc)
-target_link_libraries(tf_trt_benchmark_runner tensorflow_framework)
+target_link_libraries(tftrt_benchmark_runner tensorflow_cc)
+target_link_libraries(tftrt_benchmark_runner tensorflow_framework)
 
-target_compile_options(tf_trt_benchmark_runner PRIVATE -D_GLIBCXX_USE_CXX11_ABI=1 -DGOOGLE_CUDA -DGOOGLE_TENSORRT)
+target_compile_options(tftrt_benchmark_runner PRIVATE -D_GLIBCXX_USE_CXX11_ABI=1 -DGOOGLE_CUDA -DGOOGLE_TENSORRT)
 
-target_link_directories(tf_trt_benchmark_runner PRIVATE ${tf_python_dir})
-target_link_directories(tf_trt_benchmark_runner PRIVATE ${tf_dir})
+target_link_directories(tftrt_benchmark_runner PRIVATE ${tf_python_dir})
+target_link_directories(tftrt_benchmark_runner PRIVATE ${tf_dir})
 
-target_compile_options(tf_trt_benchmark_runner PRIVATE -O2 -Wl,-rpath=${tf_python_dir})
+target_compile_options(tftrt_benchmark_runner PRIVATE -O2 -Wl,-rpath=${tf_python_dir})
 
-target_include_directories(tf_trt_benchmark_runner PRIVATE ${tf_python_dir}/include)
-target_include_directories(tf_trt_benchmark_runner PRIVATE ${trt_include_path})
+target_include_directories(tftrt_benchmark_runner PRIVATE ${tf_python_dir}/include)
+target_include_directories(tftrt_benchmark_runner PRIVATE ${trt_include_path})
 
-add_dependencies(tf_trt_benchmark_runner tf_symlinks)
+add_dependencies(tftrt_benchmark_runner tf_symlinks)

tftrt/benchmarking-cpp/README.md

Lines changed: 64 additions & 46 deletions
@@ -1,46 +1,64 @@
-# Benchmark Runner
-
-This straightforward example uses TF's C++ API to serve a saved model and measure throughput. Built off of the [example here](https://github.com/tensorflow/tensorrt/tree/fb0a2cf638c8707041e42451c601247f04c7e6d8/tftrt/examples/cpp/image-classification).
-
-## Docker Environment
-
-Pull the image:
-
-```
-docker pull nvcr.io/nvidia/tensorflow:22.06-tf2-py3
-```
-
-Start the container:
-
-```
-docker run --rm --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it --name TFTRT_CPP nvcr.io/nvidia/tensorflow:22.06-tf2-py3
-```
-
-Clone the repo:
-
-```
-git clone https://github.com/tensorflow/tensorrt
-```
-
-## Model Conversion
-
-To convert a saved model to TF-TRT:
-
-```
-python3 convert_model.py --model-dir /path/to/model/dir --output-dir /path/to/dest/dir
-```
-
-## Building
-
-```
-cd tensorrt/tftrt/examples/cpp/benchmark_runner
-mkdir build && cd build
-cmake ..
-make
-```
-
-## Running
-
-```
-./tf_trt_benchmark_runner --model_path="/path/to/dest/dir"
-```
+# Benchmark Runner
+
+This straightforward example uses TF's C++ API to serve a saved model and measure throughput. Built off of the [example here](https://github.com/tensorflow/tensorrt/tree/fb0a2cf638c8707041e42451c601247f04c7e6d8/tftrt/examples/cpp/image-classification).
+
+## Docker Environment
+
+Start the container:
+
+```
+docker run --rm --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it --name TFTRT_CPP nvcr.io/nvidia/tensorflow:22.06-tf2-py3
+```
+
+Clone the repo:
+
+```
+git clone https://github.com/tensorflow/tensorrt
+```
+
+## Model Conversion
+
+To convert a saved model to TF-TRT:
+
+```
+python3 convert_model.py --model-dir /path/to/model/dir --output-dir /path/to/dest/dir
+```
+
+## Building
+
+The binary relies on a modified Tensorflow, which will need to be rebuilt. Internal users can use a container with Tensorflow already modified and built, instead of building with Bazel, which will take much longer.
+
+### Bazel
+
+The `setup.sh` script applies the Tensorflow patch and prepares the container for the Bazel build.
+
+```
+/workspace/tensorrt/tftrt/benchmarking-cpp/build-scripts/setup.sh
+cd /opt/tensorflow
+./tftrt-build.sh
+```
+
+The binary will be located at `/opt/tensorflow/tensorflow-source/bazel-bin/tensorflow/examples/benchmarking-cpp/tftrt_benchmark_runner`.
+
+### Prebuilt
+
+For internal NVIDIA users, a container with a prebuilt modified Tensorflow is available. In the container, use CMake to build the binary without needing to rebuild Tensorflow:
+
+```
+cd /workspace/tensorrt/tftrt/benchmarking-cpp
+mkdir build && cd build
+cmake ..
+make
+```
+
+The binary will be located at `/workspace/tensorrt/tftrt/benchmarking-cpp/tftrt_benchmark_runner`.
+
+## Running
+
+```
+./tftrt_benchmark_runner --model_path="/path/to/dest/dir"
+```
+
+### Profiling
+
+To profile, set the `--out_dir` flag. Run `tensorboard --logdir [out_dir]` to view results.

tftrt/benchmarking-cpp/build-scripts/setup.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+TF_DIR=/opt/tensorflow
+SRC_DIR=$TF_DIR/tensorflow-source/tensorflow/examples/benchmarking-cpp
+CUR_DIR=$(dirname $(dirname $(readlink -fm $0)))
+
+ln -s $CUR_DIR $SRC_DIR
+patch $TF_DIR/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD $SRC_DIR/build-scripts/tf-profiler.patch
+ln -s $SRC_DIR/build-scripts/tftrt-build.sh $TF_DIR

tftrt/benchmarking-cpp/build-scripts/tf-profiler.patch

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+--- /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD 2022-06-24 20:49:49.656963813 +0000
++++ /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD_PATCHED 2022-06-24 20:49:35.416963948 +0000
+@@ -25,6 +25,7 @@
+     visibility = [
+         "//tensorflow/compiler/xla/python:__pkg__",
+         "//tensorflow/python/profiler/internal:__pkg__",
++        "//tensorflow:internal",
+     ],
+     deps = [
+         ":profiler_client_for_pybind",
+@@ -67,7 +68,10 @@
+ cc_library(
+     name = "profiler_client",
+     hdrs = ["profiler_client.h"],
+-    visibility = ["//tensorflow/compiler/xla:__subpackages__"],
++    visibility = [
++        "//tensorflow/compiler/xla:__subpackages__",
++        "//tensorflow:internal",
++    ],
+     deps = [
+         ":profiler_client_impl",
+         "//tensorflow/core:lib",

tftrt/benchmarking-cpp/build-scripts/tftrt-build.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# TODO: to programatically determine the python and tf API versions
+PYVER=3.8 #TODO get this by parsing `python --version`
+TFAPI=2 #TODO get this by parsing tf.__version__
+
+/opt/tensorflow/nvbuild.sh --configonly --python$PYVER --v$TFAPI
+
+BUILD_OPTS="$(cat /opt/tensorflow/nvbuildopts)"
+if [[ "$TFAPI" == "2" ]]; then
+    BUILD_OPTS="--config=v2 $BUILD_OPTS"
+fi
+
+cd tensorflow-source
+bazel build $BUILD_OPTS tensorflow/examples/benchmarking-cpp/...

tftrt/benchmarking-cpp/main.cc

Lines changed: 43 additions & 7 deletions
@@ -7,14 +7,16 @@
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/saved_model/loader.h"
-#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -142,6 +144,24 @@ Status SetupCallable(std::unique_ptr<tensorflow::Session>& session,
   return session->MakeCallable(opts, handle);
 }
 
+// Start the profiling session.
+Status StartProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler) {
+  profiler = tensorflow::ProfilerSession::Create(
+    tensorflow::ProfilerSession::DefaultOptions()
+  );
+  return profiler->Status();
+}
+
+// Tear down the profiler and export tensorboard logs.
+Status StopProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler,
+                     const string& out_dir) {
+  tensorflow::profiler::XSpace xspace;
+  TF_RETURN_IF_ERROR(profiler->CollectData(&xspace));
+  tensorflow::profiler::ExportToTensorBoard(xspace, out_dir);
+  profiler.reset();
+  return Status::OK();
+}
+
 int main(int argc, char* argv[]) {
   // Parse arguments
   string model_path = "/path/to/model/";
@@ -151,6 +171,7 @@ int main(int argc, char* argv[]) {
   int32_t eval_iters = 800;
   bool input_from_device = true;
   bool output_to_host = true;
+  string out_dir = "";
   std::vector<Flag> flag_list = {
       Flag("model_path", &model_path, "graph to be executed"),
       Flag("signature_key", &signature_key, "the serving signature to use"),
@@ -159,6 +180,7 @@ int main(int argc, char* argv[]) {
       Flag("eval_iters", &eval_iters, "number of timed iterations to run"),
       Flag("input_from_device", &input_from_device, "use inputs from device, rather than host"),
       Flag("output_to_host", &output_to_host, "copy outputs to host after inference"),
+      Flag("out_dir", &out_dir, "if set, runs the profiler and exports to this directory"),
   };
   string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -205,18 +227,29 @@ int main(int argc, char* argv[]) {
   std::chrono::steady_clock::time_point eval_start_time;
   std::chrono::steady_clock::time_point start_time;
   std::chrono::steady_clock::time_point end_time;
+  std::unique_ptr<tensorflow::ProfilerSession> profiler;
   for (int i = 0; i < warmup_iters + eval_iters; i++) {
     if (i == warmup_iters) {
       LOG(INFO) << "Warmup done";
+      if (!out_dir.empty()) {
+        StartProfiling(profiler);
+      }
       eval_start_time = std::chrono::steady_clock::now();
     }
 
-    start_time = std::chrono::steady_clock::now();
-    TFTRT_ENSURE_OK(
-        bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
-    // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
-    TFTRT_ENSURE_OK(device->Sync());
-    end_time = std::chrono::steady_clock::now();
+    {
+      tensorflow::profiler::TraceMe trace([&i, &warmup_iters]() {
+        return tensorflow::profiler::TraceMeEncode(
+          "gpu_compute", {{"iter", i - warmup_iters}}
+        );
+      }, 1);
+      start_time = std::chrono::steady_clock::now();
+      TFTRT_ENSURE_OK(
+          bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
+      // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
+      TFTRT_ENSURE_OK(device->Sync());
+      end_time = std::chrono::steady_clock::now();
+    }
 
     if ((i % 10) == 0) {
       LOG(INFO) << "step: " << i;
@@ -225,6 +258,9 @@ int main(int argc, char* argv[]) {
     double duration = (end_time - start_time).count() / 1e6;
     infer_time.push_back(duration);
   }
+  if (!out_dir.empty()) {
+    StopProfiling(profiler, out_dir);
+  }
   TFTRT_ENSURE_OK(bundle.session->ReleaseCallable(handle));
 
   // Print results
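The profiling additions to `main.cc` are spread across several hunks above. The sketch below consolidates the pattern they implement — create a `ProfilerSession`, wrap each timed iteration in a `TraceMe`, then collect an `XSpace` and export it for TensorBoard — using only the APIs that appear in this commit. It is illustrative and not part of the commit: `RunOneInference()` and the `/tmp/tftrt_profile` output path are hypothetical stand-ins for the benchmark's timed step and `--out_dir` value, and the iteration count is arbitrary.

```cpp
// Illustrative sketch (not part of this commit): the profiling flow from
// main.cc, consolidated. Assumes the same TensorFlow build environment and
// the profiler deps from the BUILD file above.
#include <memory>
#include <string>

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/lib/profiler_session.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/profiler/rpc/client/capture_profile.h"

using tensorflow::Status;

void RunOneInference() {
  // Hypothetical placeholder for RunCallable(...) + device->Sync() in the
  // real benchmark loop.
}

int main() {
  const std::string out_dir = "/tmp/tftrt_profile";  // hypothetical --out_dir

  // StartProfiling: open a profiling session with default options.
  std::unique_ptr<tensorflow::ProfilerSession> profiler =
      tensorflow::ProfilerSession::Create(
          tensorflow::ProfilerSession::DefaultOptions());
  if (!profiler->Status().ok()) {
    LOG(ERROR) << "Could not start profiler: " << profiler->Status();
    return 1;
  }

  // Each timed iteration is wrapped in a TraceMe so it shows up as a named
  // "gpu_compute" event (with an "iter" attribute) in the trace viewer.
  for (int iter = 0; iter < 10; iter++) {
    tensorflow::profiler::TraceMe trace(
        [iter]() {
          return tensorflow::profiler::TraceMeEncode("gpu_compute",
                                                     {{"iter", iter}});
        },
        /*level=*/1);
    RunOneInference();
  }

  // StopProfiling: collect the trace into an XSpace, export TensorBoard
  // logs, and release the session.
  tensorflow::profiler::XSpace xspace;
  Status status = profiler->CollectData(&xspace);
  if (status.ok()) {
    status = tensorflow::profiler::ExportToTensorBoard(xspace, out_dir);
  }
  profiler.reset();
  LOG(INFO) << "Profile export finished with status: " << status;
  return status.ok() ? 0 : 1;
}
```

After a run with `--out_dir` set, `tensorboard --logdir <out_dir>` picks up the exported logs, as the README above notes.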
0 commit comments