This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit a5ef604

Merge pull request #299 from nvkevihu/cpp-profiler
[C++ Benchmark] Profiling to Tensorboard
2 parents 7831a76 + 8681b68 commit a5ef604

7 files changed: +193 additions, -64 deletions


tftrt/benchmarking-cpp/BUILD

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Description:
+# TensorFlow C++ inference example with TF-TRT model.
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/core/platform:build_config.bzl",
+    "tf_protos_profiler_service",
+)
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+    licenses = ["notice"],
+)
+
+tf_cc_binary(
+    name = "tftrt_benchmark_runner",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc/saved_model:loader",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core/profiler/rpc/client:capture_profile",
+        "//tensorflow/core/profiler/rpc/client:profiler_client",
+    ] + tf_protos_profiler_service(),
+)

tftrt/benchmarking-cpp/CMakeLists.txt

Lines changed: 11 additions & 11 deletions
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.13)
-project(TF_TRT_Benchmark_Runner)
+project(TFTRT_Benchmark_Runner)
 
 #-------------------------------------------------------------
 # Configuration
@@ -29,19 +29,19 @@ add_custom_target(tf_symlinks DEPENDS ${tf_framework_shared_lib} ${tf_shared_lib
 #-----------------------------------------------------------
 # Benchmark Runner Targets
 #-----------------------------------------------------------
-add_executable(tf_trt_benchmark_runner main.cc)
+add_executable(tftrt_benchmark_runner main.cc)
 
-target_link_libraries(tf_trt_benchmark_runner tensorflow_cc)
-target_link_libraries(tf_trt_benchmark_runner tensorflow_framework)
+target_link_libraries(tftrt_benchmark_runner tensorflow_cc)
+target_link_libraries(tftrt_benchmark_runner tensorflow_framework)
 
-target_compile_options(tf_trt_benchmark_runner PRIVATE -D_GLIBCXX_USE_CXX11_ABI=1 -DGOOGLE_CUDA -DGOOGLE_TENSORRT)
+target_compile_options(tftrt_benchmark_runner PRIVATE -D_GLIBCXX_USE_CXX11_ABI=1 -DGOOGLE_CUDA -DGOOGLE_TENSORRT)
 
-target_link_directories(tf_trt_benchmark_runner PRIVATE ${tf_python_dir})
-target_link_directories(tf_trt_benchmark_runner PRIVATE ${tf_dir})
+target_link_directories(tftrt_benchmark_runner PRIVATE ${tf_python_dir})
+target_link_directories(tftrt_benchmark_runner PRIVATE ${tf_dir})
 
-target_compile_options(tf_trt_benchmark_runner PRIVATE -O2 -Wl,-rpath=${tf_python_dir})
+target_compile_options(tftrt_benchmark_runner PRIVATE -O2 -Wl,-rpath=${tf_python_dir})
 
-target_include_directories(tf_trt_benchmark_runner PRIVATE ${tf_python_dir}/include)
-target_include_directories(tf_trt_benchmark_runner PRIVATE ${trt_include_path})
+target_include_directories(tftrt_benchmark_runner PRIVATE ${tf_python_dir}/include)
+target_include_directories(tftrt_benchmark_runner PRIVATE ${trt_include_path})
 
-add_dependencies(tf_trt_benchmark_runner tf_symlinks)
+add_dependencies(tftrt_benchmark_runner tf_symlinks)

tftrt/benchmarking-cpp/README.md

Lines changed: 64 additions & 46 deletions
@@ -1,46 +1,64 @@
-# Benchmark Runner
-
-This straightforward example uses TF's C++ API to serve a saved model and measure throughput. Built off of the [example here](https://github.com/tensorflow/tensorrt/tree/fb0a2cf638c8707041e42451c601247f04c7e6d8/tftrt/examples/cpp/image-classification).
-
-## Docker Environment
-
-Pull the image:
-
-```
-docker pull nvcr.io/nvidia/tensorflow:22.06-tf2-py3
-```
-
-Start the container:
-
-```
-docker run --rm --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it --name TFTRT_CPP nvcr.io/nvidia/tensorflow:22.06-tf2-py3
-```
-
-Clone the repo:
-
-```
-git clone https://github.com/tensorflow/tensorrt
-```
-
-## Model Conversion
-
-To convert a saved model to TF-TRT:
-
-```
-python3 convert_model.py --model-dir /path/to/model/dir --output-dir /path/to/dest/dir
-```
-
-## Building
-
-```
-cd tensorrt/tftrt/examples/cpp/benchmark_runner
-mkdir build && cd build
-cmake ..
-make
-```
-
-## Running
-
-```
-./tf_trt_benchmark_runner --model_path="/path/to/dest/dir"
-```
+# Benchmark Runner
+
+This straightforward example uses TF's C++ API to serve a saved model and measure throughput. Built off of the [example here](https://github.com/tensorflow/tensorrt/tree/fb0a2cf638c8707041e42451c601247f04c7e6d8/tftrt/examples/cpp/image-classification).
+
+## Docker Environment
+
+Start the container:
+
+```
+docker run --rm --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it --name TFTRT_CPP nvcr.io/nvidia/tensorflow:22.06-tf2-py3
+```
+
+Clone the repo:
+
+```
+git clone https://github.com/tensorflow/tensorrt
+```
+
+## Model Conversion
+
+To convert a saved model to TF-TRT:
+
+```
+python3 convert_model.py --model-dir /path/to/model/dir --output-dir /path/to/dest/dir
+```
+
+## Building
+
+The binary relies on a modified Tensorflow, which will need to be rebuilt. Internal users can use a container with Tensorflow already modified and built, instead of building with Bazel, which will take much longer.
+
+### Bazel
+
+The `setup.sh` script applies the Tensorflow patch and prepares the container for the Bazel build.
+
+```
+/workspace/tensorrt/tftrt/benchmarking-cpp/build-scripts/setup.sh
+cd /opt/tensorflow
+./tftrt-build.sh
+```
+
+The binary will be located at `/opt/tensorflow/tensorflow-source/bazel-bin/tensorflow/examples/benchmarking-cpp/tftrt_benchmark_runner`.
+
+### Prebuilt
+
+For internal NVIDIA users, a container with a prebuilt modified Tensorflow is available. In the container, use CMake to build the binary without needing to rebuild Tensorflow:
+
+```
+cd /workspace/tensorrt/tftrt/benchmarking-cpp
+mkdir build && cd build
+cmake ..
+make
+```
+
+The binary will be located at `/workspace/tensorrt/tftrt/benchmarking-cpp/tftrt_benchmark_runner`.
+
+## Running
+
+```
+./tftrt_benchmark_runner --model_path="/path/to/dest/dir"
+```
+
+### Profiling
+
+To profile, set the `--out_dir` flag. Run `tensorboard --logdir [out_dir]` to view results.

tftrt/benchmarking-cpp/build-scripts/setup.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+TF_DIR=/opt/tensorflow
+SRC_DIR=$TF_DIR/tensorflow-source/tensorflow/examples/benchmarking-cpp
+CUR_DIR=$(dirname $(dirname $(readlink -fm $0)))
+
+ln -s $CUR_DIR $SRC_DIR
+patch $TF_DIR/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD $SRC_DIR/build-scripts/tf-profiler.patch
+ln -s $SRC_DIR/build-scripts/tftrt-build.sh $TF_DIR

tftrt/benchmarking-cpp/build-scripts/tf-profiler.patch

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+--- /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD 2022-06-24 20:49:49.656963813 +0000
++++ /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD_PATCHED 2022-06-24 20:49:35.416963948 +0000
+@@ -25,6 +25,7 @@
+     visibility = [
+         "//tensorflow/compiler/xla/python:__pkg__",
+         "//tensorflow/python/profiler/internal:__pkg__",
++        "//tensorflow:internal",
+     ],
+     deps = [
+         ":profiler_client_for_pybind",
+@@ -67,7 +68,10 @@
+ cc_library(
+     name = "profiler_client",
+     hdrs = ["profiler_client.h"],
+-    visibility = ["//tensorflow/compiler/xla:__subpackages__"],
++    visibility = [
++        "//tensorflow/compiler/xla:__subpackages__",
++        "//tensorflow:internal",
++    ],
+     deps = [
+         ":profiler_client_impl",
+         "//tensorflow/core:lib",

tftrt/benchmarking-cpp/build-scripts/tftrt-build.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# TODO: to programatically determine the python and tf API versions
+PYVER=3.8 #TODO get this by parsing `python --version`
+TFAPI=2 #TODO get this by parsing tf.__version__
+
+/opt/tensorflow/nvbuild.sh --configonly --python$PYVER --v$TFAPI
+
+BUILD_OPTS="$(cat /opt/tensorflow/nvbuildopts)"
+if [[ "$TFAPI" == "2" ]]; then
+    BUILD_OPTS="--config=v2 $BUILD_OPTS"
+fi
+
+cd tensorflow-source
+bazel build $BUILD_OPTS tensorflow/examples/benchmarking-cpp/...

tftrt/benchmarking-cpp/main.cc

Lines changed: 43 additions & 7 deletions
@@ -7,14 +7,16 @@
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/saved_model/loader.h"
-#include "tensorflow/compiler/tf2tensorrt/trt_convert_api.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -142,6 +144,24 @@ Status SetupCallable(std::unique_ptr<tensorflow::Session>& session,
   return session->MakeCallable(opts, handle);
 }
 
+// Start the profiling session.
+Status StartProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler) {
+  profiler = tensorflow::ProfilerSession::Create(
+    tensorflow::ProfilerSession::DefaultOptions()
+  );
+  return profiler->Status();
+}
+
+// Tear down the profiler and export tensorboard logs.
+Status StopProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler,
+                     const string& out_dir) {
+  tensorflow::profiler::XSpace xspace;
+  TF_RETURN_IF_ERROR(profiler->CollectData(&xspace));
+  tensorflow::profiler::ExportToTensorBoard(xspace, out_dir);
+  profiler.reset();
+  return Status::OK();
+}
+
 int main(int argc, char* argv[]) {
   // Parse arguments
   string model_path = "/path/to/model/";
@@ -151,6 +171,7 @@ int main(int argc, char* argv[]) {
   int32_t eval_iters = 800;
   bool input_from_device = true;
   bool output_to_host = true;
+  string out_dir = "";
   std::vector<Flag> flag_list = {
       Flag("model_path", &model_path, "graph to be executed"),
       Flag("signature_key", &signature_key, "the serving signature to use"),
@@ -159,6 +180,7 @@ int main(int argc, char* argv[]) {
       Flag("eval_iters", &eval_iters, "number of timed iterations to run"),
       Flag("input_from_device", &input_from_device, "use inputs from device, rather than host"),
       Flag("output_to_host", &output_to_host, "copy outputs to host after inference"),
+      Flag("out_dir", &out_dir, "if set, runs the profiler and exports to this directory"),
   };
   string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -205,18 +227,29 @@ int main(int argc, char* argv[]) {
   std::chrono::steady_clock::time_point eval_start_time;
   std::chrono::steady_clock::time_point start_time;
   std::chrono::steady_clock::time_point end_time;
+  std::unique_ptr<tensorflow::ProfilerSession> profiler;
   for (int i = 0; i < warmup_iters + eval_iters; i++) {
     if (i == warmup_iters) {
       LOG(INFO) << "Warmup done";
+      if (!out_dir.empty()) {
+        StartProfiling(profiler);
+      }
       eval_start_time = std::chrono::steady_clock::now();
     }
 
-    start_time = std::chrono::steady_clock::now();
-    TFTRT_ENSURE_OK(
-        bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
-    // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
-    TFTRT_ENSURE_OK(device->Sync());
-    end_time = std::chrono::steady_clock::now();
+    {
+      tensorflow::profiler::TraceMe trace([&i, &warmup_iters]() {
+        return tensorflow::profiler::TraceMeEncode(
+          "gpu_compute", {{"iter", i - warmup_iters}}
+        );
+      }, 1);
+      start_time = std::chrono::steady_clock::now();
+      TFTRT_ENSURE_OK(
+          bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
+      // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
+      TFTRT_ENSURE_OK(device->Sync());
+      end_time = std::chrono::steady_clock::now();
+    }
 
     if ((i % 10) == 0) {
       LOG(INFO) << "step: " << i;
@@ -225,6 +258,9 @@ int main(int argc, char* argv[]) {
     double duration = (end_time - start_time).count() / 1e6;
     infer_time.push_back(duration);
   }
+  if (!out_dir.empty()) {
+    StopProfiling(profiler, out_dir);
+  }
   TFTRT_ENSURE_OK(bundle.session->ReleaseCallable(handle));
 
   // Print results
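The profiling additions to `main.cc` are spread across several hunks above. The sketch below consolidates the pattern they implement — create a `ProfilerSession`, wrap each timed iteration in a `TraceMe`, then collect an `XSpace` and export it for TensorBoard — using only the APIs that appear in this commit. It is illustrative and not part of the commit: `RunOneInference()` and the `/tmp/tftrt_profile` output path are hypothetical stand-ins for the benchmark's timed step and `--out_dir` value, and the iteration count is arbitrary.

```cpp
// Illustrative sketch (not part of this commit): the profiling flow from
// main.cc, consolidated. Assumes the same TensorFlow build environment and
// the profiler deps from the BUILD file above.
#include <memory>
#include <string>

#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/lib/profiler_session.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/profiler/rpc/client/capture_profile.h"

using tensorflow::Status;

void RunOneInference() {
  // Hypothetical placeholder for RunCallable(...) + device->Sync() in the
  // real benchmark loop.
}

int main() {
  const std::string out_dir = "/tmp/tftrt_profile";  // hypothetical --out_dir

  // StartProfiling: open a profiling session with default options.
  std::unique_ptr<tensorflow::ProfilerSession> profiler =
      tensorflow::ProfilerSession::Create(
          tensorflow::ProfilerSession::DefaultOptions());
  if (!profiler->Status().ok()) {
    LOG(ERROR) << "Could not start profiler: " << profiler->Status();
    return 1;
  }

  // Each timed iteration is wrapped in a TraceMe so it shows up as a named
  // "gpu_compute" event (with an "iter" attribute) in the trace viewer.
  for (int iter = 0; iter < 10; iter++) {
    tensorflow::profiler::TraceMe trace(
        [iter]() {
          return tensorflow::profiler::TraceMeEncode("gpu_compute",
                                                     {{"iter", iter}});
        },
        /*level=*/1);
    RunOneInference();
  }

  // StopProfiling: collect the trace into an XSpace, export TensorBoard
  // logs, and release the session.
  tensorflow::profiler::XSpace xspace;
  Status status = profiler->CollectData(&xspace);
  if (status.ok()) {
    status = tensorflow::profiler::ExportToTensorBoard(xspace, out_dir);
  }
  profiler.reset();
  LOG(INFO) << "Profile export finished with status: " << status;
  return status.ok() ? 0 : 1;
}
```

After a run with `--out_dir` set, `tensorboard --logdir <out_dir>` picks up the exported logs, as the README above notes.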
0 commit comments