This repository was archived by the owner on Feb 3, 2025. It is now read-only.

Commit a937c24: Change build system and add profiler

1 parent 7831a76

File tree

6 files changed: +167 -52 lines

- tftrt/benchmarking-cpp/BUILD
- tftrt/benchmarking-cpp/README.md
- tftrt/benchmarking-cpp/build-scripts/setup.sh
- tftrt/benchmarking-cpp/build-scripts/tf-profiler.patch
- tftrt/benchmarking-cpp/build-scripts/tftrt-build.sh
- tftrt/benchmarking-cpp/main.cc

tftrt/benchmarking-cpp/BUILD

Lines changed: 33 additions & 0 deletions
```
# Description:
#   TensorFlow C++ inference example with TF-TRT model.

load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
load(
    "//tensorflow/core/platform:build_config.bzl",
    "tf_protos_profiler_service",
)

package(
    default_visibility = ["//tensorflow:internal"],
    licenses = ["notice"],
)

tf_cc_binary(
    name = "tftrt_benchmark_runner",
    srcs = [
        "main.cc",
    ],
    deps = [
        "//tensorflow/cc:cc_ops",
        "//tensorflow/cc/saved_model:loader",
        "//tensorflow/core:core_cpu",
        "//tensorflow/core:framework",
        "//tensorflow/core:framework_internal",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:tensorflow",
        "//tensorflow/core/profiler/rpc/client:capture_profile",
        "//tensorflow/core/profiler/rpc/client:profiler_client",
    ] + tf_protos_profiler_service(),
)
```

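Once `build-scripts/setup.sh` (below) has symlinked this directory into the TF source tree, `tftrt_benchmark_runner` is an ordinary bazel target. A minimal sketch of a direct invocation, assuming the container layout used by the build scripts (in practice `tftrt-build.sh` wraps this with NVIDIA's configure options from `nvbuildopts`):

```
cd /opt/tensorflow/tensorflow-source
bazel build //tensorflow/examples/benchmarking-cpp:tftrt_benchmark_runner
```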
tftrt/benchmarking-cpp/README.md

Lines changed: 49 additions & 46 deletions
Old version (deleted). The intro, Docker Environment, and Model Conversion sections are identical to the new version below; only the Building and Running sections changed:
## Building

```
cd tensorrt/tftrt/examples/cpp/benchmark_runner
mkdir build && cd build
cmake ..
make
```

## Running

```
./tf_trt_benchmark_runner --model_path="/path/to/dest/dir"
```
New version:

# Benchmark Runner

This straightforward example uses TF's C++ API to serve a saved model and measure throughput. It is built off of the [image-classification example](https://github.com/tensorflow/tensorrt/tree/fb0a2cf638c8707041e42451c601247f04c7e6d8/tftrt/examples/cpp/image-classification).

## Docker Environment

Pull the image:

```
docker pull nvcr.io/nvidia/tensorflow:22.06-tf2-py3
```

Start the container:

```
docker run --rm --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 -it --name TFTRT_CPP nvcr.io/nvidia/tensorflow:22.06-tf2-py3
```

Clone the repo:

```
git clone https://github.com/tensorflow/tensorrt
```

## Model Conversion

To convert a saved model to TF-TRT:

```
python3 convert_model.py --model-dir /path/to/model/dir --output-dir /path/to/dest/dir
```
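`convert_model.py` is not part of this commit; assuming it wraps TF-TRT's standard `TrtGraphConverterV2` API, a minimal equivalent (sketch, default conversion parameters only) is:

```
python3 - <<'EOF'
from tensorflow.python.compiler.tensorrt import trt_convert as trt

# Convert the saved model with default TF-TRT conversion parameters.
converter = trt.TrtGraphConverterV2(input_saved_model_dir="/path/to/model/dir")
converter.convert()
converter.save("/path/to/dest/dir")
EOF
```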
## Building

```
/workspace/tensorrt/tftrt/benchmarking-cpp/build-scripts/setup.sh
cd /opt/tensorflow
./tftrt-build.sh
```

## Running

```
/opt/tensorflow/tensorflow-source/bazel-bin/tensorflow/examples/benchmarking-cpp/tftrt_benchmark_runner --model_path="/path/to/dest/dir"
```

## Profiling

To profile, set the `--out_dir` flag. Run `tensorboard --logdir [out_dir]` to view results.
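For example (the output directory is illustrative):

```
/opt/tensorflow/tensorflow-source/bazel-bin/tensorflow/examples/benchmarking-cpp/tftrt_benchmark_runner \
    --model_path="/path/to/dest/dir" --out_dir="/tmp/tftrt_profile"
tensorboard --logdir /tmp/tftrt_profile
```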
tftrt/benchmarking-cpp/build-scripts/setup.sh

Lines changed: 7 additions & 0 deletions
```
TF_DIR=/opt/tensorflow
SRC_DIR=$TF_DIR/tensorflow-source/tensorflow/examples/benchmarking-cpp
CUR_DIR=$(dirname $(dirname $(readlink -fm $0)))

# Symlink this example into the TF source tree so bazel can build it,
# widen the profiler client's visibility, and expose the build script
# at the top of the container's TF directory.
ln -s $CUR_DIR $SRC_DIR
patch $TF_DIR/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD $SRC_DIR/build-scripts/tf-profiler.patch
ln -s $SRC_DIR/build-scripts/tftrt-build.sh $TF_DIR
```
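A quick sanity check that the symlink landed, assuming the repo was cloned under `/workspace` as in the README:

```
/workspace/tensorrt/tftrt/benchmarking-cpp/build-scripts/setup.sh
ls -l /opt/tensorflow/tensorflow-source/tensorflow/examples/benchmarking-cpp
```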
tftrt/benchmarking-cpp/build-scripts/tf-profiler.patch

Lines changed: 22 additions & 0 deletions
```
--- /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD	2022-06-24 20:49:49.656963813 +0000
+++ /opt/tensorflow/tensorflow-source/tensorflow/core/profiler/rpc/client/BUILD_PATCHED	2022-06-24 20:49:35.416963948 +0000
@@ -25,6 +25,7 @@
     visibility = [
         "//tensorflow/compiler/xla/python:__pkg__",
         "//tensorflow/python/profiler/internal:__pkg__",
+        "//tensorflow:internal",
     ],
     deps = [
         ":profiler_client_for_pybind",
@@ -67,7 +68,10 @@
 cc_library(
     name = "profiler_client",
     hdrs = ["profiler_client.h"],
-    visibility = ["//tensorflow/compiler/xla:__subpackages__"],
+    visibility = [
+        "//tensorflow/compiler/xla:__subpackages__",
+        "//tensorflow:internal",
+    ],
     deps = [
         ":profiler_client_impl",
         "//tensorflow/core:lib",
```
tftrt/benchmarking-cpp/build-scripts/tftrt-build.sh

Lines changed: 13 additions & 0 deletions
```
# TODO: programmatically determine the python and TF API versions
PYVER=3.8  # TODO: get this by parsing `python --version`
TFAPI=2    # TODO: get this by parsing tf.__version__

# Configure the TF build using NVIDIA's build script, then reuse its options.
/opt/tensorflow/nvbuild.sh --configonly --python$PYVER --v$TFAPI

BUILD_OPTS="$(cat /opt/tensorflow/nvbuildopts)"
if [[ "$TFAPI" == "2" ]]; then
  BUILD_OPTS="--config=v2 $BUILD_OPTS"
fi

cd tensorflow-source
bazel build $BUILD_OPTS tensorflow/examples/benchmarking-cpp/...
```
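One way to resolve those TODOs (untested sketch; assumes `python3` and an importable `tensorflow` are available in the build environment):

```
PYVER=$(python3 -c 'import sys; print("%d.%d" % sys.version_info[:2])')
TFAPI=$(python3 -c 'import tensorflow as tf; print(tf.__version__.split(".")[0])')
```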

tftrt/benchmarking-cpp/main.cc

Lines changed: 43 additions & 6 deletions
```
@@ -15,6 +15,9 @@
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/profiler/lib/profiler_session.h"
+#include "tensorflow/core/profiler/lib/traceme.h"
+#include "tensorflow/core/profiler/rpc/client/capture_profile.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
@@ -142,6 +145,24 @@ Status SetupCallable(std::unique_ptr<tensorflow::Session>& session,
   return session->MakeCallable(opts, handle);
 }
 
+// Start the profiling session.
+Status StartProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler) {
+  profiler = tensorflow::ProfilerSession::Create(
+      tensorflow::ProfilerSession::DefaultOptions()
+  );
+  return profiler->Status();
+}
+
+// Tear down the profiler and export tensorboard logs.
+Status StopProfiling(std::unique_ptr<tensorflow::ProfilerSession>& profiler,
+                     const string& out_dir) {
+  tensorflow::profiler::XSpace xspace;
+  TF_RETURN_IF_ERROR(profiler->CollectData(&xspace));
+  tensorflow::profiler::ExportToTensorBoard(xspace, out_dir);
+  profiler.reset();
+  return Status::OK();
+}
+
 int main(int argc, char* argv[]) {
   // Parse arguments
   string model_path = "/path/to/model/";
@@ -151,6 +172,7 @@ int main(int argc, char* argv[]) {
   int32_t eval_iters = 800;
   bool input_from_device = true;
   bool output_to_host = true;
+  string out_dir = "";
   std::vector<Flag> flag_list = {
       Flag("model_path", &model_path, "graph to be executed"),
       Flag("signature_key", &signature_key, "the serving signature to use"),
@@ -159,6 +181,7 @@ int main(int argc, char* argv[]) {
       Flag("eval_iters", &eval_iters, "number of timed iterations to run"),
       Flag("input_from_device", &input_from_device, "use inputs from device, rather than host"),
       Flag("output_to_host", &output_to_host, "copy outputs to host after inference"),
+      Flag("out_dir", &out_dir, "if set, runs the profiler and exports to this directory"),
   };
   string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   const bool parse_result = tensorflow::Flags::Parse(&argc, argv, flag_list);
@@ -205,18 +228,29 @@ int main(int argc, char* argv[]) {
   std::chrono::steady_clock::time_point eval_start_time;
   std::chrono::steady_clock::time_point start_time;
   std::chrono::steady_clock::time_point end_time;
+  std::unique_ptr<tensorflow::ProfilerSession> profiler;
   for (int i = 0; i < warmup_iters + eval_iters; i++) {
     if (i == warmup_iters) {
       LOG(INFO) << "Warmup done";
+      if (!out_dir.empty()) {
+        StartProfiling(profiler);
+      }
       eval_start_time = std::chrono::steady_clock::now();
     }
 
-    start_time = std::chrono::steady_clock::now();
-    TFTRT_ENSURE_OK(
-        bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
-    // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
-    TFTRT_ENSURE_OK(device->Sync());
-    end_time = std::chrono::steady_clock::now();
+    {
+      tensorflow::profiler::TraceMe trace([&i, &warmup_iters]() {
+        return tensorflow::profiler::TraceMeEncode(
+            "gpu_compute", {{"iter", i - warmup_iters}}
+        );
+      }, 1);
+      start_time = std::chrono::steady_clock::now();
+      TFTRT_ENSURE_OK(
+          bundle.session->RunCallable(handle, inputs_device, &outputs, nullptr));
+      // Sync, as `set_fetch_skip_sync(false)` is currently not implemented
+      TFTRT_ENSURE_OK(device->Sync());
+      end_time = std::chrono::steady_clock::now();
+    }
 
     if ((i % 10) == 0) {
       LOG(INFO) << "step: " << i;
@@ -225,6 +259,9 @@ int main(int argc, char* argv[]) {
     double duration = (end_time - start_time).count() / 1e6;
     infer_time.push_back(duration);
   }
+  if (!out_dir.empty()) {
+    StopProfiling(profiler, out_dir);
+  }
   TFTRT_ENSURE_OK(bundle.session->ReleaseCallable(handle));
 
   // Print results
```