Skip to content

Commit 3cf2099

Browse files
tgerdesnv authored and mc-nv committed
Support for fixed number of requests (#633)
* first pass. Hardcoded values
* Working for concurrency (hardcoded whenever count windows is used for now)
* working for req rate as well
* Add CLI. Add/fix unit tests
* Remove hack. Restore all normal functionality
* Refactor thread config into one class. Add more testing
* Rename arg to request-count
* Fix request rate bug
* Update info print
* fix corner case
* move fixme to a story tag
* add assert to avoid corner case
* rename variables
* self review #1
* copyright changes
* add doxygen to functions
* Don't allow sweeping over multiple concurrency or request rate with request-count
1 parent 80c9273 commit 3cf2099

24 files changed

+516
-189
lines changed

src/c++/perf_analyzer/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ set(
112112
profile_data_exporter.h
113113
periodic_concurrency_manager.h
114114
periodic_concurrency_worker.h
115+
thread_config.h
115116
)
116117

117118
add_executable(

src/c++/perf_analyzer/client_backend/client_backend.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ enum BackendKind {
138138
TRITON_C_API = 3,
139139
OPENAI = 4
140140
};
141+
std::string BackendKindToString(const BackendKind kind);
142+
141143
enum ProtocolType { HTTP = 0, GRPC = 1, UNKNOWN = 2 };
142144
enum GrpcCompressionAlgorithm {
143145
COMPRESS_NONE = 0,

src/c++/perf_analyzer/command_line_parser.cc

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ CLParser::Usage(const std::string& msg)
137137
"profiling>"
138138
<< std::endl;
139139
std::cerr << "\t--percentile <percentile>" << std::endl;
140+
std::cerr << "\t--request-count <number of requests>" << std::endl;
140141
std::cerr << "\tDEPRECATED OPTIONS" << std::endl;
141142
std::cerr << "\t-t <number of concurrent requests>" << std::endl;
142143
std::cerr << "\t-c <maximum concurrency>" << std::endl;
@@ -463,6 +464,14 @@ CLParser::Usage(const std::string& msg)
463464
"that the average latency is used to determine stability",
464465
18)
465466
<< std::endl;
467+
std::cerr
468+
<< FormatMessage(
469+
" --request-count: Specifies a total number of requests to "
470+
"use for measurement. The default is 0, which means that there is "
471+
"no request count and the measurement will proceed using windows "
472+
"until stabilization is detected.",
473+
18)
474+
<< std::endl;
466475
std::cerr << FormatMessage(
467476
" --serial-sequences: Enables serial sequence mode "
468477
"where a maximum of one request is outstanding at a time "
@@ -879,6 +888,7 @@ CLParser::ParseCommandLine(int argc, char** argv)
879888
{"request-period", required_argument, 0, 59},
880889
{"request-parameter", required_argument, 0, 60},
881890
{"endpoint", required_argument, 0, 61},
891+
{"request-count", required_argument, 0, 62},
882892
{0, 0, 0, 0}};
883893

884894
// Parse commandline...
@@ -1614,6 +1624,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
16141624
params_->endpoint = optarg;
16151625
break;
16161626
}
1627+
case 62: {
1628+
if (std::stoi(optarg) < 0) {
1629+
Usage("Failed to parse --request-count. The value must be > 0.");
1630+
}
1631+
params_->request_count = std::stoi(optarg);
1632+
break;
1633+
}
16171634
case 'v':
16181635
params_->extra_verbose = params_->verbose;
16191636
params_->verbose = true;
@@ -1705,6 +1722,13 @@ CLParser::ParseCommandLine(int argc, char** argv)
17051722
// Will be using user-provided time intervals, hence no control variable.
17061723
params_->search_mode = SearchMode::NONE;
17071724
}
1725+
1726+
// When the request-count feature is enabled, override the measurement mode to
1727+
// be count windows with a window size of the requested count
1728+
if (params_->request_count) {
1729+
params_->measurement_mode = MeasurementMode::COUNT_WINDOWS;
1730+
params_->measurement_request_count = params_->request_count;
1731+
}
17081732
}
17091733

17101734
void
@@ -1874,6 +1898,31 @@ CLParser::VerifyOptions()
18741898
"binary search mode.");
18751899
}
18761900

1901+
if (params_->request_count != 0) {
1902+
if (params_->using_concurrency_range) {
1903+
if (params_->request_count < params_->concurrency_range.start) {
1904+
Usage("request-count can not be less than concurrency");
1905+
}
1906+
if (params_->concurrency_range.start < params_->concurrency_range.end) {
1907+
Usage(
1908+
"request-count not supported with multiple concurrency values in "
1909+
"one run");
1910+
}
1911+
}
1912+
if (params_->using_request_rate_range) {
1913+
if (params_->request_count <
1914+
static_cast<int>(params_->request_rate_range[0])) {
1915+
Usage("request-count can not be less than request-rate");
1916+
}
1917+
if (params_->request_rate_range[SEARCH_RANGE::kSTART] <
1918+
params_->request_rate_range[SEARCH_RANGE::kEND]) {
1919+
Usage(
1920+
"request-count not supported with multiple request-rate values in "
1921+
"one run");
1922+
}
1923+
}
1924+
}
1925+
18771926
if (params_->kind == cb::TENSORFLOW_SERVING) {
18781927
if (params_->protocol != cb::ProtocolType::GRPC) {
18791928
Usage(

src/c++/perf_analyzer/command_line_parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ struct PerfAnalyzerParameters {
6262
uint64_t latency_threshold_ms = NO_LIMIT;
6363
double stability_threshold = 0.1;
6464
size_t max_trials = 10;
65+
size_t request_count = 0;
6566
bool zero_input = false;
6667
size_t string_length = 128;
6768
std::string string_data;

src/c++/perf_analyzer/concurrency_manager.cc

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -84,10 +84,10 @@ ConcurrencyManager::InitManagerFinalize()
8484

8585
cb::Error
8686
ConcurrencyManager::ChangeConcurrencyLevel(
87-
const size_t concurrent_request_count)
87+
const size_t concurrent_request_count, const size_t request_count)
8888
{
8989
PauseSequenceWorkers();
90-
ReconfigThreads(concurrent_request_count);
90+
ReconfigThreads(concurrent_request_count, request_count);
9191
ResumeSequenceWorkers();
9292

9393
std::cout << "Request concurrency: " << concurrent_request_count << std::endl;
@@ -109,7 +109,8 @@ ConcurrencyManager::PauseSequenceWorkers()
109109
}
110110

111111
void
112-
ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
112+
ConcurrencyManager::ReconfigThreads(
113+
size_t concurrent_request_count, size_t request_count)
113114
{
114115
// Always prefer to create new threads if the maximum limit has not been met
115116
//
@@ -121,8 +122,7 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
121122
(threads_.size() < max_threads_)) {
122123
// Launch new thread for inferencing
123124
threads_stat_.emplace_back(new ThreadStat());
124-
threads_config_.emplace_back(
125-
new ConcurrencyWorker::ThreadConfig(threads_config_.size()));
125+
threads_config_.emplace_back(new ThreadConfig(threads_config_.size()));
126126

127127
workers_.push_back(
128128
MakeWorker(threads_stat_.back(), threads_config_.back()));
@@ -138,13 +138,21 @@ ConcurrencyManager::ReconfigThreads(const size_t concurrent_request_count)
138138
// and spread the remaining value
139139
size_t avg_concurrency = concurrent_request_count / threads_.size();
140140
size_t threads_add_one = concurrent_request_count % threads_.size();
141+
142+
size_t avg_req_count = request_count / threads_.size();
143+
size_t req_count_add_one = request_count % threads_.size();
144+
141145
size_t seq_stat_index_offset = 0;
142146
active_threads_ = 0;
143147
for (size_t i = 0; i < threads_stat_.size(); i++) {
144148
size_t concurrency = avg_concurrency + (i < threads_add_one ? 1 : 0);
145149

146150
threads_config_[i]->concurrency_ = concurrency;
147151
threads_config_[i]->seq_stat_index_offset_ = seq_stat_index_offset;
152+
153+
size_t thread_num_reqs = avg_req_count + (i < req_count_add_one ? 1 : 0);
154+
threads_config_[i]->num_requests_ = thread_num_reqs;
155+
148156
seq_stat_index_offset += concurrency;
149157

150158
if (concurrency) {
@@ -171,7 +179,7 @@ ConcurrencyManager::ResumeSequenceWorkers()
171179
std::shared_ptr<IWorker>
172180
ConcurrencyManager::MakeWorker(
173181
std::shared_ptr<ThreadStat> thread_stat,
174-
std::shared_ptr<ConcurrencyWorker::ThreadConfig> thread_config)
182+
std::shared_ptr<ThreadConfig> thread_config)
175183
{
176184
uint32_t id = workers_.size();
177185

src/c++/perf_analyzer/concurrency_manager.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -89,14 +89,16 @@ class ConcurrencyManager : public LoadManager {
8989
/// Adjusts the number of concurrent requests to be the same as
9090
/// 'concurrent_request_count' (by creating or pausing threads)
9191
/// \param concurent_request_count The number of concurrent requests.
92+
/// \param request_count The number of requests to generate. If 0, then
93+
/// there is no limit, and it will generate until told to stop.
9294
/// \return cb::Error object indicating success or failure.
93-
cb::Error ChangeConcurrencyLevel(const size_t concurrent_request_count);
95+
cb::Error ChangeConcurrencyLevel(
96+
const size_t concurrent_request_count, const size_t request_count = 0);
9497

9598
protected:
9699
// Makes a new worker
97100
virtual std::shared_ptr<IWorker> MakeWorker(
98-
std::shared_ptr<ThreadStat>,
99-
std::shared_ptr<ConcurrencyWorker::ThreadConfig>);
101+
std::shared_ptr<ThreadStat>, std::shared_ptr<ThreadConfig>);
100102

101103
ConcurrencyManager(
102104
const bool async, const bool streaming, const int32_t batch_size,
@@ -114,7 +116,7 @@ class ConcurrencyManager : public LoadManager {
114116

115117
size_t max_concurrency_;
116118

117-
std::vector<std::shared_ptr<ConcurrencyWorker::ThreadConfig>> threads_config_;
119+
std::vector<std::shared_ptr<ThreadConfig>> threads_config_;
118120

119121
private:
120122
void InitManagerFinalize() override;
@@ -126,7 +128,7 @@ class ConcurrencyManager : public LoadManager {
126128
// Create new threads (if necessary), and then reconfigure all worker threads
127129
// to handle the new concurrent request count
128130
//
129-
void ReconfigThreads(size_t concurrent_request_count);
131+
void ReconfigThreads(size_t concurrent_request_count, size_t request_count);
130132

131133
// Restart all worker threads that were working on sequences
132134
//

src/c++/perf_analyzer/concurrency_worker.h

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -29,6 +29,7 @@
2929

3030
#include "load_worker.h"
3131
#include "sequence_manager.h"
32+
#include "thread_config.h"
3233

3334
namespace triton { namespace perfanalyzer {
3435

@@ -49,28 +50,6 @@ class NaggyMockConcurrencyWorker;
4950
///
5051
class ConcurrencyWorker : public LoadWorker {
5152
public:
52-
struct ThreadConfig {
53-
ThreadConfig(
54-
size_t thread_id, size_t concurrency = 0,
55-
size_t seq_stat_index_offset = 0)
56-
: thread_id_(thread_id), concurrency_(concurrency),
57-
seq_stat_index_offset_(seq_stat_index_offset), is_paused_(false)
58-
{
59-
}
60-
61-
// ID of corresponding worker thread
62-
size_t thread_id_;
63-
64-
// The concurrency level that the worker should produce
65-
size_t concurrency_;
66-
67-
// The starting sequence stat index for this worker
68-
size_t seq_stat_index_offset_;
69-
70-
// Whether or not the thread is issuing new inference requests
71-
bool is_paused_;
72-
};
73-
7453
ConcurrencyWorker(
7554
uint32_t id, std::shared_ptr<ThreadStat> thread_stat,
7655
std::shared_ptr<ThreadConfig> thread_config,
@@ -85,11 +64,11 @@ class ConcurrencyWorker : public LoadWorker {
8564
const std::shared_ptr<IInferDataManager>& infer_data_manager,
8665
std::shared_ptr<SequenceManager> sequence_manager)
8766
: LoadWorker(
88-
id, thread_stat, parser, data_loader, factory, on_sequence_model,
89-
async, streaming, batch_size, using_json_data, wake_signal,
90-
wake_mutex, execute, infer_data_manager, sequence_manager),
91-
thread_config_(thread_config), max_concurrency_(max_concurrency),
92-
active_threads_(active_threads)
67+
id, thread_stat, thread_config, parser, data_loader, factory,
68+
on_sequence_model, async, streaming, batch_size, using_json_data,
69+
wake_signal, wake_mutex, execute, infer_data_manager,
70+
sequence_manager),
71+
max_concurrency_(max_concurrency), active_threads_(active_threads)
9372
{
9473
}
9574

@@ -109,8 +88,6 @@ class ConcurrencyWorker : public LoadWorker {
10988
// threads?
11089
size_t& active_threads_;
11190

112-
std::shared_ptr<ThreadConfig> thread_config_;
113-
11491
// Handle the case where execute_ is false
11592
void HandleExecuteOff();
11693

src/c++/perf_analyzer/custom_load_manager.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -76,10 +76,10 @@ CustomLoadManager::CustomLoadManager(
7676
}
7777

7878
cb::Error
79-
CustomLoadManager::InitCustomIntervals()
79+
CustomLoadManager::InitCustomIntervals(const size_t request_count)
8080
{
8181
PauseWorkers();
82-
ConfigureThreads();
82+
ConfigureThreads(request_count);
8383
auto status = GenerateSchedule();
8484
ResumeWorkers();
8585
return status;

src/c++/perf_analyzer/custom_load_manager.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Redistribution and use in source and binary forms, with or without
44
// modification, are permitted provided that the following conditions
@@ -88,8 +88,10 @@ class CustomLoadManager : public RequestRateManager {
8888

8989
/// Initializes the load manager with the provided file containing request
9090
/// intervals
91+
/// \param request_count The number of requests to generate. If 0, then
92+
/// there is no limit, and it will generate until told to stop.
9193
/// \return cb::Error object indicating success or failure.
92-
cb::Error InitCustomIntervals();
94+
cb::Error InitCustomIntervals(const size_t request_count);
9395

9496
/// Computes the request rate from the time interval file. Fails with an error
9597
/// if the file is not present or is empty.

0 commit comments

Comments (0)