Skip to content

Commit d135dc2

Browse files
authored
feat(framework): Add scaling thread pool for the message handler (EVerest#1816)
A thread pool for handling the operation worker was added so that commands and vars don't accumulate in the queue and cause delays which can lead to timeouts in certain parts of the code. Signed-off-by: Piet Gömpel <pietgoempel@gmail.com> Signed-off-by: Martin Litre <mnlitre@gmail.com>
1 parent faa7d89 commit d135dc2

File tree

7 files changed

+994
-11
lines changed

7 files changed

+994
-11
lines changed

lib/everest/framework/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ cc_library(
5050
"@rapidyaml//:rapidyaml",
5151
"//lib/everest/log:liblog",
5252
"//lib/everest/sqlite:everest-sqlite",
53+
"//lib/everest/util:util",
5354
"@com_github_fmtlib_fmt//:fmt",
5455
"@com_github_nlohmann_json//:json",
5556
"@com_github_pboettch_json-schema-validator//:json-schema-validator",

lib/everest/framework/include/utils/message_handler.hpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55

66
#include <atomic>
77
#include <condition_variable>
8-
#include <functional>
98
#include <map>
109
#include <memory>
1110
#include <mutex>
1211
#include <queue>
1312
#include <string>
1413
#include <thread>
14+
#include <unordered_map>
15+
#include <unordered_set>
1516
#include <vector>
1617

18+
#include <everest/util/async/thread_pool_scaling.hpp>
1719
#include <utils/message_queue.hpp>
1820
#include <utils/types.hpp>
1921

@@ -22,6 +24,10 @@ using CmdId = std::string;
2224

2325
namespace Everest {
2426

27+
constexpr int THREAD_POOL_SCALING_LATENCY_THRESHOLD_MS = 50;
28+
constexpr int THREAD_POOL_SCALING_LATENCY_THREAD_IDLE_TIMEOUT_S = 2;
29+
constexpr int THREAD_POOL_SCALING_MIN_THREAD_COUNT = 1;
30+
2531
/// \brief Handles message dispatching and thread-safe queuing of different message types. This class uses two separate
2632
/// threads and message queues: one for operation messages (vars, cmds, errors, GetConfig, ModuleReady) and one for
2733
/// result messages (cmd results, GetConfig responses).
@@ -40,10 +46,14 @@ class MessageHandler {
4046
void register_handler(const std::string& topic, std::shared_ptr<TypedHandler> handler);
4147

4248
private:
43-
void run_operation_message_worker();
49+
void run_operation_dispatcher();
4450
void run_result_message_worker();
4551
void run_external_mqtt_worker();
4652

53+
void dispatch_operation_message(ParsedMessage&& message);
54+
void schedule_operation_message(ParsedMessage&& message);
55+
void on_operation_message_done(const std::string& topic);
56+
4757
void handle_operation_message(const std::string& topic, const json& payload);
4858
void handle_result_message(const std::string& topic, const json& payload);
4959

@@ -69,20 +79,30 @@ class MessageHandler {
6979
void execute_single_handler(HandlerMap& handlers, const std::string& topic, ExecuteFn execute_fn);
7080

7181
// Threads
72-
std::thread
73-
operation_worker_thread; // processes vars, commands, external MQTT, errors, GetConfig and ModuleReady messages
82+
std::thread operation_dispatcher_thread; // processes vars, commands, external MQTT, errors, GetConfig and
83+
// ModuleReady messages
7484
std::thread result_worker_thread; // processes cmd results and GetConfig responses
7585
std::thread external_mqtt_worker_thread; // processes external MQTT messages
7686
std::thread ready_thread; // runs the modules ready function
7787

7888
// Queues and sync primitives
89+
std::unique_ptr<everest::lib::util::thread_pool_scaling<
90+
everest::lib::util::LatencyScaling<THREAD_POOL_SCALING_LATENCY_THRESHOLD_MS>>>
91+
operation_thread_pool{std::make_unique<everest::lib::util::thread_pool_scaling<
92+
everest::lib::util::LatencyScaling<THREAD_POOL_SCALING_LATENCY_THRESHOLD_MS>>>(
93+
THREAD_POOL_SCALING_MIN_THREAD_COUNT, std::thread::hardware_concurrency(),
94+
std::chrono::seconds(THREAD_POOL_SCALING_LATENCY_THREAD_IDLE_TIMEOUT_S))};
7995
std::queue<ParsedMessage> operation_message_queue;
8096
std::queue<ParsedMessage> result_message_queue;
8197
std::queue<ParsedMessage> external_mqtt_message_queue;
8298

8399
std::mutex operation_queue_mutex;
84100
std::condition_variable operation_cv;
85101

102+
std::mutex operation_topic_state_mutex;
103+
std::unordered_set<std::string> operation_topics_in_flight;
104+
std::unordered_map<std::string, std::queue<ParsedMessage>> pending_operation_messages_by_topic;
105+
86106
std::mutex result_queue_mutex;
87107
std::condition_variable result_cv;
88108

lib/everest/framework/lib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ target_link_libraries(framework
7777

7878
everest::log
7979
everest::sqlite
80+
everest::util
8081
${STD_FILESYSTEM_COMPAT_LIB}
8182
PRIVATE
8283
${EVEREST_FRAMEWORK_BOOST_SYSTEM_LINK_LIBRARY}

lib/everest/framework/lib/message_handler.cpp

Lines changed: 76 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#include <everest/logging.hpp>
77
#include <fmt/format.h>
88

9+
#include <optional>
10+
911
namespace Everest {
1012

1113
namespace {
@@ -62,7 +64,7 @@ bool check_topic_matches(const std::string& full_topic, const std::string& wildc
6264
} // namespace
6365

6466
MessageHandler::MessageHandler() {
65-
operation_worker_thread = std::thread([this] { run_operation_message_worker(); });
67+
operation_dispatcher_thread = std::thread([this] { run_operation_dispatcher(); });
6668
result_worker_thread = std::thread([this] { run_result_message_worker(); });
6769
external_mqtt_worker_thread = std::thread([this] { run_external_mqtt_worker(); });
6870
}
@@ -120,9 +122,13 @@ void MessageHandler::stop() {
120122
result_cv.notify_all();
121123
external_mqtt_cv.notify_all();
122124

123-
if (operation_worker_thread.joinable()) {
124-
operation_worker_thread.join();
125+
// Join the dispatcher first: it must not be able to call schedule_operation_message()
126+
// (which dereferences operation_thread_pool) after the pool is destroyed.
127+
if (operation_dispatcher_thread.joinable()) {
128+
operation_dispatcher_thread.join();
125129
}
130+
// The thread_pool destructor handles stopping and joining its workers.
131+
operation_thread_pool.reset();
126132
if (result_worker_thread.joinable()) {
127133
result_worker_thread.join();
128134
}
@@ -134,7 +140,7 @@ void MessageHandler::stop() {
134140
}
135141
}
136142

137-
void MessageHandler::run_operation_message_worker() {
143+
void MessageHandler::run_operation_dispatcher() {
138144
while (true) {
139145
std::unique_lock<std::mutex> lock(operation_queue_mutex);
140146
operation_cv.wait(lock, [this] { return !operation_message_queue.empty() || !running; });
@@ -145,9 +151,73 @@ void MessageHandler::run_operation_message_worker() {
145151
operation_message_queue.pop();
146152
lock.unlock();
147153

148-
handle_operation_message(message.topic, message.data);
154+
dispatch_operation_message(std::move(message));
155+
}
156+
EVLOG_info << "Operation dispatcher thread stopped";
157+
}
158+
159+
void MessageHandler::dispatch_operation_message(ParsedMessage&& message) {
160+
{
161+
std::lock_guard<std::mutex> lock(operation_topic_state_mutex);
162+
if (operation_topics_in_flight.find(message.topic) != operation_topics_in_flight.end()) {
163+
pending_operation_messages_by_topic[message.topic].push(std::move(message));
164+
return;
165+
}
166+
167+
operation_topics_in_flight.insert(message.topic);
168+
}
169+
170+
schedule_operation_message(std::move(message));
171+
}
172+
173+
// NOLINTNEXTLINE(misc-no-recursion)
174+
void MessageHandler::schedule_operation_message(ParsedMessage&& message) {
175+
// NOLINTNEXTLINE(misc-no-recursion)
176+
auto operation = [this, message = std::move(message)]() {
177+
// Wrap in try-catch so that on_operation_message_done is always called: an exception in
178+
// the handler must not leave the topic permanently stuck in operation_topics_in_flight,
179+
// which would block all subsequent messages for that topic.
180+
try {
181+
handle_operation_message(message.topic, message.data);
182+
} catch (const std::exception& e) {
183+
EVLOG_error << "Exception while handling operation message on topic '" << message.topic
184+
<< "': " << e.what();
185+
} catch (...) {
186+
EVLOG_error << "Unknown exception while handling operation message on topic '" << message.topic << "'";
187+
}
188+
on_operation_message_done(message.topic);
189+
};
190+
191+
if (operation_thread_pool) {
192+
operation_thread_pool->run(std::move(operation));
193+
}
194+
}
195+
196+
// NOLINTNEXTLINE(misc-no-recursion)
197+
void MessageHandler::on_operation_message_done(const std::string& topic) {
198+
std::optional<ParsedMessage> next_message;
199+
{
200+
std::lock_guard<std::mutex> lock(operation_topic_state_mutex);
201+
if (!running) {
202+
// Shutting down: stop scheduling and release the in-flight slot.
203+
operation_topics_in_flight.erase(topic);
204+
return;
205+
}
206+
auto pending_it = pending_operation_messages_by_topic.find(topic);
207+
if (pending_it != pending_operation_messages_by_topic.end() && !pending_it->second.empty()) {
208+
next_message = std::move(pending_it->second.front());
209+
pending_it->second.pop();
210+
211+
if (pending_it->second.empty()) {
212+
pending_operation_messages_by_topic.erase(pending_it);
213+
}
214+
} else {
215+
operation_topics_in_flight.erase(topic);
216+
return;
217+
}
149218
}
150-
EVLOG_info << "Main worker thread stopped";
219+
220+
schedule_operation_message(std::move(*next_message));
151221
}
152222

153223
void MessageHandler::run_result_message_worker() {

lib/everest/framework/schemas/config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,5 +231,5 @@ properties:
231231
default: {}
232232
# don't allow arbitrary additional properties
233233
additionalProperties: false
234-
234+
235235
x-module-layout: {}

lib/everest/framework/tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ target_sources(${TEST_TARGET_NAME} PRIVATE
1616
test_config_sqlite.cpp
1717
test_conversions.cpp
1818
test_filesystem_helpers.cpp
19+
test_message_handler.cpp
1920
helpers.cpp
2021
)
2122

0 commit comments

Comments
 (0)