Skip to content

Commit 16bc99b

Browse files
authored
fix(thread_pool_scaling): Exception handling and edge case (EVerest#1903)
This commit fixes a potential (highly unlikely dealock), as well as a missing try catch. Signed-off-by: Martin Litre <mnlitre@gmail.com>
1 parent d135dc2 commit 16bc99b

File tree

2 files changed

+304
-48
lines changed

2 files changed

+304
-48
lines changed

lib/everest/util/include/everest/util/async/thread_pool_scaling.hpp

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,14 @@ template <typename ScalingPolicy = LatencyScaling<10>> class thread_pool_scaling
154154
}
155155
m_action_queue.stop();
156156

157-
// 2. Steal the active workers list
157+
// 2. Steal the active workers list. Explicitly clear the source so that any
158+
// worker that acquires the lock afterwards sees size()==0 and cannot
159+
// voluntarily retire into the zombies deque after step 4's final reap.
158160
std::list<std::thread> workers_to_join;
159161
{
160162
auto reg_h = m_reg.handle();
161163
workers_to_join = std::move(reg_h->workers);
164+
reg_h->workers.clear();
162165
}
163166

164167
// 3. Join everything in our stolen list
@@ -168,9 +171,18 @@ template <typename ScalingPolicy = LatencyScaling<10>> class thread_pool_scaling
168171
}
169172
}
170173

171-
// 4. Join and clear any zombies that retired before the move
172-
auto reg_h = m_reg.handle();
173-
reap_zombies_internal(reg_h);
174+
// 4. Join any zombies that retired before the steal. Steal the deque first
175+
// so the join happens outside the lock (same pattern as the worker loop).
176+
std::deque<std::thread> zombies_to_join;
177+
{
178+
auto reg_h = m_reg.handle();
179+
zombies_to_join = std::move(reg_h->zombies);
180+
}
181+
for (auto& t : zombies_to_join) {
182+
if (t.joinable()) {
183+
t.join();
184+
}
185+
}
174186
}
175187

176188
/**
@@ -250,10 +262,29 @@ template <typename ScalingPolicy = LatencyScaling<10>> class thread_pool_scaling
250262
auto task_opt = m_action_queue.try_pop(m_idle_timeout);
251263

252264
if (task_opt) {
253-
task_opt->func();
254-
auto reg_h = m_reg.handle();
255-
if (!reg_h->zombies.empty()) {
256-
reap_zombies_internal(reg_h);
265+
try {
266+
task_opt->func();
267+
} catch (...) {
268+
// Suppress exception to prevent thread termination.
269+
// Fire-and-forget tasks are responsible for their own error handling.
270+
}
271+
// Steal the zombie deque under the lock, then join outside it.
272+
// Joining while holding the lock is safe in practice (the zombie has already
273+
// released the lock before it can appear in the deque), but it blocks the
274+
// registry mutex for the duration of the join — delaying scaling decisions
275+
// and the destructor. Stealing first bounds the critical section to a cheap
276+
// list move.
277+
std::deque<std::thread> zombies_to_join;
278+
{
279+
auto reg_h = m_reg.handle();
280+
if (!reg_h->zombies.empty()) {
281+
zombies_to_join = std::move(reg_h->zombies);
282+
}
283+
}
284+
for (auto& t : zombies_to_join) {
285+
if (t.joinable()) {
286+
t.join();
287+
}
257288
}
258289
} else {
259290
auto reg_h = m_reg.handle();
@@ -279,19 +310,6 @@ template <typename ScalingPolicy = LatencyScaling<10>> class thread_pool_scaling
279310
});
280311
}
281312

282-
/**
283-
* @brief Joins and clears retired threads.
284-
* @param[in] reg_h Handle to the monitor-protected registry data.
285-
*/
286-
void reap_zombies_internal(handle& reg_h) {
287-
while (!reg_h->zombies.empty()) {
288-
if (reg_h->zombies.front().joinable()) {
289-
reg_h->zombies.front().join();
290-
}
291-
reg_h->zombies.pop_front();
292-
}
293-
}
294-
295313
const std::size_t m_min_threads; ///< Minimum persistent thread count.
296314
const std::size_t m_max_threads; ///< Maximum allowed thread count.
297315
const std::chrono::milliseconds m_idle_timeout; ///< Surplus thread idle timeout.

0 commit comments

Comments
 (0)