Skip to content

Commit ca3ad7d

Browse files
authored
Fix action server deadlock (ros2#1285) (ros2#1313)
* unlock action_server_reentrant_mutex_ before calling user callback functions add an additional lock to keep previous behavior broken by deadlock fix Also add a test case to reproduce deadlock situation in rclcpp_action Signed-off-by: Daisuke Sato <[email protected]>
1 parent 7ccd64c commit ca3ad7d

File tree

2 files changed

+183
-50
lines changed

2 files changed

+183
-50
lines changed

rclcpp_action/src/server.cpp

Lines changed: 94 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class ServerBaseImpl
4545
{
4646
}
4747

48-
// Lock everything except user callbacks
48+
// Lock for action_server_
4949
std::recursive_mutex action_server_reentrant_mutex_;
5050

5151
std::shared_ptr<rcl_action_server_t> action_server_;
@@ -58,10 +58,13 @@ class ServerBaseImpl
5858
size_t num_services_ = 0;
5959
size_t num_guard_conditions_ = 0;
6060

61-
bool goal_request_ready_ = false;
62-
bool cancel_request_ready_ = false;
63-
bool result_request_ready_ = false;
64-
bool goal_expired_ = false;
61+
std::atomic<bool> goal_request_ready_{false};
62+
std::atomic<bool> cancel_request_ready_{false};
63+
std::atomic<bool> result_request_ready_{false};
64+
std::atomic<bool> goal_expired_{false};
65+
66+
// Lock for unordered_maps
67+
std::recursive_mutex unordered_map_mutex_;
6568

6669
// Results to be kept until the goal expires after reaching a terminal state
6770
std::unordered_map<GoalUUID, std::shared_ptr<void>> goal_results_;
@@ -171,29 +174,41 @@ ServerBase::add_to_wait_set(rcl_wait_set_t * wait_set)
171174
bool
172175
ServerBase::is_ready(rcl_wait_set_t * wait_set)
173176
{
174-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
175-
rcl_ret_t ret = rcl_action_server_wait_set_get_entities_ready(
176-
wait_set,
177-
pimpl_->action_server_.get(),
178-
&pimpl_->goal_request_ready_,
179-
&pimpl_->cancel_request_ready_,
180-
&pimpl_->result_request_ready_,
181-
&pimpl_->goal_expired_);
177+
bool goal_request_ready;
178+
bool cancel_request_ready;
179+
bool result_request_ready;
180+
bool goal_expired;
181+
rcl_ret_t ret;
182+
{
183+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
184+
ret = rcl_action_server_wait_set_get_entities_ready(
185+
wait_set,
186+
pimpl_->action_server_.get(),
187+
&goal_request_ready,
188+
&cancel_request_ready,
189+
&result_request_ready,
190+
&goal_expired);
191+
}
192+
193+
pimpl_->goal_request_ready_ = goal_request_ready;
194+
pimpl_->cancel_request_ready_ = cancel_request_ready;
195+
pimpl_->result_request_ready_ = result_request_ready;
196+
pimpl_->goal_expired_ = goal_expired;
182197

183198
if (RCL_RET_OK != ret) {
184199
rclcpp::exceptions::throw_from_rcl_error(ret);
185200
}
186201

187-
return pimpl_->goal_request_ready_ ||
188-
pimpl_->cancel_request_ready_ ||
189-
pimpl_->result_request_ready_ ||
190-
pimpl_->goal_expired_;
202+
return pimpl_->goal_request_ready_.load() ||
203+
pimpl_->cancel_request_ready_.load() ||
204+
pimpl_->result_request_ready_.load() ||
205+
pimpl_->goal_expired_.load();
191206
}
192207

193208
std::shared_ptr<void>
194209
ServerBase::take_data()
195210
{
196-
if (pimpl_->goal_request_ready_) {
211+
if (pimpl_->goal_request_ready_.load()) {
197212
rcl_ret_t ret;
198213
rcl_action_goal_info_t goal_info = rcl_action_get_zero_initialized_goal_info();
199214
rmw_request_id_t request_header;
@@ -212,7 +227,7 @@ ServerBase::take_data()
212227
ret,
213228
goal_info,
214229
request_header, message));
215-
} else if (pimpl_->cancel_request_ready_) {
230+
} else if (pimpl_->cancel_request_ready_.load()) {
216231
rcl_ret_t ret;
217232
rmw_request_id_t request_header;
218233

@@ -229,7 +244,7 @@ ServerBase::take_data()
229244
std::make_shared
230245
<std::tuple<rcl_ret_t, std::shared_ptr<action_msgs::srv::CancelGoal::Request>,
231246
rmw_request_id_t>>(ret, request, request_header));
232-
} else if (pimpl_->result_request_ready_) {
247+
} else if (pimpl_->result_request_ready_.load()) {
233248
rcl_ret_t ret;
234249
// Get the result request message
235250
rmw_request_id_t request_header;
@@ -241,7 +256,7 @@ ServerBase::take_data()
241256
return std::static_pointer_cast<void>(
242257
std::make_shared<std::tuple<rcl_ret_t, std::shared_ptr<void>, rmw_request_id_t>>(
243258
ret, result_request, request_header));
244-
} else if (pimpl_->goal_expired_) {
259+
} else if (pimpl_->goal_expired_.load()) {
245260
return nullptr;
246261
} else {
247262
throw std::runtime_error("Taking data from action server but nothing is ready");
@@ -251,17 +266,17 @@ ServerBase::take_data()
251266
void
252267
ServerBase::execute(std::shared_ptr<void> & data)
253268
{
254-
if (!data && !pimpl_->goal_expired_) {
269+
if (!data && !pimpl_->goal_expired_.load()) {
255270
throw std::runtime_error("'data' is empty");
256271
}
257272

258-
if (pimpl_->goal_request_ready_) {
273+
if (pimpl_->goal_request_ready_.load()) {
259274
execute_goal_request_received(data);
260-
} else if (pimpl_->cancel_request_ready_) {
275+
} else if (pimpl_->cancel_request_ready_.load()) {
261276
execute_cancel_request_received(data);
262-
} else if (pimpl_->result_request_ready_) {
277+
} else if (pimpl_->result_request_ready_.load()) {
263278
execute_result_request_received(data);
264-
} else if (pimpl_->goal_expired_) {
279+
} else if (pimpl_->goal_expired_.load()) {
265280
execute_check_expired_goals();
266281
} else {
267282
throw std::runtime_error("Executing action server but nothing is ready");
@@ -286,21 +301,24 @@ ServerBase::execute_goal_request_received(std::shared_ptr<void> & data)
286301
rmw_request_id_t request_header = std::get<2>(*shared_ptr);
287302
std::shared_ptr<void> message = std::get<3>(*shared_ptr);
288303

289-
290-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
291-
292-
pimpl_->goal_request_ready_ = false;
304+
bool expected = true;
305+
if (!pimpl_->goal_request_ready_.compare_exchange_strong(expected, false)) {
306+
return;
307+
}
293308

294309
GoalUUID uuid = get_goal_id_from_goal_request(message.get());
295310
convert(uuid, &goal_info);
296311

297312
// Call user's callback, getting the user's response and a ros message to send back
298313
auto response_pair = call_handle_goal_callback(uuid, message);
299314

300-
ret = rcl_action_send_goal_response(
301-
pimpl_->action_server_.get(),
302-
&request_header,
303-
response_pair.second.get());
315+
{
316+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
317+
ret = rcl_action_send_goal_response(
318+
pimpl_->action_server_.get(),
319+
&request_header,
320+
response_pair.second.get());
321+
}
304322

305323
if (RCL_RET_OK != ret) {
306324
rclcpp::exceptions::throw_from_rcl_error(ret);
@@ -325,7 +343,10 @@ ServerBase::execute_goal_request_received(std::shared_ptr<void> & data)
325343
}
326344
};
327345
rcl_action_goal_handle_t * rcl_handle;
328-
rcl_handle = rcl_action_accept_new_goal(pimpl_->action_server_.get(), &goal_info);
346+
{
347+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
348+
rcl_handle = rcl_action_accept_new_goal(pimpl_->action_server_.get(), &goal_info);
349+
}
329350
if (!rcl_handle) {
330351
throw std::runtime_error("Failed to accept new goal\n");
331352
}
@@ -334,7 +355,10 @@ ServerBase::execute_goal_request_received(std::shared_ptr<void> & data)
334355
// Copy out goal handle since action server storage disappears when it is fini'd
335356
*handle = *rcl_handle;
336357

337-
pimpl_->goal_handles_[uuid] = handle;
358+
{
359+
std::lock_guard<std::recursive_mutex> lock(pimpl_->unordered_map_mutex_);
360+
pimpl_->goal_handles_[uuid] = handle;
361+
}
338362

339363
if (GoalResponse::ACCEPT_AND_EXECUTE == status) {
340364
// Change status to executing
@@ -359,7 +383,6 @@ ServerBase::execute_cancel_request_received(std::shared_ptr<void> & data)
359383
<std::tuple<rcl_ret_t, std::shared_ptr<action_msgs::srv::CancelGoal::Request>,
360384
rmw_request_id_t>>(data);
361385
auto ret = std::get<0>(*shared_ptr);
362-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
363386
if (RCL_RET_ACTION_SERVER_TAKE_FAILED == ret) {
364387
// Ignore take failure because connext fails if it receives a sample without valid data.
365388
// This happens when a client shuts down and connext receives a sample saying the client is
@@ -380,10 +403,14 @@ ServerBase::execute_cancel_request_received(std::shared_ptr<void> & data)
380403
// Get a list of goal info that should be attempted to be cancelled
381404
rcl_action_cancel_response_t cancel_response = rcl_action_get_zero_initialized_cancel_response();
382405

383-
ret = rcl_action_process_cancel_request(
384-
pimpl_->action_server_.get(),
385-
&cancel_request,
386-
&cancel_response);
406+
{
407+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
408+
ret = rcl_action_process_cancel_request(
409+
pimpl_->action_server_.get(),
410+
&cancel_request,
411+
&cancel_response);
412+
}
413+
387414
if (RCL_RET_OK != ret) {
388415
rclcpp::exceptions::throw_from_rcl_error(ret);
389416
}
@@ -426,8 +453,12 @@ ServerBase::execute_cancel_request_received(std::shared_ptr<void> & data)
426453
publish_status();
427454
}
428455

429-
ret = rcl_action_send_cancel_response(
430-
pimpl_->action_server_.get(), &request_header, response.get());
456+
{
457+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
458+
ret = rcl_action_send_cancel_response(
459+
pimpl_->action_server_.get(), &request_header, response.get());
460+
}
461+
431462
if (RCL_RET_OK != ret) {
432463
rclcpp::exceptions::throw_from_rcl_error(ret);
433464
}
@@ -440,7 +471,6 @@ ServerBase::execute_result_request_received(std::shared_ptr<void> & data)
440471
auto shared_ptr = std::static_pointer_cast
441472
<std::tuple<rcl_ret_t, std::shared_ptr<void>, rmw_request_id_t>>(data);
442473
auto ret = std::get<0>(*shared_ptr);
443-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
444474
if (RCL_RET_ACTION_SERVER_TAKE_FAILED == ret) {
445475
// Ignore take failure because connext fails if it receives a sample without valid data.
446476
// This happens when a client shuts down and connext receives a sample saying the client is
@@ -460,7 +490,10 @@ ServerBase::execute_result_request_received(std::shared_ptr<void> & data)
460490
rcl_action_goal_info_t goal_info;
461491
convert(uuid, &goal_info);
462492
bool goal_exists;
463-
goal_exists = rcl_action_server_goal_exists(pimpl_->action_server_.get(), &goal_info);
493+
{
494+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
495+
goal_exists = rcl_action_server_goal_exists(pimpl_->action_server_.get(), &goal_info);
496+
}
464497
if (!goal_exists) {
465498
// Goal does not exists
466499
result_response = create_result_response(action_msgs::msg::GoalStatus::STATUS_UNKNOWN);
@@ -474,13 +507,15 @@ ServerBase::execute_result_request_received(std::shared_ptr<void> & data)
474507

475508
if (result_response) {
476509
// Send the result now
510+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
477511
rcl_ret_t rcl_ret = rcl_action_send_result_response(
478512
pimpl_->action_server_.get(), &request_header, result_response.get());
479513
if (RCL_RET_OK != rcl_ret) {
480514
rclcpp::exceptions::throw_from_rcl_error(rcl_ret);
481515
}
482516
} else {
483517
// Store the request so it can be responded to later
518+
std::lock_guard<std::recursive_mutex> lock(pimpl_->unordered_map_mutex_);
484519
pimpl_->result_requests_[uuid].push_back(request_header);
485520
}
486521
data.reset();
@@ -495,16 +530,19 @@ ServerBase::execute_check_expired_goals()
495530

496531
// Loop in case more than 1 goal expired
497532
while (num_expired > 0u) {
498-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
499533
rcl_ret_t ret;
500-
ret = rcl_action_expire_goals(pimpl_->action_server_.get(), expired_goals, 1, &num_expired);
534+
{
535+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
536+
ret = rcl_action_expire_goals(pimpl_->action_server_.get(), expired_goals, 1, &num_expired);
537+
}
501538
if (RCL_RET_OK != ret) {
502539
rclcpp::exceptions::throw_from_rcl_error(ret);
503540
} else if (num_expired) {
504541
// A goal expired!
505542
GoalUUID uuid;
506543
convert(expired_goals[0], &uuid);
507544
RCLCPP_DEBUG(pimpl_->logger_, "Expired goal %s", to_string(uuid).c_str());
545+
std::lock_guard<std::recursive_mutex> lock(pimpl_->unordered_map_mutex_);
508546
pimpl_->goal_results_.erase(uuid);
509547
pimpl_->result_requests_.erase(uuid);
510548
pimpl_->goal_handles_.erase(uuid);
@@ -577,20 +615,26 @@ ServerBase::publish_result(const GoalUUID & uuid, std::shared_ptr<void> result_m
577615
// Check that the goal exists
578616
rcl_action_goal_info_t goal_info;
579617
convert(uuid, &goal_info);
580-
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
581618
bool goal_exists;
582-
goal_exists = rcl_action_server_goal_exists(pimpl_->action_server_.get(), &goal_info);
619+
{
620+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
621+
goal_exists = rcl_action_server_goal_exists(pimpl_->action_server_.get(), &goal_info);
622+
}
583623

584624
if (!goal_exists) {
585625
throw std::runtime_error("Asked to publish result for goal that does not exist");
586626
}
587627

588-
pimpl_->goal_results_[uuid] = result_msg;
628+
{
629+
std::lock_guard<std::recursive_mutex> lock(pimpl_->unordered_map_mutex_);
630+
pimpl_->goal_results_[uuid] = result_msg;
631+
}
589632

590633
// if there are clients who already asked for the result, send it to them
591634
auto iter = pimpl_->result_requests_.find(uuid);
592635
if (iter != pimpl_->result_requests_.end()) {
593636
for (auto & request_header : iter->second) {
637+
std::lock_guard<std::recursive_mutex> lock(pimpl_->action_server_reentrant_mutex_);
594638
rcl_ret_t ret = rcl_action_send_result_response(
595639
pimpl_->action_server_.get(), &request_header, result_msg.get());
596640
if (RCL_RET_OK != ret) {

0 commit comments

Comments
 (0)