Skip to content

Commit 1e881c6

Browse files
mergify[bot]juanlofer-eprosimacferreiragonz
authored
Solve Discovery Server race conditions (#5780) (#5807)
* Solve Discovery Server race conditions (#5780) * Refs #23088: Test reconnection when removing participant Signed-off-by: cferreiragonz <[email protected]> * Refs #23088: Solve EDP-PDP queues race condition Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Solve data UP + data P race condition Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Abort writer/reader processing if associated participant not alive Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Apply suggestions Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Release change when writer/reader insertion in DB failed Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Match servers after change update Signed-off-by: Juan Lopez Fernandez <[email protected]> --------- Signed-off-by: cferreiragonz <[email protected]> Signed-off-by: Juan Lopez Fernandez <[email protected]> Co-authored-by: cferreiragonz <[email protected]> (cherry picked from commit ec666f7) # Conflicts: # src/cpp/rtps/builtin/discovery/database/DiscoveryDataBase.cpp # test/blackbox/common/BlackboxTestsDiscovery.cpp * Add core types for testing Signed-off-by: cferreiragonz <[email protected]> * Fix conflicts test Signed-off-by: cferreiragonz <[email protected]> * Fix conflicts src Signed-off-by: cferreiragonz <[email protected]> * Compatibility with Fast CDR v1 Signed-off-by: cferreiragonz <[email protected]> --------- Signed-off-by: cferreiragonz <[email protected]> Co-authored-by: juanlofer-eprosima <[email protected]> Co-authored-by: cferreiragonz <[email protected]>
1 parent f811631 commit 1e881c6

16 files changed

+23591
-40
lines changed

src/cpp/rtps/builtin/discovery/database/DiscoveryDataBase.cpp

Lines changed: 104 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -481,9 +481,6 @@ void DiscoveryDataBase::process_pdp_data_queue()
481481
// Lock(exclusive mode) mutex locally
482482
std::lock_guard<std::recursive_mutex> guard(mutex_);
483483

484-
// Swap DATA queues
485-
pdp_data_queue_.Swap();
486-
487484
// Process all messages in the queque
488485
while (!pdp_data_queue_.Empty())
489486
{
@@ -521,9 +518,6 @@ bool DiscoveryDataBase::process_edp_data_queue()
521518
// Lock(exclusive mode) mutex locally
522519
std::lock_guard<std::recursive_mutex> guard(mutex_);
523520

524-
// Swap DATA queues
525-
edp_data_queue_.Swap();
526-
527521
eprosima::fastrtps::rtps::CacheChange_t* change;
528522
std::string topic_name;
529523

@@ -730,32 +724,71 @@ void DiscoveryDataBase::update_participant_from_change_(
730724
{
731725
fastrtps::rtps::GUID_t change_guid = guid_from_change(ch);
732726

727+
assert(ch->kind == eprosima::fastrtps::rtps::ALIVE);
728+
729+
// If the change corresponds to a previously removed participant (which hasn't yet been removed from the map since
730+
// the DATA(Up) is still unacked), update map with new data and behave as if it was a new participant.
731+
// Remove also the old change from the disposals collection, if it was added just before
732+
if (participant_info.change()->kind != eprosima::fastrtps::rtps::ALIVE)
733+
{
734+
// Update the change data
735+
participant_info.participant_change_data(change_data);
736+
737+
// Remove old change from disposals if it was added just before to avoid sending data UP
738+
auto it = std::find(disposals_.begin(), disposals_.end(), participant_info.change());
739+
if (it != disposals_.end())
740+
{
741+
disposals_.erase(it);
742+
}
743+
744+
// Update change. This should add the UNALIVE change to changes_to_release_, which should later both remove the
745+
// change from the writer's history and release the change
746+
update_change_and_unmatch_(ch, participant_info);
747+
748+
// If it is local and server we have to create virtual endpoints, except for our own server
749+
if (change_guid.guidPrefix != server_guid_prefix_ && !change_data.is_client() && change_data.is_local())
750+
{
751+
// Match new server and create virtual endpoints
752+
// NOTE: match after having updated the change, so virtual endpoints are not discarded for having
753+
// an associated unalive participant
754+
match_new_server_(change_guid.guidPrefix);
755+
}
756+
757+
// Treat as a new participant found
758+
new_updates_++;
759+
if (change_guid.guidPrefix != server_guid_prefix_)
760+
{
761+
server_acked_by_all(false);
762+
}
763+
}
764+
733765
// Specific case when a Data(P) from an entity A known as remote comes from the very entity A (we have
734766
// the Data(P) because of other server B, but now it arrives from A itself)
735767
// The entity A changes to local
736768
// Must be local data, or else it is a remote endpoint and should not be changed
737-
if (change_guid.guidPrefix != server_guid_prefix_ && change_data.is_local() &&
769+
else if (change_guid.guidPrefix != server_guid_prefix_ && change_data.is_local() &&
738770
DiscoveryDataBase::participant_data_has_changed_(participant_info, change_data))
739771
{
772+
// Update the change data
773+
participant_info.participant_change_data(change_data);
774+
775+
// Update change
776+
update_change_and_unmatch_(ch, participant_info);
777+
740778
// If the participant changes to server local, virtual endpoints must be added
741779
// If it is local and server the only possibility is it was a remote server and it must be converted to local
742780
if (!change_data.is_client())
743781
{
782+
// NOTE: match after having updated the change in order to send the new Data(P)
744783
match_new_server_(change_guid.guidPrefix);
745784
}
746785

747-
// Update the change data
748-
participant_info.participant_change_data(change_data);
749-
750-
// Update change
751-
update_change_and_unmatch_(ch, participant_info);
752-
753786
// Treat as a new participant found
754787
new_updates_++;
755788
server_acked_by_all(false);
756789

757790
// It is possible that this Data(P) is in our history if it has not been acked by all
758-
// In this case we have to resent it with the new update
791+
// In this case we have to resend it with the new update
759792
if (!participant_info.is_acked_by_all())
760793
{
761794
add_pdp_to_send_(ch);
@@ -858,6 +891,29 @@ void DiscoveryDataBase::create_writers_from_change_(
858891
// The writer was NOT known by the database
859892
else
860893
{
894+
// Check if corresponding participant is known, abort otherwise
895+
// NOTE: Processing a DATA(w) should always be preceded by the reception and processing of its corresponding
896+
// participant. However, one may receive a DATA(w) just after the participant has been removed, case in which the
897+
// former should no longer be processed.
898+
std::map<eprosima::fastrtps::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator writer_part_it =
899+
participants_.find(writer_guid.guidPrefix);
900+
if (writer_part_it == participants_.end())
901+
{
902+
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
903+
"Writer " << writer_guid << " has no associated participant. Skipping");
904+
assert(topic_name != virtual_topic_);
905+
changes_to_release_.push_back(ch); // Release change so it can be reused
906+
return;
907+
}
908+
else if (writer_part_it->second.change()->kind != fastrtps::rtps::ChangeKind_t::ALIVE)
909+
{
910+
EPROSIMA_LOG_WARNING(DISCOVERY_DATABASE,
911+
"Writer " << writer_guid << " is associated to a removed participant. Skipping");
912+
assert(topic_name != virtual_topic_);
913+
changes_to_release_.push_back(ch); // Release change so it can be reused
914+
return;
915+
}
916+
861917
// Add entry to writers_
862918
DiscoveryEndpointInfo tmp_writer(
863919
ch,
@@ -878,18 +934,7 @@ void DiscoveryDataBase::create_writers_from_change_(
878934
new_updates_++;
879935

880936
// Add entry to participants_[guid_prefix]::writers
881-
std::map<eprosima::fastrtps::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator writer_part_it =
882-
participants_.find(writer_guid.guidPrefix);
883-
if (writer_part_it != participants_.end())
884-
{
885-
writer_part_it->second.add_writer(writer_guid);
886-
}
887-
else
888-
{
889-
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
890-
"Writer " << writer_guid << " has no associated participant. Skipping");
891-
return;
892-
}
937+
writer_part_it->second.add_writer(writer_guid);
893938

894939
// Add writer to writers_by_topic_[topic_name]
895940
add_writer_to_topic_(writer_guid, topic_name);
@@ -976,6 +1021,29 @@ void DiscoveryDataBase::create_readers_from_change_(
9761021
// The reader was NOT known by the database
9771022
else
9781023
{
1024+
// Check if corresponding participant is known, abort otherwise
1025+
// NOTE: Processing a DATA(r) should always be preceded by the reception and processing of its corresponding
1026+
// participant. However, one may receive a DATA(r) just after the participant has been removed, case in which the
1027+
// former should no longer be processed.
1028+
std::map<eprosima::fastrtps::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator reader_part_it =
1029+
participants_.find(reader_guid.guidPrefix);
1030+
if (reader_part_it == participants_.end())
1031+
{
1032+
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
1033+
"Reader " << reader_guid << " has no associated participant. Skipping");
1034+
assert(topic_name != virtual_topic_);
1035+
changes_to_release_.push_back(ch); // Release change so it can be reused
1036+
return;
1037+
}
1038+
else if (reader_part_it->second.change()->kind != fastrtps::rtps::ChangeKind_t::ALIVE)
1039+
{
1040+
EPROSIMA_LOG_WARNING(DISCOVERY_DATABASE,
1041+
"Reader " << reader_guid << " is associated to a removed participant. Skipping");
1042+
assert(topic_name != virtual_topic_);
1043+
changes_to_release_.push_back(ch); // Release change so it can be reused
1044+
return;
1045+
}
1046+
9791047
// Add entry to readers_
9801048
DiscoveryEndpointInfo tmp_reader(
9811049
ch,
@@ -996,18 +1064,7 @@ void DiscoveryDataBase::create_readers_from_change_(
9961064
new_updates_++;
9971065

9981066
// Add entry to participants_[guid_prefix]::readers
999-
std::map<eprosima::fastrtps::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator reader_part_it =
1000-
participants_.find(reader_guid.guidPrefix);
1001-
if (reader_part_it != participants_.end())
1002-
{
1003-
reader_part_it->second.add_reader(reader_guid);
1004-
}
1005-
else
1006-
{
1007-
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
1008-
"Reader " << reader_guid << " has no associated participant. Skipping");
1009-
return;
1010-
}
1067+
reader_part_it->second.add_reader(reader_guid);
10111068

10121069
// Add reader to readers_by_topic_[topic_name]
10131070
add_reader_to_topic_(reader_guid, topic_name);
@@ -1287,7 +1344,7 @@ void DiscoveryDataBase::process_dispose_participant_(
12871344
delete_reader_entity_(reader_guid);
12881345
}
12891346

1290-
// All participant endoints must be already unmatched in others endopoints relevant_ack maps
1347+
// All participant endpoints must be already unmatched in others endpoints relevant_ack maps
12911348

12921349
// Unmatch own participant
12931350
unmatch_participant_(participant_guid.guidPrefix);
@@ -1558,6 +1615,14 @@ bool DiscoveryDataBase::data_queue_empty()
15581615
return (pdp_data_queue_.BothEmpty() && edp_data_queue_.BothEmpty());
15591616
}
15601617

1618+
void DiscoveryDataBase::swap_data_queues()
1619+
{
1620+
// Swap EDP before PDP to avoid race condition in which both data P and w/r are received at the same time,
1621+
// just after having swapped the PDP queue
1622+
edp_data_queue_.Swap();
1623+
pdp_data_queue_.Swap();
1624+
}
1625+
15611626
bool DiscoveryDataBase::is_participant(
15621627
const eprosima::fastrtps::rtps::GUID_t& guid)
15631628
{

src/cpp/rtps/builtin/discovery/database/DiscoveryDataBase.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,9 @@ class DiscoveryDataBase
300300
// Check if the data queue is empty
301301
bool data_queue_empty();
302302

303+
// Swap both EDP and PDP data queues
304+
void swap_data_queues();
305+
303306
void to_json(
304307
nlohmann::json& j) const;
305308

src/cpp/rtps/builtin/discovery/participant/PDPServer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,9 @@ bool PDPServer::remove_remote_participant(
10751075
bool PDPServer::process_data_queues()
10761076
{
10771077
EPROSIMA_LOG_INFO(RTPS_PDP_SERVER, "process_data_queues start");
1078+
// Swap both as a first step in order to avoid the following race condition: reception of data w/r while processing
1079+
// the PDP queue, not having processed yet the corresponding data P (also received while processing the queue).
1080+
discovery_db_.swap_data_queues();
10781081
discovery_db_.process_pdp_data_queue();
10791082
return discovery_db_.process_edp_data_queue();
10801083
}

test/blackbox/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ gtest_discover_tests(BlackboxTests_RTPS
116116

117117
file(GLOB BLACKBOXTESTS_TEST_SOURCE "common/BlackboxTests*.cpp")
118118
set(BLACKBOXTESTS_SOURCE ${BLACKBOXTESTS_TEST_SOURCE}
119+
types/core/core_types.cxx
120+
types/core/core_typesPubSubTypes.cxx
121+
types/core/core_typesv1.cxx
119122
types/Data1mb.cxx
120123
types/Data1mbPubSubTypes.cxx
121124
types/Data1mbv1.cxx

0 commit comments

Comments
 (0)