Skip to content

Commit 7607cff

Browse files
committed
crimson/osd/pg: set log_entry_update_waiting_on prior to sending requests
Before this patch, we would first send the MOSDPGUpdateLogMissing to all peers and only then insert this rep_tid to log_entry_update_waiting_on. This could have resulted in race where we receive the reply prior to actually inserting the rep_tid. The reply would have been discarded with "reply on unknown tid" (which is now aborting). The unhandled reply would have not let submit_error to return and would keep holding the lock on this obc. Fixes: https://tracker.ceph.com/issues/71204 Signed-off-by: Matan Breizman <[email protected]> (cherry picked from commit a2121eb)
1 parent 4e89b99 commit 7607cff

File tree

1 file changed

+23
-9
lines changed

1 file changed

+23
-9
lines changed

src/crimson/osd/pg.cc

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1093,8 +1093,26 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
10931093
log_entries, t, peering_state.get_pg_trim_to(),
10941094
peering_state.get_pg_committed_to());
10951095

1096-
10971096
set<pg_shard_t> waiting_on;
1097+
1098+
waiting_on.insert(pg_whoami);
1099+
1100+
// preapre log_entry_update_waiting_on prior to sending requests
1101+
for (const auto &peer: get_acting_recovery_backfill()) {
1102+
if (peer == pg_whoami) {
1103+
continue;
1104+
}
1105+
ceph_assert(peering_state.get_peer_missing().count(peer));
1106+
ceph_assert(peering_state.has_peer_info(peer));
1107+
waiting_on.insert(peer);
1108+
}
1109+
1110+
DEBUGDPP("inserting rep_tid {} waiting on {}", *this, rep_tid, waiting_on);
1111+
log_entry_update_waiting_on.insert(
1112+
std::make_pair(rep_tid,
1113+
log_update_t{std::move(waiting_on)}));
1114+
1115+
// Send missing_requests to peers
10981116
for (const auto &peer: get_acting_recovery_backfill()) {
10991117
if (peer == pg_whoami) {
11001118
continue;
@@ -1110,7 +1128,6 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
11101128
rep_tid,
11111129
peering_state.get_pg_trim_to(),
11121130
peering_state.get_pg_committed_to());
1113-
waiting_on.insert(peer);
11141131

11151132
DEBUGDPP("sending log missing_request (rep_tid: {} entries: {}) to osd {}",
11161133
*this, rep_tid, log_entries, peer.osd);
@@ -1120,11 +1137,7 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
11201137
std::move(log_m),
11211138
get_osdmap_epoch()));
11221139
}
1123-
waiting_on.insert(pg_whoami);
1124-
DEBUGDPP("inserting rep_tid {}", *this, rep_tid);
1125-
log_entry_update_waiting_on.insert(
1126-
std::make_pair(rep_tid,
1127-
log_update_t{std::move(waiting_on)}));
1140+
11281141
co_await interruptor::make_interruptible(
11291142
shard_services.get_store().do_transaction(
11301143
get_collection_ref(), std::move(t)
@@ -1471,8 +1484,9 @@ PG::interruptible_future<> PG::do_update_log_missing_reply(
14711484
log_entry_update_waiting_on.erase(it);
14721485
}
14731486
} else {
1474-
logger().error("{} : {} got reply {} on unknown tid {}",
1475-
__func__, peering_state.get_info().pgid, *m, m->get_tid());
1487+
ceph_abort_msg(fmt::format(
1488+
"{} : {} got reply {} on unknown tid {}",
1489+
__func__, peering_state.get_info().pgid, *m, m->get_tid()));
14761490
}
14771491
return seastar::now();
14781492
}

0 commit comments

Comments
 (0)