Skip to content

Commit 0fe3ccb

Browse files
authored
Merge pull request ceph#55217 from ronen-fr/wip-rf-old-reserv
osd/scrub: check reservation replies for relevance Reviewed-by: Samuel Just <[email protected]>-
2 parents 24dcb42 + 7559a6c commit 0fe3ccb

File tree

5 files changed

+200
-52
lines changed

5 files changed

+200
-52
lines changed

src/messages/MOSDScrubReserve.h

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@
1919

2020
class MOSDScrubReserve : public MOSDFastDispatchOp {
2121
private:
22-
static constexpr int HEAD_VERSION = 1;
22+
static constexpr int HEAD_VERSION = 2;
2323
static constexpr int COMPAT_VERSION = 1;
2424
public:
25+
using reservation_nonce_t = uint32_t;
26+
2527
spg_t pgid;
2628
epoch_t map_epoch;
2729
enum ReserveMsgOp {
@@ -32,6 +34,7 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
3234
};
3335
int32_t type;
3436
pg_shard_t from;
37+
reservation_nonce_t reservation_nonce{0};
3538

3639
epoch_t get_map_epoch() const override {
3740
return map_epoch;
@@ -46,10 +49,11 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
4649
MOSDScrubReserve(spg_t pgid,
4750
epoch_t map_epoch,
4851
int type,
49-
pg_shard_t from)
52+
pg_shard_t from,
53+
reservation_nonce_t nonce)
5054
: MOSDFastDispatchOp{MSG_OSD_SCRUB_RESERVE, HEAD_VERSION, COMPAT_VERSION},
5155
pgid(pgid), map_epoch(map_epoch),
52-
type(type), from(from) {}
56+
type(type), from(from), reservation_nonce{nonce} {}
5357

5458
std::string_view get_type_name() const {
5559
return "MOSDScrubReserve";
@@ -71,7 +75,8 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
7175
out << "RELEASE ";
7276
break;
7377
}
74-
out << "e" << map_epoch << ")";
78+
out << "e" << map_epoch << " from: " << from
79+
<< " reservation_nonce: " << reservation_nonce << ")";
7580
return;
7681
}
7782

@@ -82,6 +87,13 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
8287
decode(map_epoch, p);
8388
decode(type, p);
8489
decode(from, p);
90+
if (header.version >= 2) {
91+
decode(reservation_nonce, p);
92+
} else {
93+
// a zero nonce (identifying legacy senders) is ignored when
94+
// checking the request for obsolescence
95+
reservation_nonce = 0;
96+
}
8597
}
8698

8799
void encode_payload(uint64_t features) {
@@ -90,6 +102,7 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
90102
encode(map_epoch, payload);
91103
encode(type, payload);
92104
encode(from, payload);
105+
encode(reservation_nonce, payload);
93106
}
94107
private:
95108
template<class T, typename... Args>

src/osd/scrubber/scrub_machine.cc

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,9 @@ ReservingReplicas::ReservingReplicas(my_context ctx)
230230
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
231231

232232
// initiate the reservation process
233-
session.m_reservations.emplace(*scrbr, *session.m_perf_set);
233+
session.m_reservations.emplace(
234+
*scrbr, context<PrimaryActive>().last_request_sent_nonce,
235+
*session.m_perf_set);
234236

235237
if (session.m_reservations->get_last_sent()) {
236238
// the 1'st reservation request was sent
@@ -263,9 +265,9 @@ sc::result ReservingReplicas::react(const ReplicaGrant& ev)
263265
{
264266
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
265267
dout(10) << "ReservingReplicas::react(const ReplicaGrant&)" << dendl;
268+
const auto& m = ev.m_op->get_req<MOSDScrubReserve>();
266269

267-
if (context<Session>().m_reservations->handle_reserve_grant(
268-
ev.m_op, ev.m_from)) {
270+
if (context<Session>().m_reservations->handle_reserve_grant(*m, ev.m_from)) {
269271
// we are done with the reservation process
270272
return transit<ActiveScrubbing>();
271273
}
@@ -277,13 +279,21 @@ sc::result ReservingReplicas::react(const ReplicaReject& ev)
277279
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
278280
auto& session = context<Session>();
279281
dout(10) << "ReservingReplicas::react(const ReplicaReject&)" << dendl;
280-
session.m_reservations->log_failure_and_duration(scrbcnt_resrv_rejected);
281-
282-
// manipulate the 'next to reserve' iterator to exclude
283-
// the rejecting replica from the set of replicas requiring release
284-
session.m_reservations->verify_rejections_source(ev.m_op, ev.m_from);
282+
const auto m = ev.m_op->get_req<MOSDScrubReserve>();
283+
284+
// Verify that the message is from the replica we were expecting a reply from,
285+
// and that the message is not stale. If all is well - this is a real rejection:
286+
// - log required details;
287+
// - manipulate the 'next to reserve' iterator to exclude
288+
// the rejecting replica from the set of replicas requiring release
289+
if (!session.m_reservations->handle_reserve_rejection(*m, ev.m_from)) {
290+
// stale or unexpected
291+
return discard_event();
292+
}
285293

286-
// set 'reservation failure' as the scrub termination cause (affecting
294+
// The rejection was carrying the correct reservation_nonce. It was
295+
// logged by handle_reserve_rejection().
296+
// Set 'reservation failure' as the scrub termination cause (affecting
287297
// the rescheduling of this PG)
288298
scrbr->flag_reservations_failure();
289299

@@ -777,7 +787,13 @@ ReplicaActive::~ReplicaActive()
777787
void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
778788
{
779789
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
780-
dout(10) << "ReplicaActive::on_reserve_req()" << dendl;
790+
const auto m = ev.m_op->get_req<MOSDScrubReserve>();
791+
const auto msg_nonce = m->reservation_nonce;
792+
dout(10)
793+
<< fmt::format(
794+
"ReplicaActive::on_reserve_req() from {} (reservation_nonce:{})",
795+
ev.m_from, msg_nonce)
796+
<< dendl;
781797

782798
if (reserved_by_my_primary) {
783799
dout(10) << "ReplicaActive::on_reserve_req(): already reserved" << dendl;
@@ -797,7 +813,7 @@ void ReplicaActive::on_reserve_req(const ReplicaReserveReq& ev)
797813

798814
Message* reply = new MOSDScrubReserve(
799815
spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch, ret.op,
800-
m_pg->pg_whoami);
816+
m_pg->pg_whoami, msg_nonce);
801817
m_osds->send_message_osd_cluster(reply, ev.m_op->get_req()->get_connection());
802818
}
803819

src/osd/scrubber/scrub_machine.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,22 @@ struct PrimaryActive : sc::state<PrimaryActive, ScrubMachine, PrimaryIdle>,
440440
using reactions = mpl::list<
441441
// when the interval ends - we may not be a primary anymore
442442
sc::transition<IntervalChanged, NotActive>>;
443+
444+
/**
445+
* Identifies a specific reservation request.
446+
* The primary is permitted to cancel outstanding reservation requests without
447+
* waiting for the pending response from the replica. Thus, we may, in general,
448+
* see responses from prior reservation attempts that we need to ignore. Each
449+
* reservation request is therefore associated with a nonce incremented within
450+
* an interval with each reservation request. Any response with a non-matching
451+
* nonce must be from a reservation request we canceled. Note that this check
452+
* occurs after validating that the message is from the current interval, so
453+
* reusing nonces between intervals is safe.
454+
*
455+
* 0 is a special value used to indicate that the sender did not include a nonce due
456+
* to not being a sufficiently recent version.
457+
*/
458+
reservation_nonce_t last_request_sent_nonce{1};
443459
};
444460

445461
/**

src/osd/scrubber/scrub_reservations.cc

Lines changed: 86 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
#include <span>
77

88
#include "common/ceph_time.h"
9-
#include "messages/MOSDScrubReserve.h"
109
#include "osd/OSD.h"
1110
#include "osd/PG.h"
1211
#include "osd/osd_types_fmt.h"
@@ -31,11 +30,13 @@ namespace Scrub {
3130

3231
ReplicaReservations::ReplicaReservations(
3332
ScrubMachineListener& scrbr,
33+
reservation_nonce_t& nonce,
3434
PerfCounters& pc)
3535
: m_scrubber{scrbr}
3636
, m_pg{m_scrubber.get_pg()}
3737
, m_pgid{m_scrubber.get_spgid().pgid}
3838
, m_osds{m_pg->get_pg_osd(ScrubberPasskey())}
39+
, m_last_request_sent_nonce{nonce}
3940
, m_perf_set{pc}
4041
{
4142
// the acting set is sorted by pg_shard_t. The reservations are to be issued
@@ -80,7 +81,7 @@ void ReplicaReservations::release_all()
8081
for (const auto& peer : replicas) {
8182
auto m = make_message<MOSDScrubReserve>(
8283
spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::RELEASE,
83-
m_pg->pg_whoami);
84+
m_pg->pg_whoami, 0);
8485
m_pg->send_cluster_message(peer.osd, m, epoch, false);
8586
}
8687

@@ -125,16 +126,50 @@ ReplicaReservations::~ReplicaReservations()
125126
log_failure_and_duration(scrbcnt_resrv_aborted);
126127
}
127128

128-
bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
129+
bool ReplicaReservations::is_reservation_response_relevant(
130+
reservation_nonce_t msg_nonce) const
129131
{
130-
// verify that the grant is from the peer we expected. If not?
131-
// for now - abort the OSD. \todo reconsider the reaction.
132-
if (!get_last_sent().has_value() || from != *get_last_sent()) {
132+
return (msg_nonce == 0) || (msg_nonce == m_last_request_sent_nonce);
133+
}
134+
135+
bool ReplicaReservations::is_msg_source_correct(pg_shard_t from) const
136+
{
137+
const auto exp_source = get_last_sent();
138+
return exp_source && from == *exp_source;
139+
}
140+
141+
bool ReplicaReservations::handle_reserve_grant(
142+
const MOSDScrubReserve& msg,
143+
pg_shard_t from)
144+
{
145+
if (!is_reservation_response_relevant(msg.reservation_nonce)) {
146+
// this is a stale response to a previous request (e.g. one that
147+
// timed-out). See m_last_request_sent_nonce for details.
133148
dout(1) << fmt::format(
134-
"unexpected grant from {} (expected {})", from,
135-
get_last_sent().value_or(pg_shard_t{}))
149+
"stale reservation response from {} with nonce {} vs. "
150+
"expected {} (e:{})",
151+
from, msg.reservation_nonce, m_last_request_sent_nonce,
152+
msg.map_epoch)
136153
<< dendl;
137-
ceph_assert(from == get_last_sent());
154+
return false;
155+
}
156+
157+
// verify that the grant is from the peer we expected. If not?
158+
// for now - abort the OSD. There is no known scenario in which a
159+
// grant message with a correct nonce can arrive from the wrong peer.
160+
// (we would not abort for arriving messages with nonce 0, as those
161+
// are legacy messages, for which the nonce was not verified).
162+
if (!is_msg_source_correct(from)) {
163+
const auto error_text = fmt::format(
164+
"unexpected reservation grant from {} vs. the expected {} (e:{} "
165+
"message nonce:{})",
166+
from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
167+
msg.reservation_nonce);
168+
dout(1) << error_text << dendl;
169+
if (msg.reservation_nonce != 0) {
170+
m_osds->clog->error() << error_text;
171+
ceph_abort_msg(error_text);
172+
}
138173
return false;
139174
}
140175

@@ -143,15 +178,15 @@ bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
143178
// log a warning if the response was slow to arrive
144179
if ((m_slow_response_warn_timeout > 0ms) &&
145180
(elapsed > m_slow_response_warn_timeout)) {
146-
dout(1) << fmt::format(
147-
"slow reservation response from {} ({}ms)", from,
148-
duration_cast<milliseconds>(elapsed).count())
149-
<< dendl;
181+
m_osds->clog->warn() << fmt::format(
182+
"slow reservation response from {} ({}ms)", from,
183+
duration_cast<milliseconds>(elapsed).count());
150184
// prevent additional warnings
151185
m_slow_response_warn_timeout = 0ms;
152186
}
153187
dout(10) << fmt::format(
154-
"granted by {} ({} of {}) in {}ms", from,
188+
"(e:{} nonce:{}) granted by {} ({} of {}) in {}ms",
189+
msg.map_epoch, msg.reservation_nonce, from,
155190
active_requests_cnt(), m_sorted_secondaries.size(),
156191
duration_cast<milliseconds>(elapsed).count())
157192
<< dendl;
@@ -170,44 +205,64 @@ bool ReplicaReservations::send_next_reservation_or_complete()
170205
// send the next reservation request
171206
const auto peer = *m_next_to_request;
172207
const auto epoch = m_pg->get_osdmap_epoch();
208+
m_last_request_sent_nonce++;
209+
173210
auto m = make_message<MOSDScrubReserve>(
174-
spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST,
175-
m_pg->pg_whoami);
211+
spg_t{m_pgid, peer.shard}, epoch, MOSDScrubReserve::REQUEST, m_pg->pg_whoami,
212+
m_last_request_sent_nonce);
176213
m_pg->send_cluster_message(peer.osd, m, epoch, false);
177214
m_last_request_sent_at = ScrubClock::now();
178215
dout(10) << fmt::format(
179-
"reserving {} (the {} of {} replicas)", *m_next_to_request,
180-
active_requests_cnt() + 1, m_sorted_secondaries.size())
216+
"reserving {} (the {} of {} replicas) e:{} nonce:{}",
217+
*m_next_to_request, active_requests_cnt() + 1,
218+
m_sorted_secondaries.size(), epoch, m_last_request_sent_nonce)
181219
<< dendl;
182220
m_next_to_request++;
183221
return false;
184222
}
185223

186-
void ReplicaReservations::verify_rejections_source(
187-
OpRequestRef op,
224+
bool ReplicaReservations::handle_reserve_rejection(
225+
const MOSDScrubReserve& msg,
188226
pg_shard_t from)
189227
{
190228
// a convenient log message for the reservation process conclusion
191229
// (matches the one in send_next_reservation_or_complete())
192230
dout(10) << fmt::format(
193-
"remote reservation failure. Rejected by {} ({})", from,
194-
*op->get_req())
231+
"remote reservation failure. Rejected by {} ({})", from, msg)
195232
<< dendl;
196233

234+
if (!is_reservation_response_relevant(msg.reservation_nonce)) {
235+
// this is a stale response to a previous request (e.g. one that
236+
// timed-out). See m_last_request_sent_nonce for details.
237+
dout(10) << fmt::format(
238+
"stale reservation response from {} with reservation_nonce "
239+
"{} vs. expected {} (e:{})",
240+
from, msg.reservation_nonce, m_last_request_sent_nonce,
241+
msg.map_epoch)
242+
<< dendl;
243+
return false;
244+
}
245+
246+
log_failure_and_duration(scrbcnt_resrv_rejected);
247+
248+
// we should never see a rejection carrying a valid
249+
// reservation nonce - arriving while we have no pending requests
250+
ceph_assert(get_last_sent().has_value() || msg.reservation_nonce == 0);
251+
197252
// verify that the denial is from the peer we expected. If not?
253+
// There is no known scenario in which this can happen, but if it does -
198254
// we should treat it as though the *correct* peer has rejected the request,
199255
// but remember to release that peer, too.
200-
201-
ceph_assert(get_last_sent().has_value());
202-
const auto expected = *get_last_sent();
203-
if (from != expected) {
204-
dout(1) << fmt::format(
205-
"unexpected rejection from {} (expected {})", from, expected)
206-
<< dendl;
207-
} else {
208-
// correct peer, wrong answer...
256+
if (is_msg_source_correct(from)) {
209257
m_next_to_request--; // no need to release this one
258+
} else {
259+
m_osds->clog->warn() << fmt::format(
260+
"unexpected reservation denial from {} vs the expected {} (e:{} "
261+
"message reservation_nonce:{})",
262+
from, get_last_sent().value_or(pg_shard_t{}), msg.map_epoch,
263+
msg.reservation_nonce);
210264
}
265+
return true;
211266
}
212267

213268
std::optional<pg_shard_t> ReplicaReservations::get_last_sent() const

0 commit comments

Comments
 (0)