66#include < span>
77
88#include " common/ceph_time.h"
9- #include " messages/MOSDScrubReserve.h"
109#include " osd/OSD.h"
1110#include " osd/PG.h"
1211#include " osd/osd_types_fmt.h"
@@ -31,11 +30,13 @@ namespace Scrub {
3130
3231ReplicaReservations::ReplicaReservations (
3332 ScrubMachineListener& scrbr,
33+ reservation_nonce_t & nonce,
3434 PerfCounters& pc)
3535 : m_scrubber{scrbr}
3636 , m_pg{m_scrubber.get_pg ()}
3737 , m_pgid{m_scrubber.get_spgid ().pgid }
3838 , m_osds{m_pg->get_pg_osd (ScrubberPasskey ())}
39+ , m_last_request_sent_nonce{nonce}
3940 , m_perf_set{pc}
4041{
4142 // the acting set is sorted by pg_shard_t. The reservations are to be issued
@@ -80,7 +81,7 @@ void ReplicaReservations::release_all()
8081 for (const auto & peer : replicas) {
8182 auto m = make_message<MOSDScrubReserve>(
8283 spg_t {m_pgid, peer.shard }, epoch, MOSDScrubReserve::RELEASE,
83- m_pg->pg_whoami );
84+ m_pg->pg_whoami , 0 );
8485 m_pg->send_cluster_message (peer.osd , m, epoch, false );
8586 }
8687
@@ -125,16 +126,50 @@ ReplicaReservations::~ReplicaReservations()
125126 log_failure_and_duration (scrbcnt_resrv_aborted);
126127}
127128
128- bool ReplicaReservations::handle_reserve_grant (OpRequestRef op, pg_shard_t from)
// Decides, from the nonce carried by a reservation reply, whether that reply
// should be acted upon. Returns true when msg_nonce is 0, or when it equals
// m_last_request_sent_nonce; returns false for any other value.
// (A comment in handle_reserve_grant() in this file describes nonce-0
// messages as legacy messages for which the nonce was not verified.)
// NOTE(review): the lines marked '-' below look like removed text of the
// earlier handle_reserve_grant() as rendered by the diff view, not part of
// this function - confirm against the real file.
129+ bool ReplicaReservations::is_reservation_response_relevant (
130+ reservation_nonce_t msg_nonce) const
129131{
130- // verify that the grant is from the peer we expected. If not?
131- // for now - abort the OSD. \todo reconsider the reaction.
132- if (!get_last_sent ().has_value () || from != *get_last_sent ()) {
132+ return (msg_nonce == 0 ) || (msg_nonce == m_last_request_sent_nonce);
133+ }
134+
// Checks the sender of a reply against the peer reported by get_last_sent().
// Returns true only when get_last_sent() holds a value and that value equals
// `from`; returns false when it is empty or names a different shard.
135+ bool ReplicaReservations::is_msg_source_correct (pg_shard_t from) const
136+ {
// get_last_sent() yields an optional; keep one copy so that the emptiness
// test and the dereference below refer to the same object.
137+ const auto exp_source = get_last_sent ();
138+ return exp_source && from == *exp_source;
139+ }
140+
141+ bool ReplicaReservations::handle_reserve_grant (
142+ const MOSDScrubReserve& msg,
143+ pg_shard_t from)
144+ {
145+ if (!is_reservation_response_relevant (msg.reservation_nonce )) {
146+ // this is a stale response to a previous request (e.g. one that
147+ // timed-out). See m_last_request_sent_nonce for details.
133148 dout (1 ) << fmt::format (
134- " unexpected grant from {} (expected {})" , from,
135- get_last_sent ().value_or (pg_shard_t {}))
149+ " stale reservation response from {} with nonce {} vs. "
150+ " expected {} (e:{})" ,
151+ from, msg.reservation_nonce , m_last_request_sent_nonce,
152+ msg.map_epoch )
136153 << dendl;
137- ceph_assert (from == get_last_sent ());
154+ return false ;
155+ }
156+
157+ // verify that the grant is from the peer we expected. If not?
158+ // for now - abort the OSD. There is no known scenario in which a
159+ // grant message with a correct nonce can arrive from the wrong peer.
160+ // (we would not abort for arriving messages with nonce 0, as those
161+ // are legacy messages, for which the nonce was not verified).
162+ if (!is_msg_source_correct (from)) {
163+ const auto error_text = fmt::format (
164+ " unexpected reservation grant from {} vs. the expected {} (e:{} "
165+ " message nonce:{})" ,
166+ from, get_last_sent ().value_or (pg_shard_t {}), msg.map_epoch ,
167+ msg.reservation_nonce );
168+ dout (1 ) << error_text << dendl;
169+ if (msg.reservation_nonce != 0 ) {
170+ m_osds->clog ->error () << error_text;
171+ ceph_abort_msg (error_text);
172+ }
138173 return false ;
139174 }
140175
@@ -143,15 +178,15 @@ bool ReplicaReservations::handle_reserve_grant(OpRequestRef op, pg_shard_t from)
143178 // log a warning if the response was slow to arrive
144179 if ((m_slow_response_warn_timeout > 0ms) &&
145180 (elapsed > m_slow_response_warn_timeout)) {
146- dout (1 ) << fmt::format (
147- " slow reservation response from {} ({}ms)" , from,
148- duration_cast<milliseconds>(elapsed).count ())
149- << dendl;
181+ m_osds->clog ->warn () << fmt::format (
182+ " slow reservation response from {} ({}ms)" , from,
183+ duration_cast<milliseconds>(elapsed).count ());
150184 // prevent additional warnings
151185 m_slow_response_warn_timeout = 0ms;
152186 }
153187 dout (10 ) << fmt::format (
154- " granted by {} ({} of {}) in {}ms" , from,
188+ " (e:{} nonce:{}) granted by {} ({} of {}) in {}ms" ,
189+ msg.map_epoch , msg.reservation_nonce , from,
155190 active_requests_cnt (), m_sorted_secondaries.size (),
156191 duration_cast<milliseconds>(elapsed).count ())
157192 << dendl;
@@ -170,44 +205,64 @@ bool ReplicaReservations::send_next_reservation_or_complete()
170205 // send the next reservation request
171206 const auto peer = *m_next_to_request;
172207 const auto epoch = m_pg->get_osdmap_epoch ();
208+ m_last_request_sent_nonce++;
209+
173210 auto m = make_message<MOSDScrubReserve>(
174- spg_t {m_pgid, peer.shard }, epoch, MOSDScrubReserve::REQUEST,
175- m_pg-> pg_whoami );
211+ spg_t {m_pgid, peer.shard }, epoch, MOSDScrubReserve::REQUEST, m_pg-> pg_whoami ,
212+ m_last_request_sent_nonce );
176213 m_pg->send_cluster_message (peer.osd , m, epoch, false );
177214 m_last_request_sent_at = ScrubClock::now ();
178215 dout (10 ) << fmt::format (
179- " reserving {} (the {} of {} replicas)" , *m_next_to_request,
180- active_requests_cnt () + 1 , m_sorted_secondaries.size ())
216+ " reserving {} (the {} of {} replicas) e:{} nonce:{}" ,
217+ *m_next_to_request, active_requests_cnt () + 1 ,
218+ m_sorted_secondaries.size (), epoch, m_last_request_sent_nonce)
181219 << dendl;
182220 m_next_to_request++;
183221 return false ;
184222}
185223
186- void ReplicaReservations::verify_rejections_source (
187- OpRequestRef op ,
// Processes a reservation-denial message `msg` that arrived from shard `from`.
//
// Returns false, after logging only, when the nonce carried by the message
// does not pass is_reservation_response_relevant() - the reply is then
// treated as stale and nothing else is touched.
// Returns true otherwise, after:
//  - calling log_failure_and_duration(scrbcnt_resrv_rejected);
//  - asserting that either get_last_sent() has a value or the nonce is 0;
//  - stepping m_next_to_request back by one when the sender is the expected
//    peer, or emitting a warning through m_osds->clog when it is not.
224+ bool ReplicaReservations::handle_reserve_rejection (
225+ const MOSDScrubReserve& msg ,
188226 pg_shard_t from)
189227{
190228 // a convenient log message for the reservation process conclusion
191229 // (matches the one in send_next_reservation_or_complete())
192230 dout (10 ) << fmt::format (
193- " remote reservation failure. Rejected by {} ({})" , from,
194- *op->get_req ())
231+ " remote reservation failure. Rejected by {} ({})" , from, msg)
195232 << dendl;
196233
// note: the message above is logged before the staleness check, so it is
// emitted for stale rejections as well.
234+ if (!is_reservation_response_relevant (msg.reservation_nonce )) {
235+ // this is a stale response to a previous request (e.g. one that
236+ // timed-out). See m_last_request_sent_nonce for details.
237+ dout (10 ) << fmt::format (
238+ " stale reservation response from {} with reservation_nonce "
239+ " {} vs. expected {} (e:{})" ,
240+ from, msg.reservation_nonce , m_last_request_sent_nonce,
241+ msg.map_epoch )
242+ << dendl;
243+ return false ;
244+ }
245+
// reached only for replies that passed the nonce check above
246+ log_failure_and_duration (scrbcnt_resrv_rejected);
247+
248+ // we should never see a rejection carrying a valid
249+ // reservation nonce - arriving while we have no pending requests
250+ ceph_assert (get_last_sent ().has_value () || msg.reservation_nonce == 0 );
251+
197252 // verify that the denial is from the peer we expected. If not?
253+ // There is no known scenario in which this can happen, but if it does -
198254 // we should treat it as though the *correct* peer has rejected the request,
199255 // but remember to release that peer, too.
200-
201- ceph_assert (get_last_sent ().has_value ());
202- const auto expected = *get_last_sent ();
203- if (from != expected) {
204- dout (1 ) << fmt::format (
205- " unexpected rejection from {} (expected {})" , from, expected)
206- << dendl;
207- } else {
208- // correct peer, wrong answer...
256+ if (is_msg_source_correct (from)) {
209257 m_next_to_request--; // no need to release this one
258+ } else {
// unexpected sender: m_next_to_request is left as is; only a warning is
// issued. The format call falls back to a default-constructed pg_shard_t
// when get_last_sent() is empty.
259+ m_osds->clog ->warn () << fmt::format (
260+ " unexpected reservation denial from {} vs the expected {} (e:{} "
261+ " message reservation_nonce:{})" ,
262+ from, get_last_sent ().value_or (pg_shard_t {}), msg.map_epoch ,
263+ msg.reservation_nonce );
210264 }
265+ return true ;
211266}
212267
213268std::optional<pg_shard_t > ReplicaReservations::get_last_sent () const
0 commit comments