@@ -85,6 +85,13 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
8585 return out;
8686}
8787
88+ void PgScrubber::on_replica_activate ()
89+ {
90+ dout (10 ) << __func__ << dendl;
91+ m_fsm->process_event (ReplicaActivate{});
92+ }
93+
94+
8895/*
8996 * if the incoming message is from a previous interval, it must mean
9097 * PrimaryLogPG::on_change() was called when that interval ended. We can safely
@@ -197,7 +204,6 @@ bool PgScrubber::should_abort() const
197204 *
198205 * Some of the considerations above are also relevant to the replica-side
199206 * initiation
200- * ('StartReplica' & 'StartReplicaNoWait').
201207 */
202208
203209void PgScrubber::initiate_regular_scrub (epoch_t epoch_queued)
@@ -216,11 +222,6 @@ void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
216222 }
217223}
218224
219- void PgScrubber::dec_scrubs_remote ()
220- {
221- m_osds->get_scrub_services ().dec_scrubs_remote ();
222- }
223-
224225void PgScrubber::advance_token ()
225226{
226227 m_current_token++;
@@ -274,13 +275,7 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued,
274275 }
275276
276277 if (check_interval (epoch_queued) && is_token_current (token)) {
277- // save us some time by not waiting for updates if there are none
278- // to wait for. Affects the transition from NotActive into either
279- // ReplicaWaitUpdates or ActiveReplica.
280- if (pending_active_pushes ())
281- m_fsm->process_event (StartReplica{});
282- else
283- m_fsm->process_event (StartReplicaNoWait{});
278+ m_fsm->process_event (StartReplica{});
284279 }
285280 dout (10 ) << " scrubber event --<< " << __func__ << dendl;
286281}
@@ -452,6 +447,11 @@ unsigned int PgScrubber::scrub_requeue_priority(
452447 * Responsible for resetting any scrub state and releasing any resources.
453448 * Any inflight events will be ignored via check_interval/should_drop_message
454449 * or canceled.
450+ * Specifically:
451+ * - if Primary and in an active session - the IntervalChanged handler takes
452+ * care of discarding the remote reservations, and transitioning out of
453+ * Session. That resets both the scrubber and the FSM.
454+ * - if we are a reserved replica - we need to free ourselves;
455455 */
456456void PgScrubber::on_new_interval ()
457457{
@@ -461,13 +461,7 @@ void PgScrubber::on_new_interval()
461461 is_scrub_active (), is_queued_or_active ())
462462 << dendl;
463463
464- // If in active session - the IntervalChanged handler takes care of
465- // discarding the remote reservations, and transitioning out of Session.
466- // That resets both the scrubber and the FSM.
467464 m_fsm->process_event (IntervalChanged{});
468-
469- // The 'FullReset' is only relevant if we are not an active Primary
470- m_fsm->process_event (FullReset{});
471465 rm_from_osd_scrubbing ();
472466}
473467
@@ -806,7 +800,7 @@ void PgScrubber::cancel_callback(scrubber_callback_cancel_token_t token)
806800 m_osds->sleep_timer .cancel_event (token);
807801}
808802
809- LogChannelRef & PgScrubber::get_clog () const
803+ LogChannelRef& PgScrubber::get_clog () const
810804{
811805 return m_osds->clog ;
812806}
@@ -816,6 +810,11 @@ int PgScrubber::get_whoami() const
816810 return m_osds->whoami ;
817811}
818812
813+ [[nodiscard]] bool PgScrubber::is_high_priority () const
814+ {
815+ return m_flags.required ;
816+ }
817+
819818/*
820819 * The selected range is set directly into 'm_start' and 'm_end'
821820 * setting:
@@ -1139,13 +1138,7 @@ void PgScrubber::on_init()
11391138 m_pg->publish_stats_to_osd ();
11401139}
11411140
1142- /*
1143- * Note: as on_replica_init() is likely to be called twice (entering
1144- * both ReplicaWaitUpdates & ActiveReplica), its operations should be
1145- * idempotent.
1146- * Now that it includes some state-changing operations, we need to check
1147- * m_active against double-activation.
1148- */
1141+
11491142void PgScrubber::on_replica_init ()
11501143{
11511144 dout (10 ) << __func__ << " called with 'active' "
@@ -1159,6 +1152,7 @@ void PgScrubber::on_replica_init()
11591152 }
11601153}
11611154
1155+
11621156int PgScrubber::build_primary_map_chunk ()
11631157{
11641158 epoch_t map_building_since = m_pg->get_osdmap_epoch ();
@@ -1217,23 +1211,21 @@ int PgScrubber::build_replica_map_chunk()
12171211
12181212 // the local map has been created. Send it to the primary.
12191213 // Note: once the message reaches the Primary, it may ask us for another
1220- // chunk - and we better be done with the current scrub. Thus - the
1221- // preparation of the reply message is separate, and we clear the scrub
1222- // state before actually sending it.
1214+ // chunk - and we better be done with the current scrub. The clearing of
1215+ // state must be complete before we relinquish the PG lock.
12231216
1224- auto reply = prep_replica_map_msg (PreemptionNoted::no_preemption);
1225- replica_handling_done ();
1226- dout (15 ) << __func__ << " chunk map sent " << dendl;
1227- send_replica_map (reply);
1228- } break ;
1217+ send_replica_map (prep_replica_map_msg (PreemptionNoted::no_preemption));
1218+ dout (15 ) << fmt::format (" {}: chunk map sent" , __func__) << dendl;
1219+ }
1220+ break ;
12291221
12301222 default :
12311223 // negative retval: build_scrub_map_chunk() signalled an error
12321224 // Pre-Pacific code ignored this option, treating it as a success.
12331225 // \todo Add an error flag in the returning message.
1226+ // \todo: must either abort, send a reply, or return some error message
12341227 dout (1 ) << " Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
12351228 << ret << dendl;
1236- replica_handling_done ();
12371229 // only in debug mode for now:
12381230 assert (false && " backend error" );
12391231 break ;
@@ -1520,6 +1512,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
15201512 replica_scrubmap_pos.reset (); // needed? RRR
15211513
15221514 set_queued_or_active ();
1515+ advance_token ();
15231516 m_osds->queue_for_rep_scrub (m_pg,
15241517 m_replica_request_priority,
15251518 m_flags.priority ,
@@ -1675,7 +1668,7 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16751668 auto m = op->get_req <MOSDScrubReserve>();
16761669 switch (m->type ) {
16771670 case MOSDScrubReserve::REQUEST:
1678- handle_scrub_reserve_request (op );
1671+ m_fsm-> process_event (ReplicaReserveReq{op, m-> from } );
16791672 break ;
16801673 case MOSDScrubReserve::GRANT:
16811674 m_fsm->process_event (ReplicaGrant{op, m->from });
@@ -1684,65 +1677,12 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16841677 m_fsm->process_event (ReplicaReject{op, m->from });
16851678 break ;
16861679 case MOSDScrubReserve::RELEASE:
1687- handle_scrub_reserve_release (op );
1680+ m_fsm-> process_event (ReplicaRelease{op, m-> from } );
16881681 break ;
16891682 }
16901683}
16911684
16921685
1693- void PgScrubber::handle_scrub_reserve_request (OpRequestRef op)
1694- {
1695- auto request_ep = op->sent_epoch ;
1696- dout (20 ) << fmt::format (" {}: request_ep:{} recovery:{}" ,
1697- __func__,
1698- request_ep,
1699- m_osds->is_recovery_active ())
1700- << dendl;
1701-
1702- // The primary may unilaterally restart the scrub process without notifying
1703- // replicas. Unconditionally clear any existing state prior to handling
1704- // the new reservation.
1705- m_fsm->process_event (FullReset{});
1706-
1707- bool granted{false };
1708- if (m_pg->cct ->_conf ->osd_scrub_during_recovery ||
1709- !m_osds->is_recovery_active ()) {
1710-
1711- granted = m_osds->get_scrub_services ().inc_scrubs_remote ();
1712- if (granted) {
1713- m_fsm->process_event (ReplicaGrantReservation{});
1714- } else {
1715- dout (20 ) << __func__ << " : failed to reserve remotely" << dendl;
1716- }
1717- } else {
1718- dout (10 ) << __func__ << " : recovery is active; not granting" << dendl;
1719- }
1720-
1721- dout (10 ) << __func__ << " reserved? " << (granted ? " yes" : " no" ) << dendl;
1722-
1723- Message* reply = new MOSDScrubReserve (
1724- spg_t (m_pg->info .pgid .pgid , m_pg->get_primary ().shard ),
1725- request_ep,
1726- granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
1727- m_pg_whoami);
1728-
1729- m_osds->send_message_osd_cluster (reply, op->get_req ()->get_connection ());
1730- }
1731-
1732- void PgScrubber::handle_scrub_reserve_release (OpRequestRef op)
1733- {
1734- dout (10 ) << __func__ << " " << *op->get_req () << dendl;
1735- if (should_drop_message (op)) {
1736- // we might have turned into a Primary in the meantime. The interval
1737- // change should have been noticed already, and caused us to reset.
1738- return ;
1739- }
1740-
1741- // this specific scrub session has terminated. All incoming events carrying
1742- // the old tag will be discarded.
1743- m_fsm->process_event (FullReset{});
1744- }
1745-
17461686bool PgScrubber::set_reserving_now () {
17471687 return m_osds->get_scrub_services ().set_reserving_now (m_pg_id,
17481688 ceph_clock_now ());
@@ -2211,6 +2151,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
22112151
22122152PgScrubber::~PgScrubber ()
22132153{
2154+ m_fsm->process_event (IntervalChanged{});
22142155 if (m_scrub_job) {
22152156 // make sure the OSD won't try to scrub this one just now
22162157 rm_from_osd_scrubbing ();
0 commit comments