Skip to content

Commit 8ec47bb

Browse files
authored
Merge pull request ceph#54482 from ronen-fr/wip-rf-repl-hp
osd/scrub: decouple being reserved from handling scrub requests Reviewed-by: Samuel Just <[email protected]>
2 parents 28f26d8 + 6f1e0e6 commit 8ec47bb

17 files changed

+457
-252
lines changed

qa/standalone/scrub/osd-scrub-dump.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,20 @@
1515
# GNU Library Public License for more details.
1616
#
1717

18+
19+
# 30.11.2023: the test is now disabled, as the reservation mechanism has been
20+
# thoroughly reworked and the test is no longer valid. The test is left here
21+
# as a basis for a new set of primary vs. replicas scrub activation tests.
22+
1823
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
1924

2025
MAX_SCRUBS=4
2126
SCRUB_SLEEP=3
2227
POOL_SIZE=3
2328

2429
function run() {
30+
echo "This test is disabled"
31+
return 0
2532
local dir=$1
2633
shift
2734
local CHUNK_MAX=5
@@ -123,7 +130,7 @@ function TEST_recover_unexpected() {
123130
for o in $(seq 0 $(expr $OSDS - 1))
124131
do
125132
CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
126-
scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
133+
scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .granted_reservations')
127134
if [ $scrubs -gt $MAX_SCRUBS ]; then
128135
echo "ERROR: More than $MAX_SCRUBS currently reserved"
129136
return 1

src/messages/MOSDScrubReserve.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
2424
public:
2525
spg_t pgid;
2626
epoch_t map_epoch;
27-
enum {
27+
enum ReserveMsgOp {
2828
REQUEST = 0,
2929
GRANT = 1,
3030
RELEASE = 2,

src/osd/PG.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,6 +1823,11 @@ void PG::on_activate(interval_set<snapid_t> snaps)
18231823
m_scrubber->on_pg_activate(m_planned_scrub);
18241824
}
18251825

1826+
void PG::on_replica_activate()
1827+
{
1828+
m_scrubber->on_replica_activate();
1829+
}
1830+
18261831
void PG::on_active_exit()
18271832
{
18281833
backfill_reserving = false;

src/osd/PG.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,8 @@ class PG : public DoutPrefixProvider,
624624

625625
void on_activate(interval_set<snapid_t> snaps) override;
626626

627+
void on_replica_activate() override;
628+
627629
void on_activate_committed() override;
628630

629631
void on_active_actmap() override;

src/osd/PeeringState.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2967,6 +2967,8 @@ void PeeringState::activate(
29672967

29682968
state_set(PG_STATE_ACTIVATING);
29692969
pl->on_activate(std::move(to_trim));
2970+
} else {
2971+
pl->on_replica_activate();
29702972
}
29712973
if (acting_set_writeable()) {
29722974
PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};

src/osd/PeeringState.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ class PeeringState : public MissingLoc::MappingInfo {
389389
virtual void on_role_change() = 0;
390390
virtual void on_change(ObjectStore::Transaction &t) = 0;
391391
virtual void on_activate(interval_set<snapid_t> to_trim) = 0;
392+
virtual void on_replica_activate() {}
392393
virtual void on_activate_complete() = 0;
393394
virtual void on_new_interval() = 0;
394395
virtual Context *on_clean() = 0;

src/osd/scrubber/osd_scrub.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -441,14 +441,14 @@ void OsdScrub::dec_scrubs_local()
441441
m_resource_bookkeeper.dec_scrubs_local();
442442
}
443443

444-
bool OsdScrub::inc_scrubs_remote()
444+
bool OsdScrub::inc_scrubs_remote(pg_t pgid)
445445
{
446-
return m_resource_bookkeeper.inc_scrubs_remote();
446+
return m_resource_bookkeeper.inc_scrubs_remote(pgid);
447447
}
448448

449-
void OsdScrub::dec_scrubs_remote()
449+
void OsdScrub::dec_scrubs_remote(pg_t pgid)
450450
{
451-
m_resource_bookkeeper.dec_scrubs_remote();
451+
m_resource_bookkeeper.dec_scrubs_remote(pgid);
452452
}
453453

454454
void OsdScrub::mark_pg_scrub_blocked(spg_t blocked_pg)

src/osd/scrubber/osd_scrub.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ class OsdScrub {
6767
// updating the resource counters
6868
bool inc_scrubs_local();
6969
void dec_scrubs_local();
70-
bool inc_scrubs_remote();
71-
void dec_scrubs_remote();
70+
bool inc_scrubs_remote(pg_t pgid);
71+
void dec_scrubs_remote(pg_t pgid);
7272

7373
// counting the number of PGs stuck while scrubbing, waiting for objects
7474
void mark_pg_scrub_blocked(spg_t blocked_pg);

src/osd/scrubber/pg_scrubber.cc

Lines changed: 32 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
8585
return out;
8686
}
8787

88+
void PgScrubber::on_replica_activate()
89+
{
90+
dout(10) << __func__ << dendl;
91+
m_fsm->process_event(ReplicaActivate{});
92+
}
93+
94+
8895
/*
8996
* if the incoming message is from a previous interval, it must mean
9097
* PrimaryLogPG::on_change() was called when that interval ended. We can safely
@@ -197,7 +204,6 @@ bool PgScrubber::should_abort() const
197204
*
198205
* Some of the considerations above are also relevant to the replica-side
199206
* initiation
200-
* ('StartReplica' & 'StartReplicaNoWait').
201207
*/
202208

203209
void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
@@ -216,11 +222,6 @@ void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
216222
}
217223
}
218224

219-
void PgScrubber::dec_scrubs_remote()
220-
{
221-
m_osds->get_scrub_services().dec_scrubs_remote();
222-
}
223-
224225
void PgScrubber::advance_token()
225226
{
226227
m_current_token++;
@@ -274,13 +275,7 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued,
274275
}
275276

276277
if (check_interval(epoch_queued) && is_token_current(token)) {
277-
// save us some time by not waiting for updates if there are none
278-
// to wait for. Affects the transition from NotActive into either
279-
// ReplicaWaitUpdates or ActiveReplica.
280-
if (pending_active_pushes())
281-
m_fsm->process_event(StartReplica{});
282-
else
283-
m_fsm->process_event(StartReplicaNoWait{});
278+
m_fsm->process_event(StartReplica{});
284279
}
285280
dout(10) << "scrubber event --<< " << __func__ << dendl;
286281
}
@@ -452,6 +447,11 @@ unsigned int PgScrubber::scrub_requeue_priority(
452447
* Responsible for resetting any scrub state and releasing any resources.
453448
* Any inflight events will be ignored via check_interval/should_drop_message
454449
* or canceled.
450+
* Specifically:
451+
* - if Primary and in an active session - the IntervalChanged handler takes
452+
* care of discarding the remote reservations, and transitioning out of
453+
* Session. That resets both the scrubber and the FSM.
454+
* - if we are a reserved replica - we need to free ourselves;
455455
*/
456456
void PgScrubber::on_new_interval()
457457
{
@@ -461,13 +461,7 @@ void PgScrubber::on_new_interval()
461461
is_scrub_active(), is_queued_or_active())
462462
<< dendl;
463463

464-
// If in active session - the IntervalChanged handler takes care of
465-
// discarding the remote reservations, and transitioning out of Session.
466-
// That resets both the scrubber and the FSM.
467464
m_fsm->process_event(IntervalChanged{});
468-
469-
// The 'FullReset' is only relevant if we are not an active Primary
470-
m_fsm->process_event(FullReset{});
471465
rm_from_osd_scrubbing();
472466
}
473467

@@ -806,7 +800,7 @@ void PgScrubber::cancel_callback(scrubber_callback_cancel_token_t token)
806800
m_osds->sleep_timer.cancel_event(token);
807801
}
808802

809-
LogChannelRef &PgScrubber::get_clog() const
803+
LogChannelRef& PgScrubber::get_clog() const
810804
{
811805
return m_osds->clog;
812806
}
@@ -816,6 +810,11 @@ int PgScrubber::get_whoami() const
816810
return m_osds->whoami;
817811
}
818812

813+
[[nodiscard]] bool PgScrubber::is_high_priority() const
814+
{
815+
return m_flags.required;
816+
}
817+
819818
/*
820819
* The selected range is set directly into 'm_start' and 'm_end'
821820
* setting:
@@ -1139,13 +1138,7 @@ void PgScrubber::on_init()
11391138
m_pg->publish_stats_to_osd();
11401139
}
11411140

1142-
/*
1143-
* Note: as on_replica_init() is likely to be called twice (entering
1144-
* both ReplicaWaitUpdates & ActiveReplica), its operations should be
1145-
* idempotent.
1146-
* Now that it includes some state-changing operations, we need to check
1147-
* m_active against double-activation.
1148-
*/
1141+
11491142
void PgScrubber::on_replica_init()
11501143
{
11511144
dout(10) << __func__ << " called with 'active' "
@@ -1159,6 +1152,7 @@ void PgScrubber::on_replica_init()
11591152
}
11601153
}
11611154

1155+
11621156
int PgScrubber::build_primary_map_chunk()
11631157
{
11641158
epoch_t map_building_since = m_pg->get_osdmap_epoch();
@@ -1217,23 +1211,21 @@ int PgScrubber::build_replica_map_chunk()
12171211

12181212
// the local map has been created. Send it to the primary.
12191213
// Note: once the message reaches the Primary, it may ask us for another
1220-
// chunk - and we better be done with the current scrub. Thus - the
1221-
// preparation of the reply message is separate, and we clear the scrub
1222-
// state before actually sending it.
1214+
// chunk - and we better be done with the current scrub. The clearing of
1215+
// state must be complete before we relinquish the PG lock.
12231216

1224-
auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
1225-
replica_handling_done();
1226-
dout(15) << __func__ << " chunk map sent " << dendl;
1227-
send_replica_map(reply);
1228-
} break;
1217+
send_replica_map(prep_replica_map_msg(PreemptionNoted::no_preemption));
1218+
dout(15) << fmt::format("{}: chunk map sent", __func__) << dendl;
1219+
}
1220+
break;
12291221

12301222
default:
12311223
// negative retval: build_scrub_map_chunk() signalled an error
12321224
// Pre-Pacific code ignored this option, treating it as a success.
12331225
// \todo Add an error flag in the returning message.
1226+
// \todo: must either abort, send a reply, or return some error message
12341227
dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
12351228
<< ret << dendl;
1236-
replica_handling_done();
12371229
// only in debug mode for now:
12381230
assert(false && "backend error");
12391231
break;
@@ -1520,6 +1512,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
15201512
replica_scrubmap_pos.reset(); // needed? RRR
15211513

15221514
set_queued_or_active();
1515+
advance_token();
15231516
m_osds->queue_for_rep_scrub(m_pg,
15241517
m_replica_request_priority,
15251518
m_flags.priority,
@@ -1675,7 +1668,7 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16751668
auto m = op->get_req<MOSDScrubReserve>();
16761669
switch (m->type) {
16771670
case MOSDScrubReserve::REQUEST:
1678-
handle_scrub_reserve_request(op);
1671+
m_fsm->process_event(ReplicaReserveReq{op, m->from});
16791672
break;
16801673
case MOSDScrubReserve::GRANT:
16811674
m_fsm->process_event(ReplicaGrant{op, m->from});
@@ -1684,65 +1677,12 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16841677
m_fsm->process_event(ReplicaReject{op, m->from});
16851678
break;
16861679
case MOSDScrubReserve::RELEASE:
1687-
handle_scrub_reserve_release(op);
1680+
m_fsm->process_event(ReplicaRelease{op, m->from});
16881681
break;
16891682
}
16901683
}
16911684

16921685

1693-
void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
1694-
{
1695-
auto request_ep = op->sent_epoch;
1696-
dout(20) << fmt::format("{}: request_ep:{} recovery:{}",
1697-
__func__,
1698-
request_ep,
1699-
m_osds->is_recovery_active())
1700-
<< dendl;
1701-
1702-
// The primary may unilaterally restart the scrub process without notifying
1703-
// replicas. Unconditionally clear any existing state prior to handling
1704-
// the new reservation.
1705-
m_fsm->process_event(FullReset{});
1706-
1707-
bool granted{false};
1708-
if (m_pg->cct->_conf->osd_scrub_during_recovery ||
1709-
!m_osds->is_recovery_active()) {
1710-
1711-
granted = m_osds->get_scrub_services().inc_scrubs_remote();
1712-
if (granted) {
1713-
m_fsm->process_event(ReplicaGrantReservation{});
1714-
} else {
1715-
dout(20) << __func__ << ": failed to reserve remotely" << dendl;
1716-
}
1717-
} else {
1718-
dout(10) << __func__ << ": recovery is active; not granting" << dendl;
1719-
}
1720-
1721-
dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
1722-
1723-
Message* reply = new MOSDScrubReserve(
1724-
spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
1725-
request_ep,
1726-
granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
1727-
m_pg_whoami);
1728-
1729-
m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
1730-
}
1731-
1732-
void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
1733-
{
1734-
dout(10) << __func__ << " " << *op->get_req() << dendl;
1735-
if (should_drop_message(op)) {
1736-
// we might have turned into a Primary in the meantime. The interval
1737-
// change should have been noticed already, and caused us to reset.
1738-
return;
1739-
}
1740-
1741-
// this specific scrub session has terminated. All incoming events carrying
1742-
// the old tag will be discarded.
1743-
m_fsm->process_event(FullReset{});
1744-
}
1745-
17461686
bool PgScrubber::set_reserving_now() {
17471687
return m_osds->get_scrub_services().set_reserving_now(m_pg_id,
17481688
ceph_clock_now());
@@ -2211,6 +2151,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
22112151

22122152
PgScrubber::~PgScrubber()
22132153
{
2154+
m_fsm->process_event(IntervalChanged{});
22142155
if (m_scrub_job) {
22152156
// make sure the OSD won't try to scrub this one just now
22162157
rm_from_osd_scrubbing();

0 commit comments

Comments (0)