osd/scrub: handling unexpected scrub requests

ronen-fr · ronen-fr · commit 9802b770f835 · 2023-11-30T05:40:27.000-06:00
if arriving while still handling a previous chunk request,
the handling of of the previous chunk request will be aborted.
No response is sent. The scrubber resets, then handles the new request.

Signed-off-by: Ronen Friedman &lt;rfriedma@redhat.com&gt;
diff --git a/src/osd/scrubber/pg_scrubber.h b/src/osd/scrubber/pg_scrubber.h
@@ -673,10 +673,18 @@ class PgScrubber : public ScrubPgIF,
   /**
    * (replica) a tag identifying a specific replica operation, i.e. the
    * creation of the replica scrub map for a single chunk.
-   * Incremented immediately before sending a response to the primary,
-   * so that the next request would be identified as such. Also changed
-   * on reservation release.
-   * Used to identify stale scrub-re-sched messages triggered by the backend.
+   *
+   * Background: the backend is asynchronous, and the specific
+   * operations are size-limited. While the scrubber handles a specific
+   * request, it is continuously triggered to poll the backend for the
+   * full results for the chunk handled.
+   * Once the chunk request becomes obsolete, either following an interval
+   * change or if a new request was received, we must not send the stale
+   * data to the primary. The polling of the obsolete chunk request must
+   * stop, and the stale backend response should be discarded.
+   * In other words - the token should be read as saying "the primary has
+   * lost interest in the results of all operations identified by mismatched
+   * token values".
    */
   Scrub::act_token_t m_current_token{1};
 
diff --git a/src/osd/scrubber/scrub_machine.cc b/src/osd/scrubber/scrub_machine.cc
@@ -797,6 +797,22 @@ ReplicaActiveOp::~ReplicaActiveOp()
   scrbr->replica_handling_done();
 }
 
+sc::result ReplicaActiveOp::react(const StartReplica&)
+{
+  DECLARE_LOCALS;  // 'scrbr' & 'pg_id' aliases
+  dout(10) << "ReplicaActiveOp::react(const StartReplica&)" << dendl;
+
+  const auto msg = fmt::format(
+      "osd.{} pg[{}]: new chunk request while still handling the previous one",
+      scrbr->get_whoami(), scrbr->get_spgid());
+  dout(1) << msg << dendl;
+  scrbr->get_clog()->warn() << msg;
+
+  post_event(ReplicaPushesUpd{});
+
+  // exit & re-enter the state
+  return transit<ReplicaActiveOp>();
+}
 
 // ------------- ReplicaActive/ReplicaWaitUpdates ------------------------
 
diff --git a/src/osd/scrubber/scrub_machine.h b/src/osd/scrubber/scrub_machine.h
@@ -704,6 +704,20 @@ struct ReplicaActiveOp
       NamedSimply {
   explicit ReplicaActiveOp(my_context ctx);
   ~ReplicaActiveOp();
+
+  using reactions = mpl::list<sc::custom_reaction<StartReplica>>;
+
+  /**
+   * Handling the unexpected (read - caused by a bug) case of receiving a
+   * new chunk request while still handling the previous one.
+   * To note:
+   * - the primary is evidently no longer waiting for the results of the
+   *   previous request. On the other hand
+   * - we must respond to the new request, as the primary would wait for
+   *   it "forever"`,
+   * - and we should log this unexpected scenario clearly in the cluster log.
+   */
+  sc::result react(const StartReplica&);
 };
 
 /*