File tree Expand file tree Collapse file tree 3 files changed +42
-4
lines changed
Expand file tree Collapse file tree 3 files changed +42
-4
lines changed Original file line number Diff line number Diff line change @@ -673,10 +673,18 @@ class PgScrubber : public ScrubPgIF,
673673 /* *
674674 * (replica) a tag identifying a specific replica operation, i.e. the
675675 * creation of the replica scrub map for a single chunk.
676- * Incremented immediately before sending a response to the primary,
677- * so that the next request would be identified as such. Also changed
678- * on reservation release.
679- * Used to identify stale scrub-re-sched messages triggered by the backend.
676+ *
677+ * Background: the backend is asynchronous, and the specific
678+ * operations are size-limited. While the scrubber handles a specific
679+ * request, it is continuously triggered to poll the backend for the
680+ * full results for the chunk handled.
681+ * Once the chunk request becomes obsolete, either following an interval
682+ * change or if a new request was received, we must not send the stale
683+ * data to the primary. The polling of the obsolete chunk request must
684+ * stop, and the stale backend response should be discarded.
685+ * In other words - the token should be read as saying "the primary has
686+ * lost interest in the results of all operations identified by mismatched
687+ * token values".
680688 */
681689 Scrub::act_token_t m_current_token{1 };
682690
Original file line number Diff line number Diff line change @@ -797,6 +797,22 @@ ReplicaActiveOp::~ReplicaActiveOp()
797797 scrbr->replica_handling_done ();
798798}
799799
800+ sc::result ReplicaActiveOp::react (const StartReplica&)
801+ {
802+ DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
803+ dout (10 ) << " ReplicaActiveOp::react(const StartReplica&)" << dendl;
804+
805+ const auto msg = fmt::format (
806+ " osd.{} pg[{}]: new chunk request while still handling the previous one" ,
807+ scrbr->get_whoami (), scrbr->get_spgid ());
808+ dout (1 ) << msg << dendl;
809+ scrbr->get_clog ()->warn () << msg;
810+
811+ post_event (ReplicaPushesUpd{});
812+
813+ // exit & re-enter the state
814+ return transit<ReplicaActiveOp>();
815+ }
800816
801817// ------------- ReplicaActive/ReplicaWaitUpdates ------------------------
802818
Original file line number Diff line number Diff line change @@ -704,6 +704,20 @@ struct ReplicaActiveOp
704704 NamedSimply {
705705 explicit ReplicaActiveOp (my_context ctx);
706706 ~ReplicaActiveOp ();
707+
708+ using reactions = mpl::list<sc::custom_reaction<StartReplica>>;
709+
710+ /* *
711+ * Handling the unexpected (read - caused by a bug) case of receiving a
712+ * new chunk request while still handling the previous one.
713+ * To note:
714+ * - the primary is evidently no longer waiting for the results of the
715+ * previous request. On the other hand
716+ * - we must respond to the new request, as the primary would wait for
717+ * it "forever",
718+ * - and we should log this unexpected scenario clearly in the cluster log.
719+ */
720+ sc::result react (const StartReplica&);
707721};
708722
709723/*
You can’t perform that action at this time.
0 commit comments