Skip to content

Commit 03a2f8c

Browse files
committed
osd/scrub: discard ReplicaIdle sub-states
The 'reservation requested' & 'reservation was granted' states are now maintained directly in the ReplicaActive state. 'StartReplica' (or 'do process a chunk') - the last event handled by the sub-states - is now handled by ReplicaIdle. Signed-off-by: Ronen Friedman <[email protected]>
1 parent 765d475 commit 03a2f8c

File tree

2 files changed

+45
-157
lines changed

2 files changed

+45
-157
lines changed

src/osd/scrubber/scrub_machine.cc

Lines changed: 30 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,12 @@ ReplicaActive::~ReplicaActive()
767767
clear_remote_reservation(false);
768768
}
769769

770+
void ReplicaActive::exit()
771+
{
772+
dout(20) << "ReplicaActive::exit()" << dendl;
773+
}
774+
775+
770776
/*
771777
* Note: we are expected to be in the initial internal state (Idle) when
772778
* receiving any registration request. ReplicaActiveOp, our other internal
@@ -943,84 +949,32 @@ ReplicaIdle::ReplicaIdle(my_context ctx)
943949
dout(10) << "-- state -->> ReplicaActive/ReplicaIdle" << dendl;
944950
}
945951

946-
void ReplicaIdle::reset_ignored(const FullReset&)
947-
{
948-
dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored"
949-
<< dendl;
950-
}
951-
952-
953-
// ---------------- ReplicaIdle/ReplicaUnreserved ---------------------------
954952

955-
ReplicaUnreserved::ReplicaUnreserved(my_context ctx)
956-
: my_base(ctx)
957-
, NamedSimply(
958-
context<ScrubMachine>().m_scrbr,
959-
"ReplicaActive/ReplicaIdle/ReplicaUnreserved")
960-
{
961-
dout(10) << "-- state -->> ReplicaActive/ReplicaIdle/ReplicaUnreserved"
962-
<< dendl;
963-
}
964-
965-
966-
sc::result ReplicaUnreserved::react(const StartReplica& ev)
953+
sc::result ReplicaIdle::react(const StartReplica& ev)
967954
{
968955
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
969-
dout(10) << "ReplicaUnreserved::react(const StartReplica&)" << dendl;
970-
post_event(ReplicaPushesUpd{});
971-
return transit<ReplicaActiveOp>();
972-
}
973-
974-
975-
// ---------------- ReplicaIdle/ReplicaWaitingReservation ---------------------------
976-
977-
ReplicaWaitingReservation::ReplicaWaitingReservation(my_context ctx)
978-
: my_base(ctx)
979-
, NamedSimply(
980-
context<ScrubMachine>().m_scrbr,
981-
"ReplicaActive/ReplicaIdle/ReplicaWaitingReservation")
982-
{
983-
dout(10)
984-
<< "-- state -->> ReplicaActive/ReplicaIdle/ReplicaWaitingReservation"
985-
<< dendl;
986-
}
987-
988-
sc::result ReplicaWaitingReservation::react(const StartReplica& ev)
989-
{
990-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
991-
dout(10) << "ReplicaWaitingReservation::react(const StartReplica&)" << dendl;
992-
993-
// this shouldn't happen. We will handle it, but will also log an error.
994-
scrbr->get_clog()->error() << fmt::format(
995-
"osd.{} pg[{}]: new chunk request while still waiting for "
996-
"reservation",
997-
scrbr->get_whoami(), scrbr->get_spgid());
998-
context<ReplicaActive>().clear_remote_reservation(true);
956+
dout(10) << "ReplicaIdle::react(const StartReplica&)" << dendl;
957+
958+
// if we are waiting for a reservation grant from the reserver (an
959+
// illegal scenario!), that reservation must be cleared.
960+
if (context<ReplicaActive>().pending_reservation_nonce) {
961+
scrbr->get_clog()->warn() << fmt::format(
962+
"osd.{} pg[{}]: new chunk request while still waiting for "
963+
"reservation",
964+
scrbr->get_whoami(), scrbr->get_spgid());
965+
context<ReplicaActive>().clear_remote_reservation(true);
966+
}
999967
post_event(ReplicaPushesUpd{});
1000968
return transit<ReplicaActiveOp>();
1001969
}
1002970

1003971

1004-
// ---------------- ReplicaIdle/ReplicaReserved ---------------------------
1005-
1006-
ReplicaReserved::ReplicaReserved(my_context ctx)
1007-
: my_base(ctx)
1008-
, NamedSimply(
1009-
context<ScrubMachine>().m_scrbr,
1010-
"ReplicaActive/ReplicaIdle/ReplicaReserved")
972+
void ReplicaIdle::reset_ignored(const FullReset&)
1011973
{
1012-
dout(10) << "-- state -->> ReplicaActive/ReplicaIdle/ReplicaReserved"
974+
dout(10) << "ReplicaIdle::react(const FullReset&): FullReset ignored"
1013975
<< dendl;
1014976
}
1015977

1016-
sc::result ReplicaReserved::react(const StartReplica& ev)
1017-
{
1018-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1019-
dout(10) << "ReplicaReserved::react(const StartReplica&)" << dendl;
1020-
post_event(ReplicaPushesUpd{});
1021-
return transit<ReplicaActiveOp>();
1022-
}
1023-
1024978

1025979
// ------------- ReplicaActive/ReplicaActiveOp --------------------------
1026980

@@ -1134,20 +1088,22 @@ sc::result ReplicaBuildingMap::react(const SchedReplica&)
11341088
dout(10) << "replica scrub job preempted" << dendl;
11351089

11361090
scrbr->send_preempted_replica();
1137-
return transit<ReplicaReserved>();
1091+
return transit<ReplicaIdle>();
11381092
}
11391093

11401094
// start or check progress of build_replica_map_chunk()
1141-
auto ret_init = scrbr->build_replica_map_chunk();
1142-
if (ret_init != -EINPROGRESS) {
1143-
dout(10) << "ReplicaBuildingMap::react(const SchedReplica&): back to idle"
1144-
<< dendl;
1145-
return transit<ReplicaReserved>();
1095+
if (scrbr->build_replica_map_chunk() == -EINPROGRESS) {
1096+
// Must ask the backend for the next stride shortly.
1097+
// build_replica_map_chunk() has already requeued us.
1098+
dout(20) << "waiting for the backend..." << dendl;
1099+
return discard_event();
11461100
}
11471101

1148-
dout(20) << "ReplicaBuildingMap::react(const SchedReplica&): discarded"
1102+
// Note: build_replica_map_chunk() aborts the OSD on any backend retval
1103+
// which is not -EINPROGRESS or 0 ('done').
1104+
dout(10) << "ReplicaBuildingMap::react(const SchedReplica&): chunk done"
11491105
<< dendl;
1150-
return discard_event();
1106+
return transit<ReplicaIdle>();
11511107
}
11521108

11531109
} // namespace Scrub

src/osd/scrubber/scrub_machine.h

Lines changed: 15 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -268,12 +268,8 @@ struct Session; ///< either reserving or actively scrubbing
268268
// the Replica states:
269269
struct ReplicaActive; ///< base state for when peered as a replica
270270

271-
/// Inactive replica state. Handles reservation requests
271+
/// Inactive replica state
272272
struct ReplicaIdle;
273-
// its sub-states:
274-
struct ReplicaUnreserved; ///< not reserved by a primary
275-
struct ReplicaWaitingReservation; ///< a reservation request was received from
276-
struct ReplicaReserved; ///< we are reserved by our primary
277273

278274
// and when handling a single chunk scrub request op:
279275
struct ReplicaActiveOp;
@@ -815,6 +811,7 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
815811
NamedSimply {
816812
explicit ReplicaActive(my_context ctx);
817813
~ReplicaActive();
814+
void exit();
818815

819816
/**
820817
* cancel a granted or pending reservation
@@ -847,6 +844,12 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
847844
*/
848845
sc::result react(const ReserverGranted&);
849846

847+
/**
848+
* a reservation request with this nonce is queued at the scrub_reserver,
849+
* and was not yet granted.
850+
*/
851+
MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0};
852+
850853
private:
851854
PG* m_pg;
852855
OSDService* m_osds;
@@ -878,12 +881,6 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
878881

879882
reservation_status_t m_reservation_status{reservation_status_t::unreserved};
880883

881-
/**
882-
* a reservation request with this nonce is queued at the scrub_reserver,
883-
* and was not yet granted.
884-
*/
885-
MOSDScrubReserve::reservation_nonce_t pending_reservation_nonce{0};
886-
887884
// clang-format off
888885
struct RtReservationCB : public Context {
889886
PGRef pg;
@@ -904,83 +901,18 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
904901
};
905902

906903

907-
struct ReplicaIdle : sc::state<
908-
ReplicaIdle,
909-
ReplicaActive,
910-
ReplicaUnreserved>,
911-
NamedSimply {
904+
struct ReplicaIdle : sc::state<ReplicaIdle, ReplicaActive>, NamedSimply {
912905
explicit ReplicaIdle(my_context ctx);
913906
~ReplicaIdle() = default;
914907
void reset_ignored(const FullReset&);
915-
using reactions = mpl::list<sc::in_state_reaction<
916-
FullReset,
917-
ReplicaIdle,
918-
&ReplicaIdle::reset_ignored>>;
919-
};
920-
921-
/*
922-
* ReplicaUnreserved
923-
*
924-
* Possible events:
925-
* - a reservation request from a legacy primary (i.e. a primary that does not
926-
* support queued reservations). We either deny or grant, transitioning to
927-
* ReplicaReserved directly.
928-
* - a reservation request from a primary that supports queued reservations.
929-
* We transition to ReplicaWaitingReservation, and wait for the Reserver's
930-
* response.
931-
* - (handled by our parent state) a chunk scrub request. We transition to
932-
* ReplicaActiveOp.
933-
*/
934-
struct ReplicaUnreserved : sc::state<ReplicaUnreserved, ReplicaIdle>,
935-
NamedSimply {
936-
explicit ReplicaUnreserved(my_context ctx);
937-
938-
using reactions = mpl::list<
939-
sc::custom_reaction<StartReplica>>;
940-
941-
sc::result react(const StartReplica& ev);
942-
};
943-
944-
/**
945-
* ReplicaWaitingReservation
946-
*
947-
* Possible events:
948-
* - 'go ahead' from the async reserver. We send a GRANT message to the
949-
* primary & transition to ReplicaReserved.
950-
* - 'cancel' from the primary. We clear our reservation state, and transition
951-
* back to ReplicaUnreserved.
952-
* - a chunk request: shouldn't happen, but we handle it anyway. An error
953-
* is logged (to trigger test failures).
954-
* - on interval change: handled by our parent state.
955-
*/
956-
struct ReplicaWaitingReservation
957-
: sc::state<ReplicaWaitingReservation, ReplicaIdle>,
958-
NamedSimply {
959-
explicit ReplicaWaitingReservation(my_context ctx);
960-
961908
using reactions = mpl::list<
962-
sc::custom_reaction<StartReplica>>;
963-
964-
sc::result react(const StartReplica& ev);
965-
};
966-
967-
/**
968-
* ReplicaReserved
969-
*
970-
* Possible events:
971-
* - 'cancel' from the primary. We clear our reservation state, and transition
972-
* back to ReplicaUnreserved.
973-
* - a chunk scrub request. We transition to ReplicaActiveOp.
974-
* - on interval change: we clear our reservation state, and transition
975-
* back to ReplicaUnreserved.
976-
*/
977-
struct ReplicaReserved : sc::state<ReplicaReserved, ReplicaIdle>, NamedSimply {
978-
explicit ReplicaReserved(my_context ctx);
979-
980-
using reactions = mpl::list<
981-
sc::custom_reaction<StartReplica>>;
909+
sc::custom_reaction<StartReplica>,
910+
sc::in_state_reaction<
911+
FullReset,
912+
ReplicaIdle,
913+
&ReplicaIdle::reset_ignored>>;
982914

983-
sc::result react(const StartReplica& eq);
915+
sc::result react(const StartReplica& ev);
984916
};
985917

986918

0 commit comments

Comments
 (0)