Skip to content

Commit ec381fa

Browse files
committed
osd/scrub: handle reservation requests directly by ReplicaActive
including handling the reserver grant response. Signed-off-by: Ronen Friedman <[email protected]>
1 parent d643bde commit ec381fa

File tree

2 files changed

+107
-181
lines changed

2 files changed

+107
-181
lines changed

src/osd/scrubber/scrub_machine.cc

Lines changed: 77 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -769,8 +769,8 @@ ReplicaActive::~ReplicaActive()
769769

770770
/*
771771
* Note: we are expected to be in the initial internal state (Idle) when
772-
* receiving any registration request. Our other internal states, the
773-
* active ones, have their own handler for this event, and will treat it
772+
* receiving any registration request. ReplicaActiveOp, our other internal
773+
* state, has its own handler for this event, and will treat it
774774
* as an abort request.
775775
*
776776
* Process:
@@ -787,75 +787,101 @@ ReplicaActive::~ReplicaActive()
787787
* implementation note: sc::result objects cannot be copied or moved. Thus,
788788
* we've resorted to returning a code indicating the next action.
789789
*/
790-
ReplicaReactCode ReplicaActive::on_reserve_request(
791-
const ReplicaReserveReq& ev,
792-
bool async_request)
790+
sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
793791
{
794792
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
795-
const auto m = ev.m_op->get_req<MOSDScrubReserve>();
793+
const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
794+
795+
// should we handle the request asynchronously, using the reserver?
796+
const auto async_disabled = scrbr->get_pg_cct()->_conf.get_val<bool>(
797+
"osd_scrub_disable_reservation_queuing");
798+
const bool async_request = !async_disabled && m.wait_for_resources;
796799
dout(10) << fmt::format(
797-
"ReplicaActive::on_reserve_req() request:{} async_request:{} "
798-
"reservation_nonce:{}",
799-
ev, async_request, m->reservation_nonce)
800+
"ReplicaActive::react(const ReplicaReserveReq&): async "
801+
"request?:{} disabled?:{} -> async? {}",
802+
m.wait_for_resources, async_disabled, async_request)
800803
<< dendl;
801804

802-
ceph_assert(!reservation_granted);
803-
ceph_assert(!pending_reservation_nonce);
804-
ReplicaReactCode next_action{ReplicaReactCode::discard};
805-
AsyncScrubResData request_details{
806-
pg_id, ev.m_from, ev.m_op->sent_epoch, m->reservation_nonce};
805+
if (m_reservation_status != reservation_status_t::unreserved) {
806+
// we are not expected to be in this state when a new request arrives.
807+
// Exit-then-reenter ReplicaActive, causing the existing reservation - be
808+
// it granted or pending - to be cleared. This maneuver also aborts
809+
// any on-going chunk request.
810+
dout(1) << fmt::format(
811+
"{}: unexpected request (granted:{} request:{})", __func__,
812+
reservation_granted, m)
813+
<< dendl;
814+
post_event(ev);
815+
return transit<ReplicaActive>();
816+
}
817+
807818
auto& reserver = m_osds->get_scrub_reserver();
808819

809820
if (async_request) {
810821
// the request is to be handled asynchronously
811-
dout(20) << fmt::format(
812-
"{}: async request: {} details:{}", __func__, ev,
813-
request_details)
822+
AsyncScrubResData request_details{
823+
pg_id, ev.m_from, ev.m_op->sent_epoch, m.reservation_nonce};
824+
dout(15) << fmt::format(
825+
"ReplicaActive::react(const ReplicaReserveReq& ev): async "
826+
"request: {} details:{}",
827+
ev, request_details)
814828
<< dendl;
815-
pending_reservation_nonce = m->reservation_nonce;
829+
830+
pending_reservation_nonce = m.reservation_nonce;
816831
const auto reservation_cb = new RtReservationCB(m_pg, request_details);
817-
reserver.request_reservation(pg_id, reservation_cb, 0, nullptr);
818-
next_action = ReplicaReactCode::goto_waiting_reservation;
832+
reserver.request_reservation(pg_id, reservation_cb, /*prio=*/0, nullptr);
833+
m_reservation_status = reservation_status_t::requested_or_granted;
819834

820835
} else {
821836
// an immediate yes/no is required
822837
Message* reply{nullptr};
823838
reservation_granted = reserver.request_reservation_or_fail(pg_id);
824839
if (reservation_granted) {
825840
dout(10) << fmt::format("{}: reserved? yes", __func__) << dendl;
841+
m_reservation_status = reservation_status_t::requested_or_granted;
826842
reply = new MOSDScrubReserve(
827843
spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch,
828-
MOSDScrubReserve::GRANT, m_pg->pg_whoami, m->reservation_nonce);
829-
next_action = ReplicaReactCode::goto_replica_reserved;
844+
MOSDScrubReserve::GRANT, m_pg->pg_whoami, m.reservation_nonce);
830845

831846
} else {
832847
dout(10) << fmt::format("{}: reserved? no", __func__) << dendl;
833848
reply = new MOSDScrubReserve(
834849
spg_t(pg_id.pgid, m_pg->get_primary().shard), ev.m_op->sent_epoch,
835-
MOSDScrubReserve::REJECT, m_pg->pg_whoami, m->reservation_nonce);
836-
// the event is discarded
837-
next_action = ReplicaReactCode::discard;
850+
MOSDScrubReserve::REJECT, m_pg->pg_whoami, m.reservation_nonce);
838851
}
839852

840853
m_osds->send_message_osd_cluster(
841854
reply, ev.m_op->get_req()->get_connection());
842855
}
843-
844-
return next_action;
856+
return discard_event();
845857
}
846858

847-
bool ReplicaActive::granted_by_reserver(const AsyncScrubResData& reservation)
859+
860+
sc::result ReplicaActive::react(const ReserverGranted& ev)
848861
{
849862
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
850-
dout(10) << fmt::format("{}: reservation granted: {}", __func__, reservation)
851-
<< dendl;
863+
const AsyncScrubResData& reservation = ev.value;
864+
dout(10)
865+
<< fmt::format(
866+
"ReplicaActive::react(const ReserverGranted&). Reservation:{}",
867+
reservation)
868+
<< dendl;
852869

853-
/// verify that the granted reservation is the one we were waiting for
870+
/**
871+
* discard (and log) unexpected 'reservation granted' messages
872+
* from the async reserver. 'Unexpected' here - either not carrying the
873+
* ID of our last request from the reserver, or arriving when there is
874+
* no 'open request' made to the reserver.
875+
* As canceled reservations may still be triggered, this is not
876+
* necessarily a bug.
877+
*/
854878
if (reservation.nonce != pending_reservation_nonce) {
855879
dout(5) << fmt::format(
856-
"{}: reservation_nonce mismatch: {} != {}", __func__, reservation.nonce,
857-
pending_reservation_nonce) << dendl;
858-
return false;
880+
"ReplicaActive::react(const ReserverGranted&): "
881+
"reservation_nonce mismatch: {} != {}",
882+
reservation.nonce, pending_reservation_nonce)
883+
<< dendl;
884+
return discard_event();
859885
}
860886

861887
reservation_granted = true;
@@ -867,9 +893,10 @@ bool ReplicaActive::granted_by_reserver(const AsyncScrubResData& reservation)
867893
MOSDScrubReserve::GRANT, m_pg->pg_whoami, pending_reservation_nonce);
868894
m_pg->send_cluster_message(
869895
m_pg->get_primary().osd, grant_msg, reservation.request_epoch, false);
870-
return true;
896+
return discard_event();
871897
}
872898

899+
873900
void ReplicaActive::on_release(const ReplicaRelease& ev)
874901
{
875902
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
@@ -899,12 +926,6 @@ void ReplicaActive::clear_remote_reservation(bool warn_if_no_reservation)
899926
}
900927
}
901928

902-
void ReplicaActive::ignore_unhandled_grant(const ReserverGranted&)
903-
{
904-
dout(10) << "ReplicaActive::react(const ReserverGranted&): ignored"
905-
<< dendl;
906-
}
907-
908929

909930
// ---------------- ReplicaActive/ReplicaIdle ---------------------------
910931

@@ -934,34 +955,6 @@ ReplicaUnreserved::ReplicaUnreserved(my_context ctx)
934955
<< dendl;
935956
}
936957

937-
sc::result ReplicaUnreserved::react(const ReplicaReserveReq& ev)
938-
{
939-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
940-
dout(10) << "ReplicaUnreserved::react(const ReplicaReserveReq&)" << dendl;
941-
942-
const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
943-
const auto async_disabled = scrbr->get_pg_cct()->_conf.get_val<bool>(
944-
"osd_scrub_disable_reservation_queuing");
945-
const bool async_request = !async_disabled && m.wait_for_resources;
946-
dout(15) << fmt::format(
947-
"ReplicaUnreserved::react(const ReplicaReserveReq&): "
948-
"request:{} disabled?:{} -> async? {}", m.wait_for_resources,
949-
async_disabled, async_request)
950-
<< dendl;
951-
952-
switch (context<ReplicaActive>().on_reserve_request(ev, async_request)) {
953-
case ReplicaReactCode::discard:
954-
return discard_event();
955-
case ReplicaReactCode::goto_waiting_reservation:
956-
return transit<ReplicaWaitingReservation>();
957-
case ReplicaReactCode::goto_replica_reserved:
958-
return transit<ReplicaReserved>();
959-
default:
960-
ceph_abort_msg("unexpected return value");
961-
}
962-
// can't happen, but some compilers complain:
963-
return transit<ReplicaReserved>();
964-
}
965958

966959
sc::result ReplicaUnreserved::react(const StartReplica& ev)
967960
{
@@ -983,20 +976,6 @@ sc::result ReplicaUnreserved::react(const ReplicaRelease&)
983976
return discard_event();
984977
}
985978

986-
sc::result ReplicaUnreserved::react(const ReserverGranted&)
987-
{
988-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
989-
dout(10) << "ReplicaUnreserved::react(const ReserverGranted&)" << dendl;
990-
// shouldn't happen. Might be a result of a cancelled reservation
991-
// that was still delivered.
992-
// must unreserve
993-
dout(5) << "ReplicaUnreserved::react(const ReserverGranted&): reservation "
994-
"granted while not being waited for"
995-
<< dendl;
996-
context<ReplicaActive>().clear_remote_reservation(false);
997-
return discard_event();
998-
}
999-
1000979

1001980
// ---------------- ReplicaIdle/ReplicaWaitingReservation ---------------------------
1002981

@@ -1011,20 +990,6 @@ ReplicaWaitingReservation::ReplicaWaitingReservation(my_context ctx)
1011990
<< dendl;
1012991
}
1013992

1014-
sc::result ReplicaWaitingReservation::react(const ReserverGranted& ev)
1015-
{
1016-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1017-
dout(10) << fmt::format(
1018-
"ReplicaWaitingReservation::react(const ReserverGranted&): "
1019-
"event:{}",
1020-
ev)
1021-
<< dendl;
1022-
if (context<ReplicaActive>().granted_by_reserver(ev.value)) {
1023-
return transit<ReplicaReserved>();
1024-
}
1025-
return discard_event();
1026-
}
1027-
1028993
sc::result ReplicaWaitingReservation::react(const ReplicaRelease& ev)
1029994
{
1030995
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
@@ -1050,21 +1015,6 @@ sc::result ReplicaWaitingReservation::react(const StartReplica& ev)
10501015
return transit<ReplicaActiveOp>();
10511016
}
10521017

1053-
sc::result ReplicaWaitingReservation::react(const ReplicaReserveReq& ev)
1054-
{
1055-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1056-
dout(10) << "ReplicaWaitingReservation::react(const ReplicaReserveReq&)"
1057-
<< dendl;
1058-
// this shouldn't happen. We will handle it, but will also log an error.
1059-
scrbr->get_clog()->error() << fmt::format(
1060-
"osd.{} pg[{}]: reservation requested while previous is pending",
1061-
scrbr->get_whoami(), scrbr->get_spgid());
1062-
// cancel the existing reservation, and re-request
1063-
context<ReplicaActive>().clear_remote_reservation(true);
1064-
post_event(ev);
1065-
return transit<ReplicaUnreserved>();
1066-
}
1067-
10681018

10691019
// ---------------- ReplicaIdle/ReplicaReserved ---------------------------
10701020

@@ -1086,22 +1036,6 @@ sc::result ReplicaReserved::react(const ReplicaRelease& ev)
10861036
return transit<ReplicaUnreserved>();
10871037
}
10881038

1089-
sc::result ReplicaReserved::react(const ReplicaReserveReq& ev)
1090-
{
1091-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1092-
dout(10) << "ReplicaReserved::react(const ReplicaReserveReq&)" << dendl;
1093-
scrbr->get_clog()->error() << fmt::format(
1094-
"osd.{} pg[{}]: reservation requested while still reserved",
1095-
scrbr->get_whoami(), scrbr->get_spgid());
1096-
// This is a bug. We should never receive a new request unless the
1097-
// previous one was cancelled - either by the primary, or on interval
1098-
// change.
1099-
// cancel the existing reservation, and re-request
1100-
context<ReplicaActive>().clear_remote_reservation(true);
1101-
post_event(ev);
1102-
return transit<ReplicaUnreserved>();
1103-
}
1104-
11051039
sc::result ReplicaReserved::react(const StartReplica& ev)
11061040
{
11071041
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
@@ -1146,6 +1080,21 @@ sc::result ReplicaActiveOp::react(const StartReplica&)
11461080
return transit<ReplicaActiveOp>();
11471081
}
11481082

1083+
1084+
sc::result ReplicaActiveOp::react(const ReplicaReserveReq& ev)
1085+
{
1086+
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1087+
dout(10) << "ReplicaActiveOp::react(const ReplicaReserveReq&)" << dendl;
1088+
scrbr->get_clog()->warn() << fmt::format(
1089+
"osd.{} pg[{}]: reservation requested while processing a chunk",
1090+
scrbr->get_whoami(), scrbr->get_spgid());
1091+
// exit-n-renter ReplicaActive (aborting the chunk & freeing the reservation
1092+
// in the process)
1093+
post_event(ev);
1094+
return transit<ReplicaActive>();
1095+
}
1096+
1097+
11491098
sc::result ReplicaActiveOp::react(const ReplicaRelease& ev)
11501099
{
11511100
dout(10) << "ReplicaActiveOp::react(const ReplicaRelease&)" << dendl;

0 commit comments

Comments
 (0)