Skip to content

Commit 005839b

Browse files
committed
osd/scrub: uniform handling of reservation requests
we now allow - on the replica side - reservation requests regardless of the ReplicaActive sub-state. I.e. - we will honor such requests even when handling a chunk request (in ReplicaActiveOp). Note that the current Primary code would never send such a request. But a future primary code might. Signed-off-by: Ronen Friedman <[email protected]>
1 parent 4a26aa1 commit 005839b

File tree

2 files changed

+52
-58
lines changed

2 files changed

+52
-58
lines changed

src/osd/scrubber/scrub_machine.cc

Lines changed: 41 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -774,26 +774,48 @@ void ReplicaActive::exit()
774774

775775

776776
/*
777-
* Note: we are expected to be in the initial internal state (Idle) when
778-
* receiving any registration request. ReplicaActiveOp, our other internal
779-
* state, has its own handler for this event, and will treat it
780-
* as an abort request.
781-
*
777+
* Note: we are expected to be in the ReplicaIdle sub-state: the current
778+
* scrub code on the primary side would never interlace chunk ops with
779+
* reservation requests. But 'badly timed' requests are not blocked
780+
* on the replica side: while requests arriving while in ReplicaActiveOp
781+
* are at this time probably a bug; but a future Primary scrub code
782+
* would possibly treat 'reservation' & 'scrubbing' as (almost)
783+
* totally orthogonal.
784+
*/
785+
sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
786+
{
787+
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
788+
dout(10) << "ReplicaActive::react(const ReplicaReserveReq&)" << dendl;
789+
790+
if (m_reservation_status != reservation_status_t::unreserved) {
791+
// we are not expected to be in this state when a new request arrives.
792+
// Clear the existing reservation - be it granted or pending.
793+
const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
794+
dout(1) << fmt::format(
795+
"ReplicaActive::react(const ReplicaReserveReq&): unexpected "
796+
"request. Discarding existing "
797+
"reservation (was granted?:{}). Incoming request: {}",
798+
reservation_granted, m)
799+
<< dendl;
800+
clear_remote_reservation(true);
801+
}
802+
803+
handle_reservation_request(ev);
804+
return discard_event();
805+
}
806+
807+
808+
/*
782809
* Process:
783-
* - if already reserved: clear existing reservation, then continue
784810
* - for async requests:
785-
* - enqueue the request with reserver;
786-
* - move to the ReplicaWaitingReservation state
811+
* - enqueue the request with reserver, noting the nonce;
787812
* - no reply is expected by the caller
788813
* - for legacy requests:
789-
* - ask the OSD for the "reservation resource"
790-
* - if granted: move to ReplicaReserved and notify the Primary.
791-
* - otherwise: just notify the requesting primary.
792-
*
793-
* implementation note: sc::result objects cannot be copied or moved. Thus,
794-
* we've resorted to returning a code indicating the next action.
814+
* - ask the OSD for the "reservation resource";
815+
* - send grant/reject to the requesting primary;
816+
* - update 'reservation_granted'
795817
*/
796-
sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
818+
void ReplicaActive::handle_reservation_request(const ReplicaReserveReq& ev)
797819
{
798820
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
799821
const auto& m = *(ev.m_op->get_req<MOSDScrubReserve>());
@@ -803,33 +825,19 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
803825
"osd_scrub_disable_reservation_queuing");
804826
const bool async_request = !async_disabled && m.wait_for_resources;
805827
dout(10) << fmt::format(
806-
"ReplicaActive::react(const ReplicaReserveReq&): async "
807-
"request?:{} disabled?:{} -> async? {}",
808-
m.wait_for_resources, async_disabled, async_request)
828+
"{}: Message:{}. async request?:{} disabled?:{} -> async? {}",
829+
__func__, m, m.wait_for_resources, async_disabled,
830+
async_request)
809831
<< dendl;
810832

811-
if (m_reservation_status != reservation_status_t::unreserved) {
812-
// we are not expected to be in this state when a new request arrives.
813-
// Exit-then-reenter ReplicaActive, causing the existing reservation - be
814-
// it granted or pending - to be cleared. This maneuver also aborts
815-
// any on-going chunk request.
816-
dout(1) << fmt::format(
817-
"{}: unexpected request (granted:{} request:{})", __func__,
818-
reservation_granted, m)
819-
<< dendl;
820-
post_event(ev);
821-
return transit<ReplicaActive>();
822-
}
823-
824833
auto& reserver = m_osds->get_scrub_reserver();
825834

826835
if (async_request) {
827836
// the request is to be handled asynchronously
828837
AsyncScrubResData request_details{
829838
pg_id, ev.m_from, ev.m_op->sent_epoch, m.reservation_nonce};
830839
dout(15) << fmt::format(
831-
"ReplicaActive::react(const ReplicaReserveReq& ev): async "
832-
"request: {} details:{}",
840+
"{}: async request: {} details:{}", __func__,
833841
ev, request_details)
834842
<< dendl;
835843

@@ -859,7 +867,6 @@ sc::result ReplicaActive::react(const ReplicaReserveReq& ev)
859867
m_osds->send_message_osd_cluster(
860868
reply, ev.m_op->get_req()->get_connection());
861869
}
862-
return discard_event();
863870
}
864871

865872

@@ -1012,20 +1019,6 @@ sc::result ReplicaActiveOp::react(const StartReplica&)
10121019
}
10131020

10141021

1015-
sc::result ReplicaActiveOp::react(const ReplicaReserveReq& ev)
1016-
{
1017-
DECLARE_LOCALS; // 'scrbr' & 'pg_id' aliases
1018-
dout(10) << "ReplicaActiveOp::react(const ReplicaReserveReq&)" << dendl;
1019-
scrbr->get_clog()->warn() << fmt::format(
1020-
"osd.{} pg[{}]: reservation requested while processing a chunk",
1021-
scrbr->get_whoami(), scrbr->get_spgid());
1022-
// exit-n-renter ReplicaActive (aborting the chunk & freeing the reservation
1023-
// in the process)
1024-
post_event(ev);
1025-
return transit<ReplicaActive>();
1026-
}
1027-
1028-
10291022
sc::result ReplicaActiveOp::react(const ReplicaRelease& ev)
10301023
{
10311024
dout(10) << "ReplicaActiveOp::react(const ReplicaRelease&)" << dendl;

src/osd/scrubber/scrub_machine.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -887,6 +887,16 @@ struct ReplicaActive : sc::state<ReplicaActive, ScrubMachine, ReplicaIdle>,
887887

888888
reservation_status_t m_reservation_status{reservation_status_t::unreserved};
889889

890+
/**
891+
* React to the reservation request.
892+
* Called after any existing pending/granted request was released.
893+
*
894+
* Async requests are sent to the reserver.
895+
* For old-style synchronous requests, the reserver is queried using
896+
* its 'immediate' interface, and the response is sent back to the primary.
897+
*/
898+
void handle_reservation_request(const ReplicaReserveReq& ev);
899+
890900
// clang-format off
891901
struct RtReservationCB : public Context {
892902
PGRef pg;
@@ -929,8 +939,7 @@ struct ReplicaActiveOp
929939

930940
using reactions = mpl::list<
931941
sc::custom_reaction<StartReplica>,
932-
sc::custom_reaction<ReplicaRelease>,
933-
sc::custom_reaction<ReplicaReserveReq>>;
942+
sc::custom_reaction<ReplicaRelease>>;
934943

935944
/**
936945
* Handling the unexpected (read - caused by a bug) case of receiving a
@@ -950,14 +959,6 @@ struct ReplicaActiveOp
950959
* releasing the reservation on the way.
951960
*/
952961
sc::result react(const ReplicaRelease&);
953-
954-
/**
955-
* handling unexpected 'reservation requests' while we are in the middle
956-
* of serving a chunk request.
957-
* This is a bug. We log it, abort the operation, and re-enter ReplicaActive
958-
* to handle the reservation request.
959-
*/
960-
sc::result react(const ReplicaReserveReq& ev);
961962
};
962963

963964
/*

0 commit comments

Comments
 (0)