Skip to content

Commit 8ec47bb

Browse files
authored
Merge pull request ceph#54482 from ronen-fr/wip-rf-repl-hp
osd/scrub: decouple being reserved from handling scrub requests Reviewed-by: Samuel Just <[email protected]>
2 parents 28f26d8 + 6f1e0e6 commit 8ec47bb

17 files changed

+457
-252
lines changed

qa/standalone/scrub/osd-scrub-dump.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,20 @@
1515
# GNU Library Public License for more details.
1616
#
1717

18+
19+
# 30.11.2023: the test is now disabled, as the reservation mechanism has been
20+
# thoroughly reworked and the test is no longer valid. The test is left here
21+
# as a basis for a new set of primary vs. replicas scrub activation tests.
22+
1823
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
1924

2025
MAX_SCRUBS=4
2126
SCRUB_SLEEP=3
2227
POOL_SIZE=3
2328

2429
function run() {
30+
echo "This test is disabled"
31+
return 0
2532
local dir=$1
2633
shift
2734
local CHUNK_MAX=5
@@ -123,7 +130,7 @@ function TEST_recover_unexpected() {
123130
for o in $(seq 0 $(expr $OSDS - 1))
124131
do
125132
CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
126-
scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
133+
scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .granted_reservations')
127134
if [ $scrubs -gt $MAX_SCRUBS ]; then
128135
echo "ERROR: More than $MAX_SCRUBS currently reserved"
129136
return 1

src/messages/MOSDScrubReserve.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class MOSDScrubReserve : public MOSDFastDispatchOp {
2424
public:
2525
spg_t pgid;
2626
epoch_t map_epoch;
27-
enum {
27+
enum ReserveMsgOp {
2828
REQUEST = 0,
2929
GRANT = 1,
3030
RELEASE = 2,

src/osd/PG.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,6 +1823,11 @@ void PG::on_activate(interval_set<snapid_t> snaps)
18231823
m_scrubber->on_pg_activate(m_planned_scrub);
18241824
}
18251825

1826+
void PG::on_replica_activate()
1827+
{
1828+
m_scrubber->on_replica_activate();
1829+
}
1830+
18261831
void PG::on_active_exit()
18271832
{
18281833
backfill_reserving = false;

src/osd/PG.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,8 @@ class PG : public DoutPrefixProvider,
624624

625625
void on_activate(interval_set<snapid_t> snaps) override;
626626

627+
void on_replica_activate() override;
628+
627629
void on_activate_committed() override;
628630

629631
void on_active_actmap() override;

src/osd/PeeringState.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2967,6 +2967,8 @@ void PeeringState::activate(
29672967

29682968
state_set(PG_STATE_ACTIVATING);
29692969
pl->on_activate(std::move(to_trim));
2970+
} else {
2971+
pl->on_replica_activate();
29702972
}
29712973
if (acting_set_writeable()) {
29722974
PGLog::LogEntryHandlerRef rollbacker{pl->get_log_handler(t)};

src/osd/PeeringState.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,7 @@ class PeeringState : public MissingLoc::MappingInfo {
389389
virtual void on_role_change() = 0;
390390
virtual void on_change(ObjectStore::Transaction &t) = 0;
391391
virtual void on_activate(interval_set<snapid_t> to_trim) = 0;
392+
virtual void on_replica_activate() {}
392393
virtual void on_activate_complete() = 0;
393394
virtual void on_new_interval() = 0;
394395
virtual Context *on_clean() = 0;

src/osd/scrubber/osd_scrub.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -441,14 +441,14 @@ void OsdScrub::dec_scrubs_local()
441441
m_resource_bookkeeper.dec_scrubs_local();
442442
}
443443

444-
bool OsdScrub::inc_scrubs_remote()
444+
bool OsdScrub::inc_scrubs_remote(pg_t pgid)
445445
{
446-
return m_resource_bookkeeper.inc_scrubs_remote();
446+
return m_resource_bookkeeper.inc_scrubs_remote(pgid);
447447
}
448448

449-
void OsdScrub::dec_scrubs_remote()
449+
void OsdScrub::dec_scrubs_remote(pg_t pgid)
450450
{
451-
m_resource_bookkeeper.dec_scrubs_remote();
451+
m_resource_bookkeeper.dec_scrubs_remote(pgid);
452452
}
453453

454454
void OsdScrub::mark_pg_scrub_blocked(spg_t blocked_pg)

src/osd/scrubber/osd_scrub.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,8 @@ class OsdScrub {
6767
// updating the resource counters
6868
bool inc_scrubs_local();
6969
void dec_scrubs_local();
70-
bool inc_scrubs_remote();
71-
void dec_scrubs_remote();
70+
bool inc_scrubs_remote(pg_t pgid);
71+
void dec_scrubs_remote(pg_t pgid);
7272

7373
// counting the number of PGs stuck while scrubbing, waiting for objects
7474
void mark_pg_scrub_blocked(spg_t blocked_pg);

src/osd/scrubber/pg_scrubber.cc

Lines changed: 32 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ ostream& operator<<(ostream& out, const requested_scrub_t& sf)
8585
return out;
8686
}
8787

88+
void PgScrubber::on_replica_activate()
89+
{
90+
dout(10) << __func__ << dendl;
91+
m_fsm->process_event(ReplicaActivate{});
92+
}
93+
94+
8895
/*
8996
* if the incoming message is from a previous interval, it must mean
9097
* PrimaryLogPG::on_change() was called when that interval ended. We can safely
@@ -197,7 +204,6 @@ bool PgScrubber::should_abort() const
197204
*
198205
* Some of the considerations above are also relevant to the replica-side
199206
* initiation
200-
* ('StartReplica' & 'StartReplicaNoWait').
201207
*/
202208

203209
void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
@@ -216,11 +222,6 @@ void PgScrubber::initiate_regular_scrub(epoch_t epoch_queued)
216222
}
217223
}
218224

219-
void PgScrubber::dec_scrubs_remote()
220-
{
221-
m_osds->get_scrub_services().dec_scrubs_remote();
222-
}
223-
224225
void PgScrubber::advance_token()
225226
{
226227
m_current_token++;
@@ -274,13 +275,7 @@ void PgScrubber::send_start_replica(epoch_t epoch_queued,
274275
}
275276

276277
if (check_interval(epoch_queued) && is_token_current(token)) {
277-
// save us some time by not waiting for updates if there are none
278-
// to wait for. Affects the transition from NotActive into either
279-
// ReplicaWaitUpdates or ActiveReplica.
280-
if (pending_active_pushes())
281-
m_fsm->process_event(StartReplica{});
282-
else
283-
m_fsm->process_event(StartReplicaNoWait{});
278+
m_fsm->process_event(StartReplica{});
284279
}
285280
dout(10) << "scrubber event --<< " << __func__ << dendl;
286281
}
@@ -452,6 +447,11 @@ unsigned int PgScrubber::scrub_requeue_priority(
452447
* Responsible for resetting any scrub state and releasing any resources.
453448
* Any inflight events will be ignored via check_interval/should_drop_message
454449
* or canceled.
450+
* Specifically:
451+
* - if Primary and in an active session - the IntervalChanged handler takes
452+
* care of discarding the remote reservations, and transitioning out of
453+
* Session. That resets both the scrubber and the FSM.
454+
* - if we are a reserved replica - we need to free ourselves;
455455
*/
456456
void PgScrubber::on_new_interval()
457457
{
@@ -461,13 +461,7 @@ void PgScrubber::on_new_interval()
461461
is_scrub_active(), is_queued_or_active())
462462
<< dendl;
463463

464-
// If in active session - the IntervalChanged handler takes care of
465-
// discarding the remote reservations, and transitioning out of Session.
466-
// That resets both the scrubber and the FSM.
467464
m_fsm->process_event(IntervalChanged{});
468-
469-
// The 'FullReset' is only relevant if we are not an active Primary
470-
m_fsm->process_event(FullReset{});
471465
rm_from_osd_scrubbing();
472466
}
473467

@@ -806,7 +800,7 @@ void PgScrubber::cancel_callback(scrubber_callback_cancel_token_t token)
806800
m_osds->sleep_timer.cancel_event(token);
807801
}
808802

809-
LogChannelRef &PgScrubber::get_clog() const
803+
LogChannelRef& PgScrubber::get_clog() const
810804
{
811805
return m_osds->clog;
812806
}
@@ -816,6 +810,11 @@ int PgScrubber::get_whoami() const
816810
return m_osds->whoami;
817811
}
818812

813+
[[nodiscard]] bool PgScrubber::is_high_priority() const
814+
{
815+
return m_flags.required;
816+
}
817+
819818
/*
820819
* The selected range is set directly into 'm_start' and 'm_end'
821820
* setting:
@@ -1139,13 +1138,7 @@ void PgScrubber::on_init()
11391138
m_pg->publish_stats_to_osd();
11401139
}
11411140

1142-
/*
1143-
* Note: as on_replica_init() is likely to be called twice (entering
1144-
* both ReplicaWaitUpdates & ActiveReplica), its operations should be
1145-
* idempotent.
1146-
* Now that it includes some state-changing operations, we need to check
1147-
* m_active against double-activation.
1148-
*/
1141+
11491142
void PgScrubber::on_replica_init()
11501143
{
11511144
dout(10) << __func__ << " called with 'active' "
@@ -1159,6 +1152,7 @@ void PgScrubber::on_replica_init()
11591152
}
11601153
}
11611154

1155+
11621156
int PgScrubber::build_primary_map_chunk()
11631157
{
11641158
epoch_t map_building_since = m_pg->get_osdmap_epoch();
@@ -1217,23 +1211,21 @@ int PgScrubber::build_replica_map_chunk()
12171211

12181212
// the local map has been created. Send it to the primary.
12191213
// Note: once the message reaches the Primary, it may ask us for another
1220-
// chunk - and we better be done with the current scrub. Thus - the
1221-
// preparation of the reply message is separate, and we clear the scrub
1222-
// state before actually sending it.
1214+
// chunk - and we better be done with the current scrub. The clearing of
1215+
// state must be complete before we relinquish the PG lock.
12231216

1224-
auto reply = prep_replica_map_msg(PreemptionNoted::no_preemption);
1225-
replica_handling_done();
1226-
dout(15) << __func__ << " chunk map sent " << dendl;
1227-
send_replica_map(reply);
1228-
} break;
1217+
send_replica_map(prep_replica_map_msg(PreemptionNoted::no_preemption));
1218+
dout(15) << fmt::format("{}: chunk map sent", __func__) << dendl;
1219+
}
1220+
break;
12291221

12301222
default:
12311223
// negative retval: build_scrub_map_chunk() signalled an error
12321224
// Pre-Pacific code ignored this option, treating it as a success.
12331225
// \todo Add an error flag in the returning message.
1226+
// \todo: must either abort, send a reply, or return some error message
12341227
dout(1) << "Error! Aborting. ActiveReplica::react(SchedReplica) Ret: "
12351228
<< ret << dendl;
1236-
replica_handling_done();
12371229
// only in debug mode for now:
12381230
assert(false && "backend error");
12391231
break;
@@ -1520,6 +1512,7 @@ void PgScrubber::replica_scrub_op(OpRequestRef op)
15201512
replica_scrubmap_pos.reset(); // needed? RRR
15211513

15221514
set_queued_or_active();
1515+
advance_token();
15231516
m_osds->queue_for_rep_scrub(m_pg,
15241517
m_replica_request_priority,
15251518
m_flags.priority,
@@ -1675,7 +1668,7 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16751668
auto m = op->get_req<MOSDScrubReserve>();
16761669
switch (m->type) {
16771670
case MOSDScrubReserve::REQUEST:
1678-
handle_scrub_reserve_request(op);
1671+
m_fsm->process_event(ReplicaReserveReq{op, m->from});
16791672
break;
16801673
case MOSDScrubReserve::GRANT:
16811674
m_fsm->process_event(ReplicaGrant{op, m->from});
@@ -1684,65 +1677,12 @@ void PgScrubber::handle_scrub_reserve_msgs(OpRequestRef op)
16841677
m_fsm->process_event(ReplicaReject{op, m->from});
16851678
break;
16861679
case MOSDScrubReserve::RELEASE:
1687-
handle_scrub_reserve_release(op);
1680+
m_fsm->process_event(ReplicaRelease{op, m->from});
16881681
break;
16891682
}
16901683
}
16911684

16921685

1693-
void PgScrubber::handle_scrub_reserve_request(OpRequestRef op)
1694-
{
1695-
auto request_ep = op->sent_epoch;
1696-
dout(20) << fmt::format("{}: request_ep:{} recovery:{}",
1697-
__func__,
1698-
request_ep,
1699-
m_osds->is_recovery_active())
1700-
<< dendl;
1701-
1702-
// The primary may unilaterally restart the scrub process without notifying
1703-
// replicas. Unconditionally clear any existing state prior to handling
1704-
// the new reservation.
1705-
m_fsm->process_event(FullReset{});
1706-
1707-
bool granted{false};
1708-
if (m_pg->cct->_conf->osd_scrub_during_recovery ||
1709-
!m_osds->is_recovery_active()) {
1710-
1711-
granted = m_osds->get_scrub_services().inc_scrubs_remote();
1712-
if (granted) {
1713-
m_fsm->process_event(ReplicaGrantReservation{});
1714-
} else {
1715-
dout(20) << __func__ << ": failed to reserve remotely" << dendl;
1716-
}
1717-
} else {
1718-
dout(10) << __func__ << ": recovery is active; not granting" << dendl;
1719-
}
1720-
1721-
dout(10) << __func__ << " reserved? " << (granted ? "yes" : "no") << dendl;
1722-
1723-
Message* reply = new MOSDScrubReserve(
1724-
spg_t(m_pg->info.pgid.pgid, m_pg->get_primary().shard),
1725-
request_ep,
1726-
granted ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
1727-
m_pg_whoami);
1728-
1729-
m_osds->send_message_osd_cluster(reply, op->get_req()->get_connection());
1730-
}
1731-
1732-
void PgScrubber::handle_scrub_reserve_release(OpRequestRef op)
1733-
{
1734-
dout(10) << __func__ << " " << *op->get_req() << dendl;
1735-
if (should_drop_message(op)) {
1736-
// we might have turned into a Primary in the meantime. The interval
1737-
// change should have been noticed already, and caused us to reset.
1738-
return;
1739-
}
1740-
1741-
// this specific scrub session has terminated. All incoming events carrying
1742-
// the old tag will be discarded.
1743-
m_fsm->process_event(FullReset{});
1744-
}
1745-
17461686
bool PgScrubber::set_reserving_now() {
17471687
return m_osds->get_scrub_services().set_reserving_now(m_pg_id,
17481688
ceph_clock_now());
@@ -2211,6 +2151,7 @@ void PgScrubber::handle_query_state(ceph::Formatter* f)
22112151

22122152
PgScrubber::~PgScrubber()
22132153
{
2154+
m_fsm->process_event(IntervalChanged{});
22142155
if (m_scrub_job) {
22152156
// make sure the OSD won't try to scrub this one just now
22162157
rm_from_osd_scrubbing();

0 commit comments

Comments (0)