Skip to content

Commit babb801

Browse files
committed
osd: EC Optimizations: Backfill changes for partial writes
Optimized EC pools support partial writes that do not update every shard. Consequently, shards that are not updated can have out-of-date version numbers. The primary shard's object_info_t is always updated and tracks the expected version of each shard. To avoid unnecessary backfill work, changes are required to use the extra data in the object_info_t when comparing version numbers, to work out whether a shard is missing updates or just didn't participate in recent partial writes. See comments in src/osd/recovery_types.h. Signed-off-by: Bill Scales <[email protected]>
1 parent edbce5f commit babb801

15 files changed

+564
-137
lines changed

src/crimson/osd/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ add_executable(crimson-osd
5555
${PROJECT_SOURCE_DIR}/src/osd/MissingLoc.cc
5656
${PROJECT_SOURCE_DIR}/src/osd/PGLog.cc
5757
${PROJECT_SOURCE_DIR}/src/osd/SnapMapper.cc
58-
${PROJECT_SOURCE_DIR}/src/osd/recovery_types.cc
5958
${PROJECT_SOURCE_DIR}/src/osd/osd_perf_counters.cc
6059
${PROJECT_SOURCE_DIR}/src/mgr/OSDPerfMetricTypes.cc
6160
watch.cc

src/crimson/osd/backfill_facades.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,11 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
6262
const std::vector<pg_shard_t> &peers) override {
6363
return peering_state.prepare_backfill_for_missing(soid, v, peers);
6464
}
65+
66+
const pg_pool_t& get_pool() const override {
67+
return peering_state.get_pgpool().info;
68+
}
69+
6570
PeeringFacade(PeeringState& peering_state)
6671
: peering_state(peering_state) {
6772
}

src/crimson/osd/backfill_state.cc

Lines changed: 76 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
8585
ceph_assert(backfill_state().last_backfill_started == \
8686
peering_state().earliest_backfill());
8787
ceph_assert(peering_state().is_backfilling());
88-
// initialize BackfillIntervals
88+
// initialize ReplicaBackfillIntervals
8989
for (const auto& bt : peering_state().get_backfill_targets()) {
9090
backfill_state().peer_backfill_info[bt].reset(
9191
peering_state().get_peer_last_backfill(bt));
@@ -134,13 +134,52 @@ void BackfillState::Enqueuing::maybe_update_range()
134134
if (e.is_update()) {
135135
DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
136136
pg(), e.soid, e.version);
137-
primary_bi.objects.erase(e.soid);
138-
primary_bi.objects.insert(std::make_pair(e.soid,
139-
e.version));
137+
if (e.written_shards.empty()) {
138+
// Log entry updates all shards, replace all entries for e.soid
139+
primary_bi.objects.erase(e.soid);
140+
primary_bi.objects.insert(
141+
std::make_pair(e.soid,
142+
std::make_pair(shard_id_t::NO_SHARD,
143+
e.version)));
144+
} else {
145+
// Update backfill interval for shards modified by log entry
146+
std::map<shard_id_t,eversion_t> versions;
147+
// Create map from existing entries in backfill entry
148+
const auto & [begin, end] = primary_bi.objects.equal_range(e.soid);
149+
for (const auto & entry : std::ranges::subrange(begin, end)) {
150+
const auto & [shard, version] = entry.second;
151+
versions[shard] = version;
152+
}
153+
// Update entries in map that are modified by log entry
154+
bool uses_default = false;
155+
for (const auto & shard : peering_state().get_backfill_targets()) {
156+
if (e.is_written_shard(shard.shard)) {
157+
versions.erase(shard.shard);
158+
uses_default = true;
159+
} else {
160+
if (!versions.contains(shard.shard)) {
161+
versions[shard.shard] = e.prior_version;
162+
}
163+
//Else: keep existing version
164+
}
165+
}
166+
if (uses_default) {
167+
versions[shard_id_t::NO_SHARD] = e.version;
168+
} else {
169+
versions.erase(shard_id_t::NO_SHARD);
170+
}
171+
// Erase and recreate backfill interval for e.soid using map
172+
primary_bi.objects.erase(e.soid);
173+
for (auto & [shard, version] : versions) {
174+
primary_bi.objects.insert(
175+
std::make_pair(e.soid,
176+
std::make_pair(shard, version)));
177+
}
178+
}
140179
} else if (e.is_delete()) {
141180
DEBUGDPP("maybe_update_range(lambda): {} removed",
142181
pg(), e.soid);
143-
primary_bi.objects.erase(e.soid);
182+
primary_bi.objects.erase(e.soid); // Erase all entries for e.soid
144183
}
145184
}
146185
};
@@ -168,8 +207,8 @@ void BackfillState::Enqueuing::trim_backfill_infos()
168207

169208
/* static */ bool BackfillState::Enqueuing::all_enqueued(
170209
const PeeringFacade& peering_state,
171-
const BackfillInterval& backfill_info,
172-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
210+
const PrimaryBackfillInterval& backfill_info,
211+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
173212
{
174213
const bool all_local_enqueued = \
175214
backfill_info.extends_to_end() && backfill_info.empty();
@@ -184,7 +223,8 @@ void BackfillState::Enqueuing::trim_backfill_infos()
184223
}
185224

186225
hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
187-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
226+
const std::map<pg_shard_t,
227+
ReplicaBackfillInterval>& peer_backfill_info) const
188228
{
189229
hobject_t e = hobject_t::get_max();
190230
for (const pg_shard_t& bt : peering_state().get_backfill_targets()) {
@@ -196,8 +236,8 @@ hobject_t BackfillState::Enqueuing::earliest_peer_backfill(
196236
}
197237

198238
bool BackfillState::Enqueuing::should_rescan_replicas(
199-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
200-
const BackfillInterval& backfill_info) const
239+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
240+
const PrimaryBackfillInterval& backfill_info) const
201241
{
202242
const auto& targets = peering_state().get_backfill_targets();
203243
return std::any_of(std::begin(targets), std::end(targets),
@@ -208,8 +248,8 @@ bool BackfillState::Enqueuing::should_rescan_replicas(
208248
}
209249

210250
bool BackfillState::Enqueuing::should_rescan_primary(
211-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
212-
const BackfillInterval& backfill_info) const
251+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
252+
const PrimaryBackfillInterval& backfill_info) const
213253
{
214254
return backfill_info.begin <= earliest_peer_backfill(peer_backfill_info) &&
215255
!backfill_info.extends_to_end() && backfill_info.empty();
@@ -218,7 +258,7 @@ bool BackfillState::Enqueuing::should_rescan_primary(
218258
void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
219259
BackfillState::Enqueuing::result_t&& result,
220260
hobject_t& last_backfill_started,
221-
std::map<pg_shard_t, BackfillInterval>& peer_backfill_info)
261+
std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info)
222262
{
223263
std::for_each(std::begin(result.pbi_targets), std::end(result.pbi_targets),
224264
[&peer_backfill_info] (const auto& bt) {
@@ -257,13 +297,28 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
257297
result_t result { {}, primary_bi.begin };
258298
std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
259299

300+
std::map<shard_id_t,eversion_t> versions;
301+
auto it = primary_bi.objects.begin();
302+
const hobject_t& hoid = it->first;
303+
eversion_t obj_v;
304+
while (it != primary_bi.objects.end() && it->first == hoid) {
305+
obj_v = std::max(obj_v, it->second.second);
306+
versions[it->second.first] = it->second.second;
307+
++it;
308+
}
309+
260310
for (const auto& bt : peering_state().get_backfill_targets()) {
261311
const auto& peer_bi = backfill_state().peer_backfill_info.at(bt);
262312

263313
// Find all check peers that have the wrong version
264-
if (const eversion_t& obj_v = primary_bi.objects.begin()->second;
265-
check == primary_bi.begin && check == peer_bi.begin) {
266-
if (peer_bi.objects.begin()->second != obj_v) {
314+
if (check == primary_bi.begin && check == peer_bi.begin) {
315+
eversion_t replicaobj_v;
316+
if (versions.contains(bt.shard)) {
317+
replicaobj_v = versions.at(bt.shard);
318+
} else {
319+
replicaobj_v = versions.at(shard_id_t::NO_SHARD);
320+
}
321+
if (peer_bi.objects.begin()->second != replicaobj_v) {
267322
std::ignore = backfill_state().progress_tracker->enqueue_push(
268323
primary_bi.begin);
269324
auto &[v, peers] = backfills[primary_bi.begin];
@@ -298,8 +353,9 @@ BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
298353
}
299354

300355
bool BackfillState::Enqueuing::Enqueuing::all_emptied(
301-
const BackfillInterval& local_backfill_info,
302-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const
356+
const PrimaryBackfillInterval& local_backfill_info,
357+
const std::map<pg_shard_t,
358+
ReplicaBackfillInterval>& peer_backfill_info) const
303359
{
304360
const auto& targets = peering_state().get_backfill_targets();
305361
const auto replicas_emptied =
@@ -459,8 +515,8 @@ BackfillState::PrimaryScanning::react(ObjectPushed evt)
459515

460516
// -- ReplicasScanning
461517
bool BackfillState::ReplicasScanning::replica_needs_scan(
462-
const BackfillInterval& replica_backfill_info,
463-
const BackfillInterval& local_backfill_info)
518+
const ReplicaBackfillInterval& replica_backfill_info,
519+
const PrimaryBackfillInterval& local_backfill_info)
464520
{
465521
return replica_backfill_info.empty() && \
466522
replica_backfill_info.begin <= local_backfill_info.begin && \

src/crimson/osd/backfill_state.h

Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "osd/recovery_types.h"
1717
#include "osd/PGLog.h"
18+
#include "osd/PeeringState.h"
1819

1920
namespace crimson::osd {
2021

@@ -27,16 +28,16 @@ struct BackfillState {
2728

2829
// events comes first
2930
struct PrimaryScanned : sc::event<PrimaryScanned> {
30-
BackfillInterval result;
31-
PrimaryScanned(BackfillInterval&& result)
31+
PrimaryBackfillInterval result;
32+
PrimaryScanned(PrimaryBackfillInterval&& result)
3233
: result(std::move(result)) {
3334
}
3435
};
3536

3637
struct ReplicaScanned : sc::event<ReplicaScanned> {
3738
pg_shard_t from;
38-
BackfillInterval result;
39-
ReplicaScanned(pg_shard_t from, BackfillInterval&& result)
39+
ReplicaBackfillInterval result;
40+
ReplicaScanned(pg_shard_t from, ReplicaBackfillInterval&& result)
4041
: from(std::move(from)),
4142
result(std::move(result)) {
4243
}
@@ -166,8 +167,8 @@ struct BackfillState {
166167
// completed yet.
167168
static bool all_enqueued(
168169
const PeeringFacade& peering_state,
169-
const BackfillInterval& backfill_info,
170-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
170+
const PrimaryBackfillInterval& backfill_info,
171+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
171172

172173
private:
173174
void maybe_update_range();
@@ -176,33 +177,35 @@ struct BackfillState {
176177
// these methods take BackfillIntervals instead of extracting them from
177178
// the state to emphasize the relationships across the main loop.
178179
bool all_emptied(
179-
const BackfillInterval& local_backfill_info,
180-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
180+
const PrimaryBackfillInterval& local_backfill_info,
181+
const std::map<pg_shard_t,
182+
ReplicaBackfillInterval>& peer_backfill_info) const;
181183
hobject_t earliest_peer_backfill(
182-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info) const;
184+
const std::map<pg_shard_t,
185+
ReplicaBackfillInterval>& peer_backfill_info) const;
183186
bool should_rescan_replicas(
184-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
185-
const BackfillInterval& backfill_info) const;
187+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
188+
const PrimaryBackfillInterval& backfill_info) const;
186189
// indicate whether a particular acting primary needs to scanned again
187190
// to process next piece of the hobject_t's namespace.
188191
// the logic is per analogy to replica_needs_scan(). See comments there.
189192
bool should_rescan_primary(
190-
const std::map<pg_shard_t, BackfillInterval>& peer_backfill_info,
191-
const BackfillInterval& backfill_info) const;
193+
const std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info,
194+
const PrimaryBackfillInterval& backfill_info) const;
192195

193196
// the result_t is intermediary between {remove,update}_on_peers() and
194-
// updating BackfillIntervals in trim_backfilled_object_from_intervals.
195-
// This step is important because it affects the main loop's condition,
196-
// and thus deserves to be exposed instead of being called deeply from
197-
// {remove,update}_on_peers().
197+
// updating ReplicaBackfillIntervals in
198+
// trim_backfilled_object_from_intervals. This step is important
199+
// because it affects the main loop's condition, and thus deserves to be
200+
// exposed instead of being called deeply from {remove,update}_on_peers().
198201
struct [[nodiscard]] result_t {
199202
std::set<pg_shard_t> pbi_targets;
200203
hobject_t new_last_backfill_started;
201204
};
202205
void trim_backfilled_object_from_intervals(
203206
result_t&&,
204207
hobject_t& last_backfill_started,
205-
std::map<pg_shard_t, BackfillInterval>& peer_backfill_info);
208+
std::map<pg_shard_t, ReplicaBackfillInterval>& peer_backfill_info);
206209
result_t remove_on_peers(const hobject_t& check);
207210
result_t update_on_peers(const hobject_t& check);
208211
};
@@ -242,12 +245,12 @@ struct BackfillState {
242245
sc::result react(Triggered);
243246

244247
// indicate whether a particular peer should be scanned to retrieve
245-
// BackfillInterval for new range of hobject_t namespace.
248+
// ReplicaBackfillInterval for new range of hobject_t namespace.
246249
// true when bi.objects is exhausted, replica bi's end is not MAX,
247250
// and primary bi'begin is further than the replica's one.
248251
static bool replica_needs_scan(
249-
const BackfillInterval& replica_backfill_info,
250-
const BackfillInterval& local_backfill_info);
252+
const ReplicaBackfillInterval& replica_backfill_info,
253+
const PrimaryBackfillInterval& local_backfill_info);
251254

252255
private:
253256
std::set<pg_shard_t> waiting_on_backfill;
@@ -339,8 +342,8 @@ struct BackfillState {
339342
backfill_suspend_state.should_go_enqueuing = true;
340343
}
341344
hobject_t last_backfill_started;
342-
BackfillInterval backfill_info;
343-
std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
345+
PrimaryBackfillInterval backfill_info;
346+
std::map<pg_shard_t, ReplicaBackfillInterval> peer_backfill_info;
344347
BackfillMachine backfill_machine;
345348
std::unique_ptr<ProgressTracker> progress_tracker;
346349
size_t replicas_in_backfill = 0;
@@ -408,6 +411,7 @@ struct BackfillState::PeeringFacade {
408411
const hobject_t &soid,
409412
const eversion_t &v,
410413
const std::vector<pg_shard_t> &peers) = 0;
414+
virtual const pg_pool_t& get_pool() const = 0;
411415
virtual ~PeeringFacade() {}
412416
};
413417

src/crimson/osd/pg_recovery.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -508,11 +508,12 @@ void PGRecovery::request_primary_scan(
508508
{
509509
logger().debug("{}", __func__);
510510
using crimson::common::local_conf;
511-
std::ignore = pg->get_recovery_backend()->scan_for_backfill(
511+
std::ignore = pg->get_recovery_backend()->scan_for_backfill_primary(
512512
begin,
513513
local_conf()->osd_backfill_scan_min,
514-
local_conf()->osd_backfill_scan_max
515-
).then_interruptible([this] (BackfillInterval bi) {
514+
local_conf()->osd_backfill_scan_max,
515+
pg->get_peering_state().get_backfill_targets()
516+
).then_interruptible([this] (PrimaryBackfillInterval bi) {
516517
logger().debug("request_primary_scan:{}", __func__);
517518
using BackfillState = crimson::osd::BackfillState;
518519
backfill_state->process_event(

0 commit comments

Comments
 (0)