Skip to content

Commit 5ad1aaf

Browse files
committed
osd: EC optimizations: changes to rollback to support partial writes
EC pools create an ObjectModDesc entry in each log entry that describes how to undo the modification. During peering, if it is determined that only some of the shards completed an update, then these shards are instructed to roll back the change. This ensures that each update is either applied to all or none of the shards. For EC optimized pools, rollback becomes a bit more complicated. Firstly, because not all shards may have been updated, the rollback needs to be more selective in what is undone. Secondly, optimized pools do not pad objects to be a multiple of the stripe width, which means shards can be different sizes. There is a single ObjectModDesc entry that contains a set of operations applied by every shard; individual operations need to include enough information to work out what has to be undone on each shard. Signed-off-by: Bill Scales <[email protected]>
1 parent edbce5f commit 5ad1aaf

File tree

7 files changed

+227
-54
lines changed

7 files changed

+227
-54
lines changed

src/osd/ECBackend.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,11 @@ class ECBackend : public ECCommon {
388388
int get_ec_stripe_chunk_size() const {
389389
return sinfo.get_chunk_size();
390390
}
391-
uint64_t object_size_to_shard_size(const uint64_t size, int shard) const {
391+
uint64_t object_size_to_shard_size(const uint64_t size,
392+
shard_id_t shard) const {
393+
if (size == std::numeric_limits<uint64_t>::max()) {
394+
return size;
395+
}
392396
return sinfo.logical_to_next_chunk_offset(size);
393397
}
394398
/**

src/osd/ECBackendL.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,9 @@ END_IGNORE_DEPRECATED
385385
return sinfo.get_chunk_size();
386386
}
387387
uint64_t object_size_to_shard_size(const uint64_t size) const {
388+
if (size == std::numeric_limits<uint64_t>::max()) {
389+
return size;
390+
}
388391
return sinfo.logical_to_next_chunk_offset(size);
389392
}
390393
/**

src/osd/ECSwitch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ class ECSwitch : public PGBackend
347347
}
348348

349349
uint64_t
350-
object_size_to_shard_size(const uint64_t size, int shard) const override
350+
object_size_to_shard_size(const uint64_t size, shard_id_t shard) const override
351351
{
352352
if (is_optimized()) {
353353
return optimized.object_size_to_shard_size(size, shard);

src/osd/PGBackend.cc

Lines changed: 145 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -204,23 +204,56 @@ void PGBackend::rollback(
204204
struct RollbackVisitor : public ObjectModDesc::Visitor {
205205
const hobject_t &hoid;
206206
PGBackend *pg;
207+
const pg_log_entry_t &entry;
207208
ObjectStore::Transaction t;
208209
RollbackVisitor(
209210
const hobject_t &hoid,
210-
PGBackend *pg) : hoid(hoid), pg(pg) {}
211+
PGBackend *pg,
212+
const pg_log_entry_t &entry) : hoid(hoid), pg(pg), entry(entry) {}
211213
void append(uint64_t old_size) override {
212214
ObjectStore::Transaction temp;
213-
int s = static_cast<int>(pg->get_parent()->whoami_shard().shard);
214-
const uint64_t shard_size = pg->object_size_to_shard_size(old_size, s);
215+
auto dpp = pg->get_parent()->get_dpp();
216+
const uint64_t shard_size = pg->object_size_to_shard_size(old_size,
217+
pg->get_parent()->whoami_shard().shard);
218+
ldpp_dout(dpp, 20) << " entry " << entry.version
219+
<< " rollback append object_size " << old_size
220+
<< " shard_size " << shard_size << dendl;
215221
pg->rollback_append(hoid, shard_size, &temp);
216222
temp.append(t);
217223
temp.swap(t);
218224
}
219225
void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
220-
ObjectStore::Transaction temp;
221-
pg->rollback_setattrs(hoid, attrs, &temp);
222-
temp.append(t);
223-
temp.swap(t);
226+
auto dpp = pg->get_parent()->get_dpp();
227+
const pg_pool_t &pool = pg->get_parent()->get_pool();
228+
if (pool.is_nonprimary_shard(pg->get_parent()->whoami_shard().shard)) {
229+
if (entry.is_written_shard(pg->get_parent()->whoami_shard().shard)) {
230+
// Written shard - only rollback OI attr
231+
ldpp_dout(dpp, 20) << " entry " << entry.version
232+
<< " written shard OI attr rollback "
233+
<< pg->get_parent()->whoami_shard().shard
234+
<< dendl;
235+
ObjectStore::Transaction temp;
236+
pg->rollback_setattrs(hoid, attrs, &temp, true);
237+
temp.append(t);
238+
temp.swap(t);
239+
} else {
240+
// Unwritten shard - nothing to rollback
241+
ldpp_dout(dpp, 20) << " entry " << entry.version
242+
<< " unwritten shard skipping attr rollback "
243+
<< pg->get_parent()->whoami_shard().shard
244+
<< dendl;
245+
}
246+
} else {
247+
// Primary shard - rollback all attrs
248+
ldpp_dout(dpp, 20) << " entry " << entry.version
249+
<< " primary_shard attr rollback "
250+
<< pg->get_parent()->whoami_shard().shard
251+
<< dendl;
252+
ObjectStore::Transaction temp;
253+
pg->rollback_setattrs(hoid, attrs, &temp, false);
254+
temp.append(t);
255+
temp.swap(t);
256+
}
224257
}
225258
void rmobject(version_t old_version) override {
226259
ObjectStore::Transaction temp;
@@ -247,17 +280,58 @@ void PGBackend::rollback(
247280
temp.swap(t);
248281
}
249282
void rollback_extents(
250-
version_t gen,
251-
const vector<pair<uint64_t, uint64_t> > &extents) override {
283+
const version_t gen,
284+
const std::vector<std::pair<uint64_t, uint64_t>> &extents,
285+
const uint64_t object_size,
286+
const std::vector<shard_id_set> &shards) override {
252287
ObjectStore::Transaction temp;
253-
pg->rollback_extents(gen, extents, hoid, &temp);
254-
temp.append(t);
255-
temp.swap(t);
288+
const pg_pool_t& pool = pg->get_parent()->get_pool();
289+
ceph_assert(entry.written_shards.empty() ||
290+
pool.allows_ecoptimizations());
291+
auto dpp = pg->get_parent()->get_dpp();
292+
bool donework = false;
293+
ceph_assert(shards.empty() || shards.size() == extents.size());
294+
for (unsigned int i = 0; i < extents.size(); i++) {
295+
if (shards.empty() ||
296+
shards[i].empty() ||
297+
shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
298+
// Written shard - rollback extents
299+
const uint64_t shard_size = pg->object_size_to_shard_size(
300+
object_size,
301+
pg->get_parent()->whoami_shard().shard);
302+
ldpp_dout(dpp, 20) << " entry " << entry.version
303+
<< " written shard rollback_extents "
304+
<< entry.written_shards
305+
<< " shards "
306+
<< (shards.empty() ? shard_id_set() : shards[i])
307+
<< " " << pg->get_parent()->whoami_shard().shard
308+
<< " " << object_size
309+
<< " " << shard_size
310+
<< dendl;
311+
pg->rollback_extents(gen, extents[i].first, extents[i].second,
312+
hoid, shard_size, &temp);
313+
donework = true;
314+
} else {
315+
// Unwritten shard - nothing to rollback
316+
ldpp_dout(dpp, 20) << " entry " << entry.version
317+
<< " unwritten shard skipping rollback_extents "
318+
<< entry.written_shards
319+
<< " " << pg->get_parent()->whoami_shard().shard
320+
<< dendl;
321+
}
322+
}
323+
if (donework) {
324+
t.remove(
325+
pg->coll,
326+
ghobject_t(hoid, gen, pg->get_parent()->whoami_shard().shard));
327+
temp.append(t);
328+
temp.swap(t);
329+
}
256330
}
257331
};
258332

259333
ceph_assert(entry.mod_desc.can_rollback());
260-
RollbackVisitor vis(entry.soid, this);
334+
RollbackVisitor vis(entry.soid, this, entry);
261335
entry.mod_desc.visit(&vis);
262336
t->append(vis.t);
263337
}
@@ -279,12 +353,28 @@ struct Trimmer : public ObjectModDesc::Visitor {
279353
}
280354
// try_rmobject defaults to rmobject
281355
void rollback_extents(
282-
version_t gen,
283-
const vector<pair<uint64_t, uint64_t> > &extents) override {
284-
pg->trim_rollback_object(
285-
soid,
286-
gen,
287-
t);
356+
const version_t gen,
357+
const std::vector<std::pair<uint64_t, uint64_t>> &extents,
358+
const uint64_t object_size,
359+
const std::vector<shard_id_set> &shards) override {
360+
auto dpp = pg->get_parent()->get_dpp();
361+
ceph_assert(shards.empty() || shards.size() == extents.size());
362+
for (unsigned int i = 0; i < extents.size(); i++) {
363+
if (shards.empty() ||
364+
shards[i].empty() ||
365+
shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
366+
ldpp_dout(dpp, 30) << __func__ << " trim " << shards << " "
367+
<< pg->get_parent()->whoami_shard().shard << dendl;
368+
pg->trim_rollback_object(
369+
soid,
370+
gen,
371+
t);
372+
break;
373+
} else {
374+
ldpp_dout(dpp, 20) << __func__ << " skipping trim " << shards << " "
375+
<< pg->get_parent()->whoami_shard().shard << dendl;
376+
}
377+
}
288378
}
289379
};
290380

@@ -481,25 +571,34 @@ int PGBackend::objects_get_attrs(
481571
void PGBackend::rollback_setattrs(
482572
const hobject_t &hoid,
483573
map<string, std::optional<bufferlist> > &old_attrs,
484-
ObjectStore::Transaction *t) {
574+
ObjectStore::Transaction *t,
575+
bool only_oi) {
485576
map<string, bufferlist, less<>> to_set;
486577
ceph_assert(!hoid.is_temp());
487578
for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
488579
i != old_attrs.end();
489580
++i) {
490581
if (i->second) {
491582
to_set[i->first] = *(i->second);
492-
} else {
583+
} else if (!only_oi) {
493584
t->rmattr(
494585
coll,
495586
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
496587
i->first);
497588
}
498589
}
499-
t->setattrs(
500-
coll,
501-
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
502-
to_set);
590+
if (only_oi) {
591+
t->setattr(
592+
coll,
593+
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
594+
OI_ATTR,
595+
to_set[OI_ATTR]);
596+
} else {
597+
t->setattrs(
598+
coll,
599+
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
600+
to_set);
601+
}
503602
}
504603

505604
void PGBackend::rollback_append(
@@ -533,6 +632,7 @@ void PGBackend::rollback_try_stash(
533632
version_t old_version,
534633
ObjectStore::Transaction *t) {
535634
ceph_assert(!hoid.is_temp());
635+
dout(20) << __func__ << " " << hoid << " " << old_version << dendl;
536636
t->remove(
537637
coll,
538638
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
@@ -544,29 +644,41 @@ void PGBackend::rollback_try_stash(
544644

545645
void PGBackend::rollback_extents(
546646
version_t gen,
547-
const vector<pair<uint64_t, uint64_t> > &extents,
647+
const uint64_t offset,
648+
uint64_t length,
548649
const hobject_t &hoid,
650+
const uint64_t shard_size,
549651
ObjectStore::Transaction *t) {
550652
auto shard = get_parent()->whoami_shard().shard;
551-
for (auto &&extent: extents) {
653+
if (offset >= shard_size) {
654+
// extent on this shard is beyond the end of the object - nothing to do
655+
dout(20) << __func__ << " " << hoid << " "
656+
<< offset << "~" << length << " is out of range "
657+
<< shard_size << dendl;
658+
} else {
659+
if (offset + length > shard_size) {
660+
dout(20) << __func__ << " " << length << " is being truncated" << dendl;
661+
// extent on this shard goes beyond end of the object - truncate length
662+
length = shard_size - offset;
663+
}
664+
dout(20) << __func__ << " " << hoid << " " << offset << "~" << length
665+
<< dendl;
552666
t->clone_range(
553667
coll,
554668
ghobject_t(hoid, gen, shard),
555669
ghobject_t(hoid, ghobject_t::NO_GEN, shard),
556-
extent.first,
557-
extent.second,
558-
extent.first);
670+
offset,
671+
length,
672+
offset);
559673
}
560-
t->remove(
561-
coll,
562-
ghobject_t(hoid, gen, shard));
563674
}
564675

565676
void PGBackend::trim_rollback_object(
566677
const hobject_t &hoid,
567678
version_t old_version,
568679
ObjectStore::Transaction *t) {
569680
ceph_assert(!hoid.is_temp());
681+
dout(20) << __func__ << " trim " << hoid << " " << old_version << dendl;
570682
t->remove(
571683
coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
572684
}

src/osd/PGBackend.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
426426
virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
427427
virtual unsigned int get_ec_data_chunk_count() const { return 0; };
428428
virtual int get_ec_stripe_chunk_size() const { return 0; };
429-
virtual uint64_t object_size_to_shard_size(const uint64_t size, int shard) const { return size; };
429+
virtual uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { return size; };
430430
virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
431431

432432
private:
@@ -504,7 +504,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
504504
void rollback_setattrs(
505505
const hobject_t &hoid,
506506
std::map<std::string, std::optional<ceph::buffer::list> > &old_attrs,
507-
ObjectStore::Transaction *t);
507+
ObjectStore::Transaction *t,
508+
bool only_oi);
508509

509510
/// Truncate object to rollback append
510511
void rollback_append(
@@ -534,8 +535,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
534535
/// Clone the extents back into place
535536
void rollback_extents(
536537
version_t gen,
537-
const std::vector<std::pair<uint64_t, uint64_t> > &extents,
538+
const uint64_t offset,
539+
uint64_t length,
538540
const hobject_t &hoid,
541+
const uint64_t shard_size,
539542
ObjectStore::Transaction *t);
540543
public:
541544

src/osd/osd_types.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4668,11 +4668,25 @@ void ObjectModDesc::visit(Visitor *visitor) const
46684668
break;
46694669
}
46704670
case ROLLBACK_EXTENTS: {
4671-
vector<pair<uint64_t, uint64_t> > extents;
4671+
vector<pair<uint64_t, uint64_t>> extents;
46724672
version_t gen;
4673+
uint64_t object_size;
4674+
vector<shard_id_set> shards;
46734675
decode(gen, bp);
46744676
decode(extents, bp);
4675-
visitor->rollback_extents(gen,extents);
4677+
if (struct_v < 3) {
4678+
// Object size is used by optimized EC pools that do not pad objects to a
4679+
// multiple of the strip size. Rollback clone operations for each shard
4680+
// need to be truncated to not exceed the object size. Legacy EC pools
4681+
// do not store the object_size, but because objects are padded do not
4682+
// need to truncate the clones. Setting object_size to max avoids
4683+
// truncation.
4684+
object_size = std::numeric_limits<uint64_t>::max();
4685+
} else {
4686+
decode(object_size, bp);
4687+
decode(shards, bp);
4688+
}
4689+
visitor->rollback_extents(gen, extents, object_size, shards);
46764690
break;
46774691
}
46784692
default:
@@ -4728,11 +4742,16 @@ struct DumpVisitor : public ObjectModDesc::Visitor {
47284742
f->close_section();
47294743
}
47304744
void rollback_extents(
4731-
version_t gen,
4732-
const vector<pair<uint64_t, uint64_t> > &extents) override {
4745+
const version_t gen,
4746+
const vector<pair<uint64_t, uint64_t>> &extents,
4747+
const uint64_t object_size,
4748+
const vector<shard_id_set> &shards) override {
47334749
f->open_object_section("op");
47344750
f->dump_string("code", "ROLLBACK_EXTENTS");
47354751
f->dump_unsigned("gen", gen);
4752+
f->dump_unsigned("object_size", object_size);
4753+
f->dump_stream("extents") << extents;
4754+
f->dump_stream("shards") << shards;
47364755
f->dump_stream("snaps") << extents;
47374756
f->close_section();
47384757
}

0 commit comments

Comments
 (0)