Skip to content

Commit ac8d74a

Browse files
authored
Merge pull request ceph#62497 from bill-scales/ec_rollback_changes
osd: EC optimizations: changes to rollback to support partial writes
2 parents 7530afa + 5ad1aaf commit ac8d74a

File tree

7 files changed

+227
-54
lines changed

7 files changed

+227
-54
lines changed

src/osd/ECBackend.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -388,7 +388,11 @@ class ECBackend : public ECCommon {
388388
int get_ec_stripe_chunk_size() const {
389389
return sinfo.get_chunk_size();
390390
}
391-
uint64_t object_size_to_shard_size(const uint64_t size, int shard) const {
391+
uint64_t object_size_to_shard_size(const uint64_t size,
392+
shard_id_t shard) const {
393+
if (size == std::numeric_limits<uint64_t>::max()) {
394+
return size;
395+
}
392396
return sinfo.logical_to_next_chunk_offset(size);
393397
}
394398
/**

src/osd/ECBackendL.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,9 @@ END_IGNORE_DEPRECATED
385385
return sinfo.get_chunk_size();
386386
}
387387
uint64_t object_size_to_shard_size(const uint64_t size) const {
388+
if (size == std::numeric_limits<uint64_t>::max()) {
389+
return size;
390+
}
388391
return sinfo.logical_to_next_chunk_offset(size);
389392
}
390393
/**

src/osd/ECSwitch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ class ECSwitch : public PGBackend
347347
}
348348

349349
uint64_t
350-
object_size_to_shard_size(const uint64_t size, int shard) const override
350+
object_size_to_shard_size(const uint64_t size, shard_id_t shard) const override
351351
{
352352
if (is_optimized()) {
353353
return optimized.object_size_to_shard_size(size, shard);

src/osd/PGBackend.cc

Lines changed: 145 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -204,23 +204,56 @@ void PGBackend::rollback(
204204
struct RollbackVisitor : public ObjectModDesc::Visitor {
205205
const hobject_t &hoid;
206206
PGBackend *pg;
207+
const pg_log_entry_t &entry;
207208
ObjectStore::Transaction t;
208209
RollbackVisitor(
209210
const hobject_t &hoid,
210-
PGBackend *pg) : hoid(hoid), pg(pg) {}
211+
PGBackend *pg,
212+
const pg_log_entry_t &entry) : hoid(hoid), pg(pg), entry(entry) {}
211213
void append(uint64_t old_size) override {
212214
ObjectStore::Transaction temp;
213-
int s = static_cast<int>(pg->get_parent()->whoami_shard().shard);
214-
const uint64_t shard_size = pg->object_size_to_shard_size(old_size, s);
215+
auto dpp = pg->get_parent()->get_dpp();
216+
const uint64_t shard_size = pg->object_size_to_shard_size(old_size,
217+
pg->get_parent()->whoami_shard().shard);
218+
ldpp_dout(dpp, 20) << " entry " << entry.version
219+
<< " rollback append object_size " << old_size
220+
<< " shard_size " << shard_size << dendl;
215221
pg->rollback_append(hoid, shard_size, &temp);
216222
temp.append(t);
217223
temp.swap(t);
218224
}
219225
void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
220-
ObjectStore::Transaction temp;
221-
pg->rollback_setattrs(hoid, attrs, &temp);
222-
temp.append(t);
223-
temp.swap(t);
226+
auto dpp = pg->get_parent()->get_dpp();
227+
const pg_pool_t &pool = pg->get_parent()->get_pool();
228+
if (pool.is_nonprimary_shard(pg->get_parent()->whoami_shard().shard)) {
229+
if (entry.is_written_shard(pg->get_parent()->whoami_shard().shard)) {
230+
// Written shard - only rollback OI attr
231+
ldpp_dout(dpp, 20) << " entry " << entry.version
232+
<< " written shard OI attr rollback "
233+
<< pg->get_parent()->whoami_shard().shard
234+
<< dendl;
235+
ObjectStore::Transaction temp;
236+
pg->rollback_setattrs(hoid, attrs, &temp, true);
237+
temp.append(t);
238+
temp.swap(t);
239+
} else {
240+
// Unwritten shard - nothing to rollback
241+
ldpp_dout(dpp, 20) << " entry " << entry.version
242+
<< " unwritten shard skipping attr rollback "
243+
<< pg->get_parent()->whoami_shard().shard
244+
<< dendl;
245+
}
246+
} else {
247+
// Primary shard - rollback all attrs
248+
ldpp_dout(dpp, 20) << " entry " << entry.version
249+
<< " primary_shard attr rollback "
250+
<< pg->get_parent()->whoami_shard().shard
251+
<< dendl;
252+
ObjectStore::Transaction temp;
253+
pg->rollback_setattrs(hoid, attrs, &temp, false);
254+
temp.append(t);
255+
temp.swap(t);
256+
}
224257
}
225258
void rmobject(version_t old_version) override {
226259
ObjectStore::Transaction temp;
@@ -247,17 +280,58 @@ void PGBackend::rollback(
247280
temp.swap(t);
248281
}
249282
void rollback_extents(
250-
version_t gen,
251-
const vector<pair<uint64_t, uint64_t> > &extents) override {
283+
const version_t gen,
284+
const std::vector<std::pair<uint64_t, uint64_t>> &extents,
285+
const uint64_t object_size,
286+
const std::vector<shard_id_set> &shards) override {
252287
ObjectStore::Transaction temp;
253-
pg->rollback_extents(gen, extents, hoid, &temp);
254-
temp.append(t);
255-
temp.swap(t);
288+
const pg_pool_t& pool = pg->get_parent()->get_pool();
289+
ceph_assert(entry.written_shards.empty() ||
290+
pool.allows_ecoptimizations());
291+
auto dpp = pg->get_parent()->get_dpp();
292+
bool donework = false;
293+
ceph_assert(shards.empty() || shards.size() == extents.size());
294+
for (unsigned int i = 0; i < extents.size(); i++) {
295+
if (shards.empty() ||
296+
shards[i].empty() ||
297+
shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
298+
// Written shard - rollback extents
299+
const uint64_t shard_size = pg->object_size_to_shard_size(
300+
object_size,
301+
pg->get_parent()->whoami_shard().shard);
302+
ldpp_dout(dpp, 20) << " entry " << entry.version
303+
<< " written shard rollback_extents "
304+
<< entry.written_shards
305+
<< " shards "
306+
<< (shards.empty() ? shard_id_set() : shards[i])
307+
<< " " << pg->get_parent()->whoami_shard().shard
308+
<< " " << object_size
309+
<< " " << shard_size
310+
<< dendl;
311+
pg->rollback_extents(gen, extents[i].first, extents[i].second,
312+
hoid, shard_size, &temp);
313+
donework = true;
314+
} else {
315+
// Unwritten shard - nothing to rollback
316+
ldpp_dout(dpp, 20) << " entry " << entry.version
317+
<< " unwritten shard skipping rollback_extents "
318+
<< entry.written_shards
319+
<< " " << pg->get_parent()->whoami_shard().shard
320+
<< dendl;
321+
}
322+
}
323+
if (donework) {
324+
t.remove(
325+
pg->coll,
326+
ghobject_t(hoid, gen, pg->get_parent()->whoami_shard().shard));
327+
temp.append(t);
328+
temp.swap(t);
329+
}
256330
}
257331
};
258332

259333
ceph_assert(entry.mod_desc.can_rollback());
260-
RollbackVisitor vis(entry.soid, this);
334+
RollbackVisitor vis(entry.soid, this, entry);
261335
entry.mod_desc.visit(&vis);
262336
t->append(vis.t);
263337
}
@@ -279,12 +353,28 @@ struct Trimmer : public ObjectModDesc::Visitor {
279353
}
280354
// try_rmobject defaults to rmobject
281355
void rollback_extents(
282-
version_t gen,
283-
const vector<pair<uint64_t, uint64_t> > &extents) override {
284-
pg->trim_rollback_object(
285-
soid,
286-
gen,
287-
t);
356+
const version_t gen,
357+
const std::vector<std::pair<uint64_t, uint64_t>> &extents,
358+
const uint64_t object_size,
359+
const std::vector<shard_id_set> &shards) override {
360+
auto dpp = pg->get_parent()->get_dpp();
361+
ceph_assert(shards.empty() || shards.size() == extents.size());
362+
for (unsigned int i = 0; i < extents.size(); i++) {
363+
if (shards.empty() ||
364+
shards[i].empty() ||
365+
shards[i].contains(pg->get_parent()->whoami_shard().shard)) {
366+
ldpp_dout(dpp, 30) << __func__ << " trim " << shards << " "
367+
<< pg->get_parent()->whoami_shard().shard << dendl;
368+
pg->trim_rollback_object(
369+
soid,
370+
gen,
371+
t);
372+
break;
373+
} else {
374+
ldpp_dout(dpp, 20) << __func__ << " skipping trim " << shards << " "
375+
<< pg->get_parent()->whoami_shard().shard << dendl;
376+
}
377+
}
288378
}
289379
};
290380

@@ -481,25 +571,34 @@ int PGBackend::objects_get_attrs(
481571
void PGBackend::rollback_setattrs(
482572
const hobject_t &hoid,
483573
map<string, std::optional<bufferlist> > &old_attrs,
484-
ObjectStore::Transaction *t) {
574+
ObjectStore::Transaction *t,
575+
bool only_oi) {
485576
map<string, bufferlist, less<>> to_set;
486577
ceph_assert(!hoid.is_temp());
487578
for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
488579
i != old_attrs.end();
489580
++i) {
490581
if (i->second) {
491582
to_set[i->first] = *(i->second);
492-
} else {
583+
} else if (!only_oi) {
493584
t->rmattr(
494585
coll,
495586
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
496587
i->first);
497588
}
498589
}
499-
t->setattrs(
500-
coll,
501-
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
502-
to_set);
590+
if (only_oi) {
591+
t->setattr(
592+
coll,
593+
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
594+
OI_ATTR,
595+
to_set[OI_ATTR]);
596+
} else {
597+
t->setattrs(
598+
coll,
599+
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
600+
to_set);
601+
}
503602
}
504603

505604
void PGBackend::rollback_append(
@@ -533,6 +632,7 @@ void PGBackend::rollback_try_stash(
533632
version_t old_version,
534633
ObjectStore::Transaction *t) {
535634
ceph_assert(!hoid.is_temp());
635+
dout(20) << __func__ << " " << hoid << " " << old_version << dendl;
536636
t->remove(
537637
coll,
538638
ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
@@ -544,29 +644,41 @@ void PGBackend::rollback_try_stash(
544644

545645
void PGBackend::rollback_extents(
546646
version_t gen,
547-
const vector<pair<uint64_t, uint64_t> > &extents,
647+
const uint64_t offset,
648+
uint64_t length,
548649
const hobject_t &hoid,
650+
const uint64_t shard_size,
549651
ObjectStore::Transaction *t) {
550652
auto shard = get_parent()->whoami_shard().shard;
551-
for (auto &&extent: extents) {
653+
if (offset >= shard_size) {
654+
// extent on this shard is beyond the end of the object - nothing to do
655+
dout(20) << __func__ << " " << hoid << " "
656+
<< offset << "~" << length << " is out of range "
657+
<< shard_size << dendl;
658+
} else {
659+
if (offset + length > shard_size) {
660+
dout(20) << __func__ << " " << length << " is being truncated" << dendl;
661+
// extent on this shard goes beyond end of the object - truncate length
662+
length = shard_size - offset;
663+
}
664+
dout(20) << __func__ << " " << hoid << " " << offset << "~" << length
665+
<< dendl;
552666
t->clone_range(
553667
coll,
554668
ghobject_t(hoid, gen, shard),
555669
ghobject_t(hoid, ghobject_t::NO_GEN, shard),
556-
extent.first,
557-
extent.second,
558-
extent.first);
670+
offset,
671+
length,
672+
offset);
559673
}
560-
t->remove(
561-
coll,
562-
ghobject_t(hoid, gen, shard));
563674
}
564675

565676
void PGBackend::trim_rollback_object(
566677
const hobject_t &hoid,
567678
version_t old_version,
568679
ObjectStore::Transaction *t) {
569680
ceph_assert(!hoid.is_temp());
681+
dout(20) << __func__ << " trim " << hoid << " " << old_version << dendl;
570682
t->remove(
571683
coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
572684
}

src/osd/PGBackend.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
426426
virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
427427
virtual unsigned int get_ec_data_chunk_count() const { return 0; };
428428
virtual int get_ec_stripe_chunk_size() const { return 0; };
429-
virtual uint64_t object_size_to_shard_size(const uint64_t size, int shard) const { return size; };
429+
virtual uint64_t object_size_to_shard_size(const uint64_t size, shard_id_t shard) const { return size; };
430430
virtual void dump_recovery_info(ceph::Formatter *f) const = 0;
431431

432432
private:
@@ -504,7 +504,8 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
504504
void rollback_setattrs(
505505
const hobject_t &hoid,
506506
std::map<std::string, std::optional<ceph::buffer::list> > &old_attrs,
507-
ObjectStore::Transaction *t);
507+
ObjectStore::Transaction *t,
508+
bool only_oi);
508509

509510
/// Truncate object to rollback append
510511
void rollback_append(
@@ -534,8 +535,10 @@ typedef std::shared_ptr<const OSDMap> OSDMapRef;
534535
/// Clone the extents back into place
535536
void rollback_extents(
536537
version_t gen,
537-
const std::vector<std::pair<uint64_t, uint64_t> > &extents,
538+
const uint64_t offset,
539+
uint64_t length,
538540
const hobject_t &hoid,
541+
const uint64_t shard_size,
539542
ObjectStore::Transaction *t);
540543
public:
541544

src/osd/osd_types.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4668,11 +4668,25 @@ void ObjectModDesc::visit(Visitor *visitor) const
46684668
break;
46694669
}
46704670
case ROLLBACK_EXTENTS: {
4671-
vector<pair<uint64_t, uint64_t> > extents;
4671+
vector<pair<uint64_t, uint64_t>> extents;
46724672
version_t gen;
4673+
uint64_t object_size;
4674+
vector<shard_id_set> shards;
46734675
decode(gen, bp);
46744676
decode(extents, bp);
4675-
visitor->rollback_extents(gen,extents);
4677+
if (struct_v < 3) {
4678+
// Object size is used by optimized EC pools that do not pad objects to a
4679+
// multiple of the stripe size. Rollback clone operations for each shard
4680+
// need to be truncated to not exceed the object size. Legacy EC pools
4681+
// do not store the object_size but, because their objects are padded, do not
4682+
// need to truncate the clones. Setting object_size to max avoids
4683+
// truncation.
4684+
object_size = std::numeric_limits<uint64_t>::max();
4685+
} else {
4686+
decode(object_size, bp);
4687+
decode(shards, bp);
4688+
}
4689+
visitor->rollback_extents(gen, extents, object_size, shards);
46764690
break;
46774691
}
46784692
default:
@@ -4728,11 +4742,16 @@ struct DumpVisitor : public ObjectModDesc::Visitor {
47284742
f->close_section();
47294743
}
47304744
void rollback_extents(
4731-
version_t gen,
4732-
const vector<pair<uint64_t, uint64_t> > &extents) override {
4745+
const version_t gen,
4746+
const vector<pair<uint64_t, uint64_t>> &extents,
4747+
const uint64_t object_size,
4748+
const vector<shard_id_set> &shards) override {
47334749
f->open_object_section("op");
47344750
f->dump_string("code", "ROLLBACK_EXTENTS");
47354751
f->dump_unsigned("gen", gen);
4752+
f->dump_unsigned("object_size", object_size);
4753+
f->dump_stream("extents") << extents;
4754+
f->dump_stream("shards") << shards;
47364755
f->dump_stream("snaps") << extents;
47374756
f->close_section();
47384757
}

0 commit comments

Comments
 (0)