Skip to content

Commit 8cacc72

Browse files
authored
Merge pull request ceph#61990 from aclamk/wip-aclamk-write-v2-dirty-reshard
os/bluestore: write_v2 resharding misplaced
2 parents 352d1a4 + 71e094f commit 8cacc72

File tree

5 files changed

+153
-38
lines changed

5 files changed

+153
-38
lines changed

src/os/bluestore/BlueStore.cc

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4178,6 +4178,29 @@ void BlueStore::ExtentMap::init_shards(bool loaded, bool dirty)
41784178
}
41794179
}
41804180

4181+
std::pair<uint32_t, uint32_t> BlueStore::ExtentMap::fault_range_ex(
4182+
KeyValueDB *db,
4183+
uint32_t offset,
4184+
uint32_t length)
4185+
{
4186+
dout(30) << __func__ << " 0x" << std::hex << offset << "~" << length
4187+
<< std::dec << dendl;
4188+
if (shards.size() == 0) {
4189+
// no sharding yet; everyting is loaded
4190+
return {0, OBJECT_MAX_SIZE};
4191+
}
4192+
auto start = seek_shard(offset);
4193+
auto last = seek_shard(offset + length);
4194+
maybe_load_shard(db, start, last);
4195+
uint32_t left_bound = shards[start].shard_info->offset;
4196+
uint32_t right_bound = (size_t)last + 1 < shards.size() ?
4197+
shards[last + 1].shard_info->offset : OBJECT_MAX_SIZE;
4198+
dout(20) << __func__ << " start=" << start << " last=" << last
4199+
<< " -> 0x" << std::hex << left_bound << "~" << right_bound
4200+
<< std::dec << dendl;
4201+
return {left_bound, right_bound};
4202+
}
4203+
41814204
void BlueStore::ExtentMap::fault_range(
41824205
KeyValueDB *db,
41834206
uint32_t offset,
@@ -4191,6 +4214,14 @@ void BlueStore::ExtentMap::fault_range(
41914214
}
41924215
auto start = seek_shard(offset);
41934216
auto last = seek_shard(offset + length);
4217+
maybe_load_shard(db, start, last);
4218+
}
4219+
4220+
void BlueStore::ExtentMap::maybe_load_shard(
4221+
KeyValueDB *db,
4222+
int start,
4223+
int last)
4224+
{
41944225
ceph_assert(last >= start);
41954226
ceph_assert(start >= 0);
41964227

@@ -4216,9 +4247,10 @@ void BlueStore::ExtentMap::fault_range(
42164247
);
42174248
p->extents = decode_some(v);
42184249
p->loaded = true;
4219-
dout(20) << __func__ << " open shard 0x" << std::hex
4220-
<< p->shard_info->offset
4221-
<< " for range 0x" << offset << "~" << length << std::dec
4250+
uint32_t shard_end =
4251+
(size_t)start + 1 < shards.size() ? (p + 1)->shard_info->offset : OBJECT_MAX_SIZE;
4252+
dout(20) << __func__ << " open shard for range 0x"
4253+
<< std::hex << p->shard_info->offset << "~" << shard_end << std::dec
42224254
<< " (" << v.length() << " bytes)" << dendl;
42234255
ceph_assert(p->dirty == false);
42244256
ceph_assert(v.length() == p->shard_info->bytes);
@@ -17621,11 +17653,16 @@ int BlueStore::_do_write_v2(
1762117653
if (bl.length() != length) {
1762217654
bl.splice(length, bl.length() - length);
1762317655
}
17656+
BlueStore::Writer wr(this, txc, &wctx, o);
1762417657
uint64_t start = p2align(offset, min_alloc_size);
1762517658
uint64_t end = p2roundup(offset + length, min_alloc_size);
17626-
o->extent_map.fault_range(db, start, end - start);
17627-
BlueStore::Writer wr(this, txc, &wctx, o);
17659+
wr.left_affected_range = start;
17660+
wr.right_affected_range = end;
17661+
std::tie(wr.left_shard_bound, wr.right_shard_bound) =
17662+
o->extent_map.fault_range_ex(db, start, end - start);
1762817663
wr.do_write(offset, bl);
17664+
o->extent_map.dirty_range(wr.left_affected_range, wr.right_affected_range - wr.left_affected_range);
17665+
o->extent_map.maybe_reshard(wr.left_affected_range, wr.right_affected_range);
1762917666
return r;
1763017667
}
1763117668

src/os/bluestore/BlueStore.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,16 @@ class BlueStore : public ObjectStore,
11491149
/// ensure that a range of the map is loaded
11501150
void fault_range(KeyValueDB *db,
11511151
uint32_t offset, uint32_t length);
1152+
/// ensure that a range of the map is loaded
1153+
/// return range that is encompassed by affected shards
1154+
std::pair<uint32_t, uint32_t> fault_range_ex(
1155+
KeyValueDB *db,
1156+
uint32_t offset,
1157+
uint32_t length);
1158+
void maybe_load_shard(
1159+
KeyValueDB *db,
1160+
int begin_shard,
1161+
int end_shard);
11521162

11531163
/// ensure a range of the map is marked dirty
11541164
void dirty_range(uint32_t offset, uint32_t length);

src/os/bluestore/Writer.cc

Lines changed: 78 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,6 @@ inline BlueStore::extent_map_t::iterator BlueStore::Writer::_find_mutable_blob_r
282282
if (it->blob_end() <= mapmust_end) continue;
283283
}
284284
return it;
285-
break;
286285
};
287286
return map.end();
288287
}
@@ -547,6 +546,68 @@ BlueStore::BlobRef BlueStore::Writer::_blob_create_full(
547546
return blob;
548547
}
549548

549+
inline void BlueStore::Writer::_place_extent_in_blob(
550+
Extent* ex,
551+
uint32_t map_begin,
552+
uint32_t map_end,
553+
uint32_t in_blob_offset)
554+
{
555+
if (ex->logical_end() <= map_begin) {
556+
// we are adding to right side of the target
557+
if (ex->logical_end() == map_begin) {
558+
// we can just expand existing Extent
559+
ex->length += map_end - map_begin;
560+
dout(20) << __func__ << " expanded extent " << ex->print(pp_mode) << dendl;
561+
left_affected_range = std::min(left_affected_range, ex->logical_offset);
562+
} else {
563+
// disjointed, new extent needed
564+
Extent *le = new Extent(
565+
map_begin, in_blob_offset, map_end - map_begin, ex->blob);
566+
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
567+
onode->extent_map.extent_map.insert(*le);
568+
left_affected_range = std::min(left_affected_range, le->logical_offset);
569+
}
570+
} else if (ex->logical_offset >= map_end) {
571+
// we are adding to left side of target
572+
if (ex->logical_offset == map_end) {
573+
// we can just expand existing Extent
574+
//ceph_assert(ref_end == want_subau_end);
575+
ex->logical_offset -= (map_end - map_begin);
576+
ex->blob_offset -= (map_end - map_begin);
577+
ex->length += (map_end - map_begin);
578+
dout(20) << __func__ << " expanded extent " << ex->print(pp_mode) << dendl;
579+
right_affected_range = std::max(right_affected_range, ex->logical_end());
580+
} else {
581+
// disjointed, new extent needed
582+
Extent *le = new Extent(
583+
map_begin, in_blob_offset, map_end - map_begin, ex->blob);
584+
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
585+
onode->extent_map.extent_map.insert(*le);
586+
right_affected_range = std::max(right_affected_range, le->logical_end());
587+
}
588+
}
589+
}
590+
591+
// Iterator it can be invalidated.
592+
void BlueStore::Writer::_maybe_meld_with_prev_extent(exmp_it it)
593+
{
594+
if (it == onode->extent_map.extent_map.end()) return; // can't merge with non-existent
595+
if (it == onode->extent_map.extent_map.begin()) return; // can't merge when there is only 1 extent
596+
if (it->logical_end() >= right_shard_bound) return; // not allowed to escape shard range
597+
auto it_p = it;
598+
--it_p;
599+
if (it_p->logical_offset < left_shard_bound) return; // we could jump here behind our inserted range
600+
if (it_p->blob == it->blob &&
601+
it_p->logical_end() == it->logical_offset &&
602+
it_p->blob_offset + it_p->length == it->blob_offset) // this one is specifc, currently is always true
603+
{
604+
it_p->length += it->length;
605+
onode->extent_map.rm(it);
606+
left_affected_range = std::min(left_affected_range, it_p->logical_offset);
607+
right_affected_range = std::max(right_affected_range, it_p->logical_end());
608+
}
609+
}
610+
550611
/**
551612
* Note from developer
552613
* This module tries to keep naming convention:
@@ -732,6 +793,7 @@ void BlueStore::Writer::_try_reuse_allocated_l(
732793
blob_data_t& bd) // modified when consumed
733794
{
734795
uint32_t search_stop = p2align(logical_offset, (uint32_t)wctx->target_blob_size);
796+
search_stop = std::max(left_shard_bound, search_stop);
735797
uint32_t au_size = bstore->min_alloc_size;
736798
uint32_t block_size = bstore->block_size;
737799
ceph_assert(!bd.is_compressed());
@@ -781,11 +843,7 @@ void BlueStore::Writer::_try_reuse_allocated_l(
781843
uint32_t ref_end = std::min(ref_end_offset, want_subau_end);
782844
//fixme/improve - need something without stupid extras - that is without coll
783845
b->get_ref(onode->c, in_blob_offset, ref_end - want_subau_begin);
784-
Extent *le = new Extent(
785-
want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob);
786-
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
787-
emap.extent_map.insert(*le);
788-
846+
_place_extent_in_blob(&*it, want_subau_begin, ref_end, in_blob_offset);
789847
logical_offset += data_size;
790848
break;
791849
}
@@ -812,6 +870,7 @@ void BlueStore::Writer::_try_reuse_allocated_r(
812870
uint32_t block_size = bstore->block_size;
813871
uint32_t blob_size = wctx->target_blob_size;
814872
uint32_t search_end = p2roundup(end_offset, blob_size);
873+
search_end = std::min(right_shard_bound, search_end);
815874
ceph_assert(!bd.is_compressed());
816875
ceph_assert(p2phase<uint32_t>(end_offset, au_size) != 0);
817876
BlueStore::ExtentMap& emap = onode->extent_map;
@@ -854,11 +913,7 @@ void BlueStore::Writer::_try_reuse_allocated_r(
854913
uint32_t ref_end = std::min(ref_end_offset, want_subau_end);
855914
//fixme/improve - need something without stupid extras - that is without coll
856915
b->get_ref(onode->c, in_blob_offset, ref_end - want_subau_begin);
857-
Extent *le = new Extent(
858-
want_subau_begin, in_blob_offset, ref_end - want_subau_begin, it->blob);
859-
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
860-
emap.extent_map.insert(*le);
861-
916+
_place_extent_in_blob(&*it, want_subau_begin, ref_end, in_blob_offset);
862917
end_offset -= data_size;
863918
break;
864919
}
@@ -1023,14 +1078,11 @@ void BlueStore::Writer::_do_put_blobs(
10231078
uint32_t ref_end = std::min(ref_end_offset, data_end_offset);
10241079
//fixme/improve - need something without stupid extras - that is without coll
10251080
left_b->blob->get_ref(coll, in_blob_offset, ref_end - logical_offset);
1026-
Extent *le = new Extent(
1027-
logical_offset, in_blob_offset, ref_end - logical_offset, left_b->blob);
1028-
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
1029-
emap.insert(*le);
1081+
_place_extent_in_blob(&*left_b, logical_offset, ref_end, in_blob_offset);
1082+
bstore->logger->inc(l_bluestore_write_small);
1083+
bstore->logger->inc(l_bluestore_write_small_bytes, ref_end - logical_offset);
10301084
logical_offset = ref_end;
10311085
++bd_it;
1032-
bstore->logger->inc(l_bluestore_write_small);
1033-
bstore->logger->inc(l_bluestore_write_small_bytes, le->length);
10341086
} else {
10351087
// it is still possible to use first bd and put it into
10361088
// blob after punch_hole
@@ -1062,13 +1114,10 @@ void BlueStore::Writer::_do_put_blobs(
10621114
uint32_t ref_end = std::min(ref_end_offset, data_begin_offset + back_it->disk_data.length());
10631115
//fixme - need something without stupid extras
10641116
right_b->blob->get_ref(coll, in_blob_offset, ref_end - data_begin_offset);
1065-
Extent *le = new Extent(
1066-
data_begin_offset, in_blob_offset, ref_end - data_begin_offset, right_b->blob);
1067-
dout(20) << __func__ << " new extent " << le->print(pp_mode) << dendl;
1068-
emap.insert(*le);
1117+
_place_extent_in_blob(&*right_b, data_begin_offset, ref_end, in_blob_offset);
10691118
bd.erase(back_it); //TODO - or other way of limiting end
10701119
bstore->logger->inc(l_bluestore_write_small);
1071-
bstore->logger->inc(l_bluestore_write_small_bytes, le->length);
1120+
bstore->logger->inc(l_bluestore_write_small_bytes, ref_end - data_begin_offset);
10721121
}
10731122
}
10741123
}
@@ -1379,9 +1428,13 @@ void BlueStore::Writer::do_write(
13791428
_collect_released_allocated();
13801429
// update statfs
13811430
txc->statfs_delta += statfs_delta;
1382-
onode->extent_map.compress_extent_map(location, data_end - location);
1383-
onode->extent_map.dirty_range(location, data_end-location);
1384-
onode->extent_map.maybe_reshard(location, data_end);
1431+
// Note: compress extent is mostly not needed; _try_reuse_allocated_* joins extents if possible;
1432+
// same is done after _blob_put_data_subau_allocate (allocate more to existing blob).
1433+
// New blobs cannot be joined with existing ones.
1434+
// The only case that adjecent extents can be meld together is on the boundary.
1435+
// More specific, since we are operating left side first, right side later
1436+
// the only non-joined extent is between after_punch_it - 1 and after_punch_it.
1437+
_maybe_meld_with_prev_extent(after_punch_it);
13851438
dout(25) << "result: " << std::endl << onode->print(pp_mode) << dendl;
13861439
}
13871440

src/os/bluestore/Writer.h

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@ class BlueStore::Writer {
5050
virtual bufferlist read(uint32_t object_offset, uint32_t object_length) = 0;
5151
};
5252
Writer(BlueStore* bstore, TransContext* txc, WriteContext* wctx, OnodeRef o)
53-
:bstore(bstore), txc(txc), wctx(wctx), onode(o) {
53+
:left_shard_bound(0), right_shard_bound(OBJECT_MAX_SIZE)
54+
, bstore(bstore), txc(txc), wctx(wctx), onode(o) {
5455
pp_mode = debug_level_to_pp_mode(bstore->cct);
5556
}
5657
public:
@@ -67,7 +68,10 @@ class BlueStore::Writer {
6768
read_divertor* test_read_divertor = nullptr;
6869
std::vector<BlobRef> pruned_blobs;
6970
volatile_statfs statfs_delta;
70-
71+
uint32_t left_shard_bound; // if sharding is in effect,
72+
uint32_t right_shard_bound; // do not cross this line
73+
uint32_t left_affected_range;
74+
uint32_t right_affected_range;
7175
private:
7276
BlueStore* bstore;
7377
TransContext* txc;
@@ -140,8 +144,15 @@ class BlueStore::Writer {
140144
void _align_to_disk_block(
141145
uint32_t& location,
142146
uint32_t& ref_end,
143-
blob_vec& blobs
144-
);
147+
blob_vec& blobs);
148+
149+
void _place_extent_in_blob(
150+
Extent* target,
151+
uint32_t map_begin,
152+
uint32_t map_end,
153+
uint32_t in_blob_offset);
154+
155+
void _maybe_meld_with_prev_extent(exmp_it after_punch_it);
145156

146157
inline void _blob_put_data_subau(
147158
Blob* blob,

src/os/bluestore/bluestore_types.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,10 +1004,14 @@ struct bluestore_blob_t {
10041004
void add_tail(uint32_t new_len) {
10051005
ceph_assert(!has_unused());
10061006
ceph_assert(new_len > logical_length);
1007-
extents.emplace_back(
1008-
bluestore_pextent_t(
1009-
bluestore_pextent_t::INVALID_OFFSET,
1010-
new_len - logical_length));
1007+
if (extents.size() == 0 || extents.back().is_valid()) {
1008+
extents.emplace_back(
1009+
bluestore_pextent_t(
1010+
bluestore_pextent_t::INVALID_OFFSET,
1011+
new_len - logical_length));
1012+
} else {
1013+
extents.back().length += new_len - logical_length;
1014+
}
10111015
logical_length = new_len;
10121016
if (has_csum()) {
10131017
ceph::buffer::ptr t;

0 commit comments

Comments
 (0)