Skip to content

Commit a6bcb57

Browse files
committed
os/bluestore: set rocksdb iterator bounds for BlueStore::_collection_list()
When BlueStore::_collection_list() is called during PG removal, rocksdb often has to iterate over many rocksdb deletion tombstones from recently deleted onode keys. This change bounds the rocksdb iteration when possible, to avoid high latencies caused by iteration over many contiguous deletion tombstones. Fixes: https://tracker.ceph.com/issues/58274 Signed-off-by: Cory Snyder <[email protected]>
1 parent f19765e commit a6bcb57

File tree

1 file changed

+46
-71
lines changed

1 file changed

+46
-71
lines changed

src/os/bluestore/BlueStore.cc

Lines changed: 46 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -12539,16 +12539,14 @@ int BlueStore::_collection_list(
1253912539
Collection *c, const ghobject_t& start, const ghobject_t& end, int max,
1254012540
bool legacy, vector<ghobject_t> *ls, ghobject_t *pnext)
1254112541
{
12542-
1254312542
if (!c->exists)
1254412543
return -ENOENT;
1254512544

1254612545
ghobject_t static_next;
1254712546
std::unique_ptr<CollectionListIterator> it;
1254812547
ghobject_t coll_range_temp_start, coll_range_temp_end;
1254912548
ghobject_t coll_range_start, coll_range_end;
12550-
ghobject_t pend;
12551-
bool temp;
12549+
std::vector<std::tuple<ghobject_t, ghobject_t>> ranges;
1255212550

1255312551
if (!pnext)
1255412552
pnext = &static_next;
@@ -12582,82 +12580,59 @@ int BlueStore::_collection_list(
1258212580
<< " and " << coll_range_start
1258312581
<< " to " << coll_range_end
1258412582
<< " start " << start << dendl;
12585-
if (legacy) {
12586-
it = std::make_unique<SimpleCollectionListIterator>(
12587-
cct, db->get_iterator(PREFIX_OBJ));
12588-
} else {
12589-
it = std::make_unique<SortedCollectionListIterator>(
12590-
db->get_iterator(PREFIX_OBJ));
12591-
}
12592-
if (start == ghobject_t() ||
12593-
start.hobj == hobject_t() ||
12594-
start == c->cid.get_min_hobj()) {
12595-
it->upper_bound(coll_range_temp_start);
12596-
temp = true;
12597-
} else {
12598-
if (start.hobj.is_temp()) {
12599-
temp = true;
12600-
ceph_assert(start >= coll_range_temp_start && start < coll_range_temp_end);
12601-
} else {
12602-
temp = false;
12603-
ceph_assert(start >= coll_range_start && start < coll_range_end);
12604-
}
12605-
dout(20) << __func__ << " temp=" << (int)temp << dendl;
12606-
it->lower_bound(start);
12583+
12584+
// if specified start is not specifically in the pg normal range, we should start with temp iter
12585+
if ((start == ghobject_t() ||
12586+
start.hobj == hobject_t() ||
12587+
start == c->cid.get_min_hobj() ||
12588+
start.hobj.is_temp())
12589+
&& coll_range_temp_start != coll_range_temp_end) {
12590+
ranges.push_back(std::tuple(std::move(coll_range_temp_start), std::move(coll_range_temp_end)));
1260712591
}
12608-
if (end.hobj.is_max()) {
12609-
pend = temp ? coll_range_temp_end : coll_range_end;
12610-
} else {
12611-
if (end.hobj.is_temp()) {
12612-
if (temp) {
12613-
pend = end;
12614-
} else {
12615-
*pnext = ghobject_t::get_max();
12616-
return 0;
12617-
}
12592+
// if end param is in temp section, then we do not need to proceed to the normal section
12593+
if (!end.hobj.is_temp()) {
12594+
ranges.push_back(std::tuple(std::move(coll_range_start), std::move(coll_range_end)));
12595+
}
12596+
12597+
for (const auto & [cur_range_start, cur_range_end] : ranges) {
12598+
dout(30) << __func__ << " cur_range " << cur_range_start << " to " << cur_range_end << dendl;
12599+
12600+
const ghobject_t low = start > cur_range_start ? start : cur_range_start;
12601+
const ghobject_t high = end < cur_range_end ? end : cur_range_end;
12602+
if (low >= high) {
12603+
continue;
12604+
}
12605+
12606+
std::string kv_low_key, kv_high_key;
12607+
_key_encode_prefix(low, &kv_low_key);
12608+
_key_encode_prefix(high, &kv_high_key);
12609+
kv_high_key.push_back('\xff');
12610+
dout(30) << __func__ << " kv_low_key: " << kv_low_key << " kv_high_key: " << kv_high_key << dendl;
12611+
const KeyValueDB::IteratorBounds bounds = KeyValueDB::IteratorBounds{std::move(kv_low_key), std::move(kv_high_key)};
12612+
if (legacy) {
12613+
it = std::make_unique<SimpleCollectionListIterator>(
12614+
cct, db->get_iterator(PREFIX_OBJ, 0, std::move(bounds)));
1261812615
} else {
12619-
pend = temp ? coll_range_temp_end : end;
12616+
it = std::make_unique<SortedCollectionListIterator>(
12617+
db->get_iterator(PREFIX_OBJ, 0, std::move(bounds)));
1262012618
}
12621-
}
12622-
dout(20) << __func__ << " pend " << pend << dendl;
12623-
while (true) {
12624-
if (!it->valid() || it->is_ge(pend)) {
12625-
if (!it->valid())
12626-
dout(20) << __func__ << " iterator not valid (end of db?)" << dendl;
12627-
else
12628-
dout(20) << __func__ << " oid " << it->oid() << " >= " << pend << dendl;
12629-
if (temp) {
12630-
if (end.hobj.is_temp()) {
12631-
if (it->valid() && it->is_lt(coll_range_temp_end)) {
12632-
*pnext = it->oid();
12633-
return 0;
12634-
}
12635-
break;
12636-
}
12637-
dout(30) << __func__ << " switch to non-temp namespace" << dendl;
12638-
temp = false;
12639-
it->upper_bound(coll_range_start);
12640-
if (end.hobj.is_max())
12641-
pend = coll_range_end;
12642-
else
12643-
pend = end;
12644-
dout(30) << __func__ << " pend " << pend << dendl;
12645-
continue;
12619+
it->lower_bound(low);
12620+
while (it->valid()) {
12621+
if (it->oid() < low) {
12622+
it->next();
12623+
continue;
12624+
}
12625+
if (it->oid() > high) {
12626+
break;
1264612627
}
12647-
if (it->valid() && it->is_lt(coll_range_end)) {
12628+
if (ls->size() >= (unsigned)max || it->oid() == high) {
1264812629
*pnext = it->oid();
1264912630
return 0;
1265012631
}
12651-
break;
12652-
}
12653-
dout(20) << __func__ << " oid " << it->oid() << " end " << end << dendl;
12654-
if (ls->size() >= (unsigned)max) {
12655-
dout(20) << __func__ << " reached max " << max << dendl;
12656-
*pnext = it->oid();
12657-
return 0;
12632+
dout(20) << __func__ << " oid " << it->oid() << dendl;
12633+
ls->push_back(it->oid());
12634+
it->next();
1265812635
}
12659-
ls->push_back(it->oid());
12660-
it->next();
1266112636
}
1266212637
*pnext = ghobject_t::get_max();
1266312638
return 0;

0 commit comments

Comments
 (0)