Skip to content

Commit d70256c

Browse files
committed
[#28439] DocDB: Use bloom filter when user specified multiple keys
Summary: After commit 9d2e474/D41176, bloom filters can be modified dynamically during scan operations. Moreover, commit b97ccc9 / D41475 introduced bloom filter support for simple index scans that do not use HybridScanChoices. There are scenarios where users query multiple keys, for instance, using an IN clause where HybridScanChoices is used. This diff adds support for variable bloom filters to handle such cases efficiently. Core challenge: SeekTuple used by index scan uses Seek to move the underlying iterator to the appropriate tuple. However, a scan using HybridScanChoices uses SeekForward to move the iterator across various scan choices because SeekForward is more efficient than a plain Seek. SeekForward intentionally does a no-op if the iterator is already positioned after the target key/tuple. This implies checking the current position of the iterator and comparing it with the target key. To avoid incorrect skipping of keys, the iterator should never report a current position after the target key if there are keys between the current position and the target key. Therefore, to support variable bloom filter, HybridScanChoices must prevent the underlying iterator from moving beyond the current scan choice. Otherwise, the next scan choice may not be found even when present. To support the SeekForward optimization, 1. UpdateFilterKey now takes an extra seek_key argument to reposition the SST file iterators appropriately. 2. HybridScanChoices now uses the upperbound mechanism to iterate through the scan choices one by one. Tradeoff: After this change, there are some scenarios with increased seeks. Example: CREATE TABLE t (k INT PRIMARY KEY, v INT); INSERT INTO t VALUES (1000, 1000); SELECT k FROM t WHERE k IN (0, 1, 2, ..., 999); HybridScanChoices now does a seek for each scan choice. This was not the case before this change. However, users typically do not query for nonexistent keys. 
Performance measurements using the newly added `PgSingleTServerTest.BloomFilterPerf` against master (b5373ca), release build, no LTO: Master: 1.0s This diff: 0.8s In one of the long-running (i.e. as data set footprint grew larger over a 24hr period) workloads, which had a good mix of queries with IN lists on primary key or indexed columns, with this optimization, we observed that the overall business txns/sec improved from 115 to 155; i.e. about 34% improvement. Also used TPCC to check that there is no regression in scenarios that are not covered by the improved logic. Jira: DB-18123 Test Plan: PgSingleTServerTest.BloomFilterIn PgSingleTServerTest.BloomFilterPerf Reviewers: timur, rthallam, patnaik.balivada Reviewed By: timur, patnaik.balivada Subscribers: smishra, ybase, yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D46548
1 parent 855fd21 commit d70256c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+660
-234
lines changed

src/yb/docdb/bounded_rocksdb_iterator.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ void BoundedRocksDbIterator::UseFastNext(bool value) {
9696
iterator_->UseFastNext(value);
9797
}
9898

99-
void BoundedRocksDbIterator::UpdateFilterKey(Slice user_key_for_filter) {
100-
iterator_->UpdateFilterKey(user_key_for_filter);
99+
void BoundedRocksDbIterator::UpdateFilterKey(Slice user_key_for_filter, Slice seek_key) {
100+
iterator_->UpdateFilterKey(user_key_for_filter, seek_key);
101101
}
102102

103103
} // namespace yb::docdb

src/yb/docdb/bounded_rocksdb_iterator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ class BoundedRocksDbIterator final : public rocksdb::Iterator {
7171
}
7272

7373
void UseFastNext(bool value) override;
74-
void UpdateFilterKey(Slice user_key_for_filter) override;
74+
void UpdateFilterKey(Slice user_key_for_filter, Slice seek_key) override;
7575

7676
private:
7777
const rocksdb::KeyValueEntry& FilterEntry(const rocksdb::KeyValueEntry& entry) const;

src/yb/docdb/conflict_resolution.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,7 +968,7 @@ class StrongConflictChecker {
968968
/* iterate_upper_bound = */ nullptr,
969969
rocksdb::CacheRestartBlockKeys::kFalse);
970970
}
971-
value_iter_.UpdateFilterKey(intent_key);
971+
value_iter_.UpdateFilterKey(intent_key, Slice());
972972
const auto* entry = &value_iter_.Seek(intent_key);
973973

974974
VLOG_WITH_PREFIX_AND_FUNC(4)

src/yb/docdb/doc_operation-test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ SubDocKey(DocKey(0x0000, [100], []), [ColumnId(3); HT{ physical: 0 logical: 3000
614614
DocRowwiseIterator iter(
615615
projection, doc_read_context, kNonTransactionalOperationContext, doc_db(),
616616
ReadOperationData::FromReadTime(ReadHybridTime::FromUint64(3000)), pending_op);
617-
iter.InitForTableType(YQL_TABLE_TYPE);
617+
ASSERT_OK(iter.InitForTableType(YQL_TABLE_TYPE));
618618
ASSERT_FALSE(ASSERT_RESULT(iter.FetchNext(nullptr)));
619619

620620
// Now verify row exists even with one valid column.

src/yb/docdb/doc_read_context.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "yb/common/ql_type.h"
1717

18+
#include "yb/dockv/doc_key.h"
1819
#include "yb/dockv/value_type.h"
1920

2021
#include "yb/util/logging.h"
@@ -154,6 +155,17 @@ void DocReadContext::UpdateKeyPrefix() {
154155
}
155156
}
156157

158+
Result<bool> DocReadContext::HaveEqualBloomFilterKey(Slice lhs, Slice rhs) const {
159+
return dockv::HashedOrFirstRangeComponentsEqual(lhs, rhs);
160+
}
161+
162+
size_t DocReadContext::NumColumnsUsedByBloomFilterKey() const {
163+
// If there are hash columns, then we include the hash code; otherwise the bloom filter
164+
// picks the first range component.
165+
// So the number of columns used by the bloom filter is always num hash columns + 1.
166+
return schema_.num_hash_key_columns() + 1;
167+
}
168+
157169
DocReadContext DocReadContext::TEST_Create(const Schema& schema) {
158170
static const auto registry = std::make_shared<dockv::SchemaPackingRegistry>("TEST: ");
159171
return DocReadContext(

src/yb/docdb/doc_read_context.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ struct DocReadContext {
100100
return Slice(shared_key_prefix_buffer_.data(), table_key_prefix_len_);
101101
}
102102

103+
Result<bool> HaveEqualBloomFilterKey(Slice lhs, Slice rhs) const;
104+
size_t NumColumnsUsedByBloomFilterKey() const;
105+
103106
void TEST_SetDefaultTimeToLive(uint64_t ttl_msec) {
104107
schema_.SetDefaultTimeToLive(ttl_msec);
105108
}

src/yb/docdb/doc_rowwise_iterator.cc

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -136,13 +136,13 @@ void DocRowwiseIterator::SetSchema(const Schema& schema) {
136136
schema_ = &schema;
137137
}
138138

139-
void DocRowwiseIterator::InitForTableType(
139+
Status DocRowwiseIterator::InitForTableType(
140140
TableType table_type, Slice sub_doc_key, SkipSeek skip_seek,
141141
AddTablePrefixToKey add_table_prefix_to_key) {
142142
CheckInitOnce();
143143
table_type_ = table_type;
144144
ignore_ttl_ = (table_type_ == TableType::PGSQL_TABLE_TYPE);
145-
InitIterator(BloomFilterOptions::Inactive());
145+
RETURN_NOT_OK(InitIterator(BloomFilterOptions::Inactive()));
146146

147147
if (sub_doc_key.empty() || add_table_prefix_to_key) {
148148
dockv::DocKeyEncoder(&row_key_).Schema(*schema_);
@@ -154,11 +154,13 @@ void DocRowwiseIterator::InitForTableType(
154154
has_bound_key_ = false;
155155

156156
scan_choices_ = ScanChoices::CreateEmpty();
157+
158+
return Status::OK();
157159
}
158160

159161
Status DocRowwiseIterator::Init(
160162
const qlexpr::YQLScanSpec& doc_spec, SkipSeek skip_seek,
161-
UseVariableBloomFilter use_variable_bloom_filter) {
163+
AllowVariableBloomFilter allow_variable_bloom_filter) {
162164
table_type_ = doc_spec.client_type() == YQL_CLIENT_CQL ? TableType::YQL_TABLE_TYPE
163165
: TableType::PGSQL_TABLE_TYPE;
164166
ignore_ttl_ = table_type_ == TableType::PGSQL_TABLE_TYPE;
@@ -172,17 +174,6 @@ Status DocRowwiseIterator::Init(
172174
VLOG(2) << "DocKey Bounds " << DocKey::DebugSliceToString(bounds.lower.AsSlice()) << ", "
173175
<< DocKey::DebugSliceToString(bounds.upper.AsSlice());
174176

175-
// TODO(bogdan): decide if this is a good enough heuristic for using blooms for scans.
176-
const bool is_fixed_point_get =
177-
!bounds.lower.empty() &&
178-
VERIFY_RESULT(HashedOrFirstRangeComponentsEqual(bounds.lower, bounds.upper));
179-
auto bloom_filter = BloomFilterOptions::Inactive();
180-
if (is_fixed_point_get) {
181-
bloom_filter = BloomFilterOptions::Fixed(bounds.lower.AsSlice());
182-
} else if (use_variable_bloom_filter) {
183-
bloom_filter = BloomFilterOptions::Variable();
184-
}
185-
186177
if (is_forward_scan_) {
187178
has_bound_key_ = !bounds.upper.empty();
188179
if (has_bound_key_) {
@@ -206,17 +197,21 @@ Status DocRowwiseIterator::Init(
206197
}
207198
}
208199

209-
InitIterator(bloom_filter, doc_spec.QueryId(), CreateFileFilter(doc_spec));
210-
211200
if (has_bound_key_) {
212201
if (is_forward_scan_) {
213202
bounds.upper = bound_key_;
214203
} else {
215204
bounds.lower = bound_key_;
216205
}
217206
}
218-
scan_choices_ = ScanChoices::Create(
219-
*schema_, doc_spec, bounds, doc_read_context_.table_key_prefix());
207+
208+
scan_choices_ = VERIFY_RESULT(ScanChoices::Create(
209+
doc_read_context_, doc_spec, bounds, doc_read_context_.table_key_prefix(),
210+
allow_variable_bloom_filter));
211+
212+
RETURN_NOT_OK(InitIterator(
213+
scan_choices_->BloomFilterOptions(), doc_spec.QueryId(), CreateFileFilter(doc_spec)));
214+
220215
if (!skip_seek) {
221216
if (is_forward_scan_) {
222217
Seek(bounds.lower);
@@ -314,7 +309,7 @@ Slice DocRowwiseIterator::GetRowKey() const {
314309
return row_key_;
315310
}
316311

317-
void DocRowwiseIterator::SeekTuple(Slice tuple_id) {
312+
void DocRowwiseIterator::SeekTuple(Slice tuple_id, docdb::UpdateFilterKey update_filter_key) {
318313
// If cotable id / colocation id is present in the table schema, then
319314
// we need to prepend it in the tuple key to seek.
320315
if (schema_->has_cotable_id() || schema_->has_colocation_id()) {
@@ -338,7 +333,9 @@ void DocRowwiseIterator::SeekTuple(Slice tuple_id) {
338333
tuple_key_->AppendRawBytes(tuple_id);
339334
tuple_id = *tuple_key_;
340335
}
341-
UpdateFilterKey(tuple_id);
336+
if (update_filter_key) {
337+
UpdateFilterKey(tuple_id);
338+
}
342339
Seek(tuple_id);
343340

344341
row_key_.Clear();
@@ -505,7 +502,7 @@ Result<DocHybridTime> DocRowwiseIterator::GetTableTombstoneTime(Slice root_doc_k
505502
return doc_ht;
506503
}
507504

508-
void DocRowwiseIterator::InitIterator(
505+
Status DocRowwiseIterator::InitIterator(
509506
const BloomFilterOptions& bloom_filter,
510507
const rocksdb::QueryId query_id,
511508
std::shared_ptr<rocksdb::ReadFileFilter> file_filter) {
@@ -534,16 +531,23 @@ void DocRowwiseIterator::InitIterator(
534531
FastBackwardScan{use_fast_backward_scan_});
535532
InitResult();
536533

537-
auto prefix = shared_key_prefix();
538-
if (is_forward_scan_ && has_bound_key_ &&
539-
bound_key_.data().data()[0] != dockv::KeyEntryTypeAsChar::kHighest) {
540-
DCHECK(bound_key_.AsSlice().starts_with(prefix))
541-
<< "Bound key: " << bound_key_.AsSlice().ToDebugHexString()
542-
<< ", prefix: " << prefix.ToDebugHexString();
543-
upperbound_scope_.emplace(bound_key_, db_iter_.get());
544-
} else {
545-
DCHECK(!upperbound().empty());
546-
upperbound_scope_.emplace(upperbound(), db_iter_.get());
534+
const auto scan_choices_has_upperbound =
535+
scan_choices_ &&
536+
VERIFY_RESULT(scan_choices_->PrepareIterator(
537+
*db_iter_, doc_read_context_.table_key_prefix()));
538+
539+
if (!scan_choices_has_upperbound) {
540+
auto prefix = shared_key_prefix();
541+
if (is_forward_scan_ && has_bound_key_ &&
542+
bound_key_.data().data()[0] != dockv::KeyEntryTypeAsChar::kHighest) {
543+
DCHECK(bound_key_.AsSlice().starts_with(prefix))
544+
<< "Bound key: " << bound_key_.AsSlice().ToDebugHexString()
545+
<< ", prefix: " << prefix.ToDebugHexString();
546+
upperbound_scope_.emplace(bound_key_, db_iter_.get());
547+
} else {
548+
DCHECK(!upperbound().empty());
549+
upperbound_scope_.emplace(upperbound(), db_iter_.get());
550+
}
547551
}
548552

549553
if (use_fast_backward_scan_) {
@@ -557,6 +561,7 @@ void DocRowwiseIterator::InitIterator(
557561
}
558562

559563
VLOG_WITH_FUNC(4) << "Initialization done";
564+
return Status::OK();
560565
}
561566

562567
void DocRowwiseIterator::ConfigureForYsql() {
@@ -580,11 +585,13 @@ void DocRowwiseIterator::Refresh(SeekFilter seek_filter) {
580585
}
581586

582587
void DocRowwiseIterator::UpdateFilterKey(Slice user_key_for_filter) {
588+
DCHECK(!scan_choices_ || scan_choices_->BloomFilterOptions().mode() != BloomFilterMode::kInactive)
589+
<< "Mode: " << scan_choices_->BloomFilterOptions().mode();
583590
db_iter_->UpdateFilterKey(user_key_for_filter);
584591
}
585592

586593
void DocRowwiseIterator::Seek(Slice key) {
587-
VLOG_WITH_FUNC(3) << " Seeking to " << key << "/" << dockv::DocKey::DebugSliceToString(key);
594+
VLOG_WITH_FUNC(3) << key << "/" << dockv::DocKey::DebugSliceToString(key);
588595

589596
DCHECK(!done_);
590597

@@ -630,7 +637,7 @@ inline void DocRowwiseIterator::SeekPrevDocKey(Slice key) {
630637
Status DocRowwiseIterator::AdvanceIteratorToNextDesiredRow(bool row_finished,
631638
bool current_fetched_row_skipped) {
632639
if (seek_filter_ == SeekFilter::kAll && !IsFetchedRowStatic() &&
633-
VERIFY_RESULT(scan_choices_->AdvanceToNextRow(&row_key_, db_iter_.get(),
640+
VERIFY_RESULT(scan_choices_->AdvanceToNextRow(&row_key_, *db_iter_,
634641
current_fetched_row_skipped))) {
635642
return Status::OK();
636643
}
@@ -702,6 +709,12 @@ Result<bool> DocRowwiseIterator::FetchNextImpl(TableRow table_row) {
702709

703710
const auto& key_data = VERIFY_RESULT_REF(db_iter_->Fetch());
704711
if (!key_data) {
712+
// It could happen that iterator did not find anything because of upper bound limit from
713+
// scan choices. So need to update it and retry.
714+
if (seek_filter_ == SeekFilter::kAll && !IsFetchedRowStatic() &&
715+
VERIFY_RESULT(scan_choices_->AdvanceToNextRow(nullptr, *db_iter_, true))) {
716+
continue;
717+
}
705718
done_ = true;
706719
return false;
707720
}
@@ -744,7 +757,7 @@ Result<bool> DocRowwiseIterator::FetchNextImpl(TableRow table_row) {
744757

745758
bool is_static_column = IsFetchedRowStatic();
746759
if (!is_static_column &&
747-
!VERIFY_RESULT(scan_choices_->InterestedInRow(&row_key_, db_iter_.get()))) {
760+
!VERIFY_RESULT(scan_choices_->InterestedInRow(&row_key_, *db_iter_))) {
748761
continue;
749762
}
750763

src/yb/docdb/doc_rowwise_iterator.h

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@
4141
#include "yb/util/operation_counter.h"
4242
#include "yb/util/status_fwd.h"
4343

44-
namespace yb {
45-
namespace docdb {
44+
namespace yb::docdb {
4645

4746
YB_STRONGLY_TYPED_BOOL(AddTablePrefixToKey);
4847

@@ -71,14 +70,13 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
7170
void SetSchema(const Schema& schema);
7271

7372
// Init scan iterator.
74-
void InitForTableType(
73+
Status InitForTableType(
7574
TableType table_type, Slice sub_doc_key = Slice(), SkipSeek skip_seek = SkipSeek::kFalse,
7675
AddTablePrefixToKey add_table_prefix_to_key = AddTablePrefixToKey::kFalse);
7776
// Init QL read scan.
7877
Status Init(
79-
const qlexpr::YQLScanSpec& spec,
80-
SkipSeek skip_seek = SkipSeek::kFalse,
81-
UseVariableBloomFilter use_variable_bloom_filter = UseVariableBloomFilter::kFalse);
78+
const qlexpr::YQLScanSpec& spec, SkipSeek skip_seek = SkipSeek::kFalse,
79+
AllowVariableBloomFilter allow_variable_bloom_filter = AllowVariableBloomFilter::kFalse);
8280

8381
bool IsFetchedRowStatic() const override;
8482

@@ -90,7 +88,8 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
9088

9189
// Seeks to the given tuple by its id. The tuple id should be the serialized DocKey and without
9290
// the cotable id.
93-
void SeekTuple(Slice tuple_id) override;
91+
void SeekTuple(
92+
Slice tuple_id, UpdateFilterKey update_filter_key = UpdateFilterKey::kTrue) override;
9493

9594
// Returns true if tuple was fetched, false otherwise.
9695
Result<bool> FetchTuple(Slice tuple_id, qlexpr::QLTableRow* row) override;
@@ -174,7 +173,7 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
174173
Slice shared_key_prefix() const;
175174
Slice upperbound() const;
176175

177-
void InitIterator(
176+
Status InitIterator(
178177
const BloomFilterOptions& bloom_filter,
179178
const rocksdb::QueryId query_id = rocksdb::kDefaultQueryId,
180179
std::shared_ptr<rocksdb::ReadFileFilter> file_filter = nullptr);
@@ -241,6 +240,7 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
241240
bool has_bound_key_ = false;
242241
dockv::KeyBytes bound_key_;
243242

243+
std::unique_ptr<IntentAwareIterator> db_iter_;
244244
std::unique_ptr<ScanChoices> scan_choices_;
245245

246246
// We keep the "pending operation" counter incremented for the lifetime of this iterator so that
@@ -276,7 +276,6 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
276276
size_t obsolete_keys_found_ = 0;
277277
size_t obsolete_keys_found_past_cutoff_ = 0;
278278

279-
std::unique_ptr<IntentAwareIterator> db_iter_;
280279
KeyBuffer prefix_buffer_;
281280
std::optional<IntentAwareIteratorUpperboundScope> upperbound_scope_;
282281
std::optional<IntentAwareIteratorLowerboundScope> lowerbound_scope_;
@@ -300,5 +299,4 @@ class DocRowwiseIterator final : public YQLRowwiseIteratorIf {
300299
SeekFilter seek_filter_ = SeekFilter::kAll;
301300
};
302301

303-
} // namespace docdb
304-
} // namespace yb
302+
} // namespace yb::docdb

src/yb/docdb/docdb_fwd.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,9 @@ using DocVectorIndexesPtr = std::shared_ptr<DocVectorIndexes>;
100100
using DocVectorIndexInsertEntries = std::vector<DocVectorIndexInsertEntry>;
101101

102102
YB_STRONGLY_TYPED_BOOL(FastBackwardScan);
103+
YB_STRONGLY_TYPED_BOOL(AllowVariableBloomFilter);
103104
YB_STRONGLY_TYPED_BOOL(IncludeIntents);
104105
YB_STRONGLY_TYPED_BOOL(SkipFlush);
105106
YB_STRONGLY_TYPED_BOOL(SkipSeek);
106-
YB_STRONGLY_TYPED_BOOL(UseVariableBloomFilter);
107107

108108
} // namespace yb::docdb

src/yb/docdb/docdb_rocksdb_util.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "yb/docdb/bounded_rocksdb_iterator.h"
2727
#include "yb/docdb/consensus_frontier.h"
2828
#include "yb/docdb/doc_ql_filefilter.h"
29+
#include "yb/docdb/doc_read_context.h"
2930
#include "yb/docdb/docdb_filter_policy.h"
3031
#include "yb/docdb/docdb_statistics.h"
3132
#include "yb/docdb/intent_aware_iterator.h"
@@ -1107,5 +1108,18 @@ std::shared_ptr<rocksdb::RateLimiter> CreateRocksDBRateLimiter() {
11071108
return nullptr;
11081109
}
11091110

1111+
Result<BloomFilterOptions> BloomFilterOptions::Make(
1112+
const DocReadContext& doc_read_context, Slice lower, Slice upper, bool allow_variable) {
1113+
const bool is_fixed_point_get =
1114+
!lower.empty() && VERIFY_RESULT(doc_read_context.HaveEqualBloomFilterKey(lower, upper));
1115+
if (is_fixed_point_get) {
1116+
return BloomFilterOptions::Fixed(lower);
1117+
}
1118+
if (allow_variable) {
1119+
return BloomFilterOptions::Variable();
1120+
}
1121+
return BloomFilterOptions::Inactive();
1122+
}
1123+
11101124
} // namespace docdb
11111125
} // namespace yb

0 commit comments

Comments
 (0)