Skip to content

Commit b9cd10e

Browse files
authored
feat(scan): support built-in global index search during scan process (#23)
* feat(scan): support built-in global index search during scan process
1 parent 322adf2 commit b9cd10e

File tree

58 files changed

+1576
-467
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+1576
-467
lines changed

include/paimon/defs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,8 @@ struct PAIMON_EXPORT Options {
282282
/// "blob-as-descriptor" - Read and write blob field using blob descriptor rather than blob
283283
/// bytes. Default value is "false".
284284
static const char BLOB_AS_DESCRIPTOR[];
285+
/// "global-index.enabled" - Whether to enable global index for scan. Default value is "true".
286+
static const char GLOBAL_INDEX_ENABLED[];
285287
};
286288

287289
static constexpr int64_t BATCH_WRITE_COMMIT_IDENTIFIER = std::numeric_limits<int64_t>::max();

include/paimon/global_index/bitmap_global_index_result.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,9 @@ class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {
7878
/// bitmap is not actually required. **Not thread-safe**.
7979
Result<const RoaringBitmap64*> GetBitmap() const;
8080

81-
/// @return A shared pointer to a new BitmapGlobalIndexResult instance representing the given
82-
/// inclusive range [from, to].
83-
static std::shared_ptr<BitmapGlobalIndexResult> FromRange(const Range& range);
81+
/// Creates `BitmapGlobalIndexResult` for all row ids in the given ranges.
82+
/// @note Overlapping or unsorted ranges are accepted.
83+
static std::shared_ptr<BitmapGlobalIndexResult> FromRanges(const std::vector<Range>& ranges);
8484

8585
private:
8686
mutable bool initialized_ = false;

include/paimon/global_index/global_index_result.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "paimon/memory/bytes.h"
2424
#include "paimon/memory/memory_pool.h"
2525
#include "paimon/result.h"
26+
#include "paimon/utils/range.h"
2627
#include "paimon/visibility.h"
2728

2829
namespace paimon {
@@ -55,6 +56,9 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
5556
/// Creates a new iterator over the selected global row ids.
5657
virtual Result<std::unique_ptr<Iterator>> CreateIterator() const = 0;
5758

59+
/// Returns non-overlapping, sorted ranges covering all row ids in `GlobalIndexResult`.
60+
Result<std::vector<Range>> ToRanges() const;
61+
5862
/// Computes the logical AND (intersection) between current result and another.
5963
virtual Result<std::shared_ptr<GlobalIndexResult>> And(
6064
const std::shared_ptr<GlobalIndexResult>& other);

include/paimon/global_index/global_index_scan.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class PAIMON_EXPORT GlobalIndexScan {
3737
///
3838
/// @param table_path Root directory of the table.
3939
/// @param snapshot_id Optional snapshot ID to read from; if not provided, uses the latest.
40-
/// @param partitions Optional list of partition specs to restrict the scan scope.
40+
/// @param partitions Optional list of specific partitions to restrict the scan scope.
4141
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
4242
/// If omitted, scans all partitions.
4343
/// @param options Index-specific configuration.
@@ -53,6 +53,16 @@ class PAIMON_EXPORT GlobalIndexScan {
5353
const std::map<std::string, std::string>& options,
5454
const std::shared_ptr<FileSystem>& file_system, const std::shared_ptr<MemoryPool>& pool);
5555

56+
/// Creates a `GlobalIndexScan` instance for the specified table and context.
57+
///
58+
/// @param partition_filters Optional specific partition predicates.
59+
static Result<std::unique_ptr<GlobalIndexScan>> Create(
60+
const std::string& root_path, const std::optional<int64_t>& snapshot_id,
61+
const std::shared_ptr<Predicate>& partition_filters,
62+
const std::map<std::string, std::string>& options,
63+
const std::shared_ptr<FileSystem>& file_system,
64+
const std::shared_ptr<MemoryPool>& memory_pool);
65+
5666
virtual ~GlobalIndexScan() = default;
5767

5868
/// Creates a scanner for the global index over the specified row ID range.

include/paimon/scan_context.h

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@
2323
#include <string>
2424
#include <vector>
2525

26+
#include "paimon/global_index/global_index_result.h"
2627
#include "paimon/predicate/predicate.h"
2728
#include "paimon/result.h"
2829
#include "paimon/type_fwd.h"
29-
#include "paimon/utils/range.h"
3030
#include "paimon/visibility.h"
31-
3231
namespace paimon {
3332
class ScanContextBuilder;
3433
class ScanFilter;
@@ -45,6 +44,7 @@ class PAIMON_EXPORT ScanContext {
4544
public:
4645
ScanContext(const std::string& path, bool is_streaming_mode, std::optional<int32_t> limit,
4746
const std::shared_ptr<ScanFilter>& scan_filter,
47+
const std::shared_ptr<GlobalIndexResult>& global_index_result,
4848
const std::shared_ptr<MemoryPool>& memory_pool,
4949
const std::shared_ptr<Executor>& executor,
5050
const std::map<std::string, std::string>& options);
@@ -77,12 +77,16 @@ class PAIMON_EXPORT ScanContext {
7777
std::shared_ptr<Executor> GetExecutor() const {
7878
return executor_;
7979
}
80+
std::shared_ptr<GlobalIndexResult> GetGlobalIndexResult() const {
81+
return global_index_result_;
82+
}
8083

8184
private:
8285
std::string path_;
8386
bool is_streaming_mode_;
8487
std::optional<int32_t> limit_;
8588
std::shared_ptr<ScanFilter> scan_filters_;
89+
std::shared_ptr<GlobalIndexResult> global_index_result_;
8690
std::shared_ptr<MemoryPool> memory_pool_;
8791
std::shared_ptr<Executor> executor_;
8892
std::map<std::string, std::string> options_;
@@ -93,11 +97,10 @@ class PAIMON_EXPORT ScanFilter {
9397
public:
9498
ScanFilter(const std::shared_ptr<Predicate>& predicate,
9599
const std::vector<std::map<std::string, std::string>>& partition_filters,
96-
const std::optional<int32_t>& bucket_filter, const std::vector<Range>& row_ranges)
100+
const std::optional<int32_t>& bucket_filter)
97101
: predicates_(predicate),
98102
bucket_filter_(bucket_filter),
99-
partition_filters_(partition_filters),
100-
row_ranges_(row_ranges) {}
103+
partition_filters_(partition_filters) {}
101104

102105
std::shared_ptr<Predicate> GetPredicate() const {
103106
return predicates_;
@@ -109,15 +112,10 @@ class PAIMON_EXPORT ScanFilter {
109112
return partition_filters_;
110113
}
111114

112-
const std::vector<Range>& GetRowRanges() const {
113-
return row_ranges_;
114-
}
115-
116115
private:
117116
std::shared_ptr<Predicate> predicates_;
118117
std::optional<int32_t> bucket_filter_;
119118
std::vector<std::map<std::string, std::string>> partition_filters_;
120-
std::vector<Range> row_ranges_;
121119
};
122120

123121
/// `ScanContextBuilder` used to build a `ScanContext`, has input validation.
@@ -138,10 +136,11 @@ class PAIMON_EXPORT ScanContextBuilder {
138136
/// Set a predicate for filtering data.
139137
ScanContextBuilder& SetPredicate(const std::shared_ptr<Predicate>& predicate);
140138

141-
/// Specify the row id ranges for scan. This is usually used to read specific rows in
142-
/// data-evolution mode. File ranges that do not have any intersection with range_ids will be
143-
/// filtered. If not set, all rows are returned
144-
ScanContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);
139+
/// Sets the result of a global index search (e.g., row ids (may with scores) from a distributed
140+
/// index lookup). This is used to push down index-filtered row ids into the scan for efficient
141+
/// data retrieval.
142+
ScanContextBuilder& SetGlobalIndexResult(
143+
const std::shared_ptr<GlobalIndexResult>& global_index_result);
145144
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
146145
/// the options in table schema.
147146
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);

include/paimon/utils/range.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ struct PAIMON_EXPORT Range {
4646
/// Computes the set intersection of two collections of disjoint, sorted ranges.
4747
static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);
4848

49+
/// Excludes the given ranges from this range and returns the remaining ranges.
50+
///
51+
/// For example, if this range is [0, 10000] and ranges to exclude are [1000, 2000], [3000,
52+
/// 4000], [5000, 6000], then the result is [0, 999], [2001, 2999], [4001, 4999], [6001, 10000].
53+
///
54+
/// @param ranges The ranges to exclude (can be unsorted and overlapping).
55+
/// @return The remaining ranges after exclusion.
56+
std::vector<Range> Exclude(const std::vector<Range>& ranges) const;
57+
4958
bool operator==(const Range& other) const;
5059
bool operator<(const Range& other) const;
5160

src/paimon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ set(PAIMON_CORE_SRCS
241241
core/table/source/startup_mode.cpp
242242
core/table/source/table_read.cpp
243243
core/table/source/table_scan.cpp
244+
core/table/source/data_evolution_batch_scan.cpp
244245
core/utils/field_mapping.cpp
245246
core/utils/fields_comparator.cpp
246247
core/utils/file_store_path_factory.cpp

src/paimon/common/defs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,5 @@ const char Options::ROW_TRACKING_ENABLED[] = "row-tracking.enabled";
7979
const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled";
8080
const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name";
8181
const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor";
82-
82+
const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled";
8383
} // namespace paimon

src/paimon/common/global_index/bitmap_global_index_result.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,13 @@ std::string BitmapGlobalIndexResult::ToString() const {
7474
return bitmap.value()->ToString();
7575
}
7676

77-
std::shared_ptr<BitmapGlobalIndexResult> BitmapGlobalIndexResult::FromRange(const Range& range) {
78-
BitmapGlobalIndexResult::BitmapSupplier supplier = [range]() -> Result<RoaringBitmap64> {
77+
std::shared_ptr<BitmapGlobalIndexResult> BitmapGlobalIndexResult::FromRanges(
78+
const std::vector<Range>& ranges) {
79+
BitmapGlobalIndexResult::BitmapSupplier supplier = [ranges]() -> Result<RoaringBitmap64> {
7980
RoaringBitmap64 bitmap;
80-
bitmap.AddRange(range.from, range.to + 1);
81+
for (const auto& range : ranges) {
82+
bitmap.AddRange(range.from, range.to + 1);
83+
}
8184
return bitmap;
8285
};
8386
return std::make_shared<BitmapGlobalIndexResult>(supplier);

src/paimon/common/global_index/bitmap_global_index_result_test.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,14 +164,18 @@ TEST_F(BitmapGlobalIndexResultTest, TestInvalidBitmapResult) {
164164
ASSERT_TRUE(result->ToString().find("Invalid: invalid supplier") != std::string::npos);
165165
}
166166

167-
TEST_F(BitmapGlobalIndexResultTest, TestFromRange) {
167+
TEST_F(BitmapGlobalIndexResultTest, TestFromRanges) {
168168
{
169-
auto result = BitmapGlobalIndexResult::FromRange(Range(0, 5));
169+
auto result = BitmapGlobalIndexResult::FromRanges({Range(0, 5)});
170170
ASSERT_EQ(result->ToString(), "{0,1,2,3,4,5}");
171171
}
172172
{
173-
auto result = BitmapGlobalIndexResult::FromRange(Range(10, 10));
173+
auto result = BitmapGlobalIndexResult::FromRanges({Range(10, 10)});
174174
ASSERT_EQ(result->ToString(), "{10}");
175175
}
176+
{
177+
auto result = BitmapGlobalIndexResult::FromRanges({Range(0, 5), Range(10, 10)});
178+
ASSERT_EQ(result->ToString(), "{0,1,2,3,4,5,10}");
179+
}
176180
}
177181
} // namespace paimon::test

0 commit comments

Comments
 (0)