Skip to content

Commit 34f1fa6

Browse files
authored
refactor(global_index): remove global range awareness from plugin (#30)
1 parent b484005 commit 34f1fa6

37 files changed

+370
-265
lines changed

include/paimon/global_index/bitmap_global_index_result.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
#include "paimon/visibility.h"
2727

2828
namespace paimon {
29-
/// Represents a global index query result that **lazily materializes** its matching row IDs as a
29+
/// Represents a global index query result that **lazily materializes** its matching row ids as a
3030
/// Roaring bitmap. The underlying 64-bit Roaring bitmap is **not constructed during object
3131
/// creation**; instead, it is built on-demand the first time GetBitmap() is called. This design
3232
/// avoids unnecessary computation and memory allocation when the bitmap is not needed (e.g., during
@@ -67,6 +67,8 @@ class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {
6767

6868
Result<bool> IsEmpty() const override;
6969

70+
Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;
71+
7072
std::string ToString() const override;
7173

7274
/// @return A non-owning, const pointer to the bitmap. The returned pointer is valid as long as

include/paimon/global_index/bitmap_topk_global_index_result.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@
2626
#include "paimon/visibility.h"
2727

2828
namespace paimon {
29-
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row IDs
29+
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
3030
/// with an array of associated relevance scores.
3131
///
3232
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
3333
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
34-
/// **ascending row ID**. This design enables efficient merging and set operations while preserving
34+
/// **ascending row id**. This design enables efficient merging and set operations while preserving
3535
/// row id-to-score mapping.
3636
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
3737
public:
@@ -74,16 +74,18 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
7474
Result<std::shared_ptr<GlobalIndexResult>> Or(
7575
const std::shared_ptr<GlobalIndexResult>& other) override;
7676

77+
Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;
78+
7779
Result<bool> IsEmpty() const override;
7880

7981
std::string ToString() const override;
8082

81-
/// @return A non-owning, const pointer to the bitmap. The row IDs in the bitmap are stored in
83+
/// @return A non-owning, const pointer to the bitmap. The row ids in the bitmap are stored in
8284
/// ascending order (as guaranteed by Roaring64 iteration).
8385
Result<const RoaringBitmap64*> GetBitmap() const;
8486

8587
/// @return A const reference to a vector of float scores, where the i-th element corresponds to
86-
/// the i-th row ID when iterating the bitmap in **ascending row ID order**.
88+
/// the i-th row id when iterating the bitmap in **ascending row id order**.
8789
const std::vector<float>& GetScores() const;
8890

8991
private:

include/paimon/global_index/global_index_io_meta.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,17 @@
2525
namespace paimon {
2626
/// Metadata describing a single file entry in a global index.
2727
struct PAIMON_EXPORT GlobalIndexIOMeta {
28-
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, const Range& _row_id_range,
28+
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, int64_t _range_end,
2929
const std::shared_ptr<Bytes>& _metadata)
3030
: file_name(_file_name),
3131
file_size(_file_size),
32-
row_id_range(_row_id_range),
32+
range_end(_range_end),
3333
metadata(_metadata) {}
3434

3535
std::string file_name;
3636
int64_t file_size;
37-
/// The inclusive range of row IDs covered by this file (i.e., [from, to]).
38-
Range row_id_range;
37+
/// The inclusive range end covered by this file (i.e., the last local row id).
38+
int64_t range_end;
3939
/// Optional binary metadata associated with the file, such as serialized
4040
/// secondary index structures or inline index bytes.
4141
/// May be null if no additional metadata is available.

include/paimon/global_index/global_index_reader.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,20 @@
2626

2727
namespace paimon {
2828
/// Reads and evaluates filter predicates against a global file index.
29-
/// `GlobalIndexReader` is an implementation of the `FunctionVisitor` interface
30-
/// specialized to produce `std::shared_ptr<GlobalIndexResult>` objects.
3129
///
3230
/// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`,
3331
/// `VisitIsNull`, etc.) to return index-based results that indicate which
3432
/// rows satisfy the given predicate.
33+
///
34+
/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row
35+
/// ids** that start from 0 — not global row ids in the entire table.
36+
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
3537
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
3638
public:
3739
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
38-
/// It operates solely on row_id and is typically driven by other global index, such as bitmap,
39-
/// or range index. This filter enables early pruning of irrelevant candidates (e.g., "only
40-
/// consider rows with label X"), significantly reducing the search space. Returns true to
40+
/// It operates solely on **local row ids** and is typically driven by another global index, such
41+
/// as a bitmap or range index. This filter enables early pruning of irrelevant candidates (e.g.,
42+
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
4143
/// include the row in Top-K computation; false to exclude it.
4244
///
4345
/// @note Must be thread-safe.
@@ -47,7 +49,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
4749
///
4850
/// @param k Number of top results to return.
4951
/// @param query The query vector (must match the dimensionality of the indexed vectors).
50-
/// @param filter A pre-filter based on row_id, implemented by leveraging other global index
52+
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
53+
/// global index
5154
/// structures (e.g., bitmap index) for efficient candidate pruning.
5255
/// @param predicate A runtime filtering condition that may involve graph traversal of
5356
/// structured attributes. **Using this parameter often yields better
@@ -58,7 +61,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
5861
/// context-aware filtering at query time.
5962
/// @note All fields referenced in the predicate must have been materialized
6063
/// in the index during build to ensure availability.
61-
/// @note `VisitTopK` is thread-safe while other `VisitXXX` is not.
64+
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while the other `VisitXXX` methods are not
65+
/// thread-safe.
6266
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
6367
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
6468
const std::shared_ptr<Predicate>& predicate) = 0;

include/paimon/global_index/global_index_result.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
4444
virtual int64_t Next() = 0;
4545
};
4646

47-
/// Checks whether the global index result contains no matching row IDs.
47+
/// Checks whether the global index result contains no matching row ids.
4848
///
4949
/// @return A `Result<bool>` where:
5050
/// - `true` indicates the result is empty (no matching rows),
@@ -67,6 +67,10 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
6767
virtual Result<std::shared_ptr<GlobalIndexResult>> Or(
6868
const std::shared_ptr<GlobalIndexResult>& other);
6969

70+
/// Adds the given offset to each row id in the current result and returns a new global index
71+
/// result.
72+
virtual Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) = 0;
73+
7074
virtual std::string ToString() const = 0;
7175

7276
/// Serializes a GlobalIndexResult object into a byte array.
@@ -103,7 +107,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
103107
};
104108

105109
/// Represents the result of a Top-K query against a global index.
106-
/// This class encapsulates a set of top-K candidates (row ID + score pairs) and provides
110+
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
107111
/// an iterator interface to traverse them.
108112
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
109113
public:
@@ -115,7 +119,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
115119
public:
116120
virtual ~TopKIterator() = default;
117121

118-
/// Checks whether more row IDs are available.
122+
/// Checks whether more row ids are available.
119123
virtual bool HasNext() const = 0;
120124

121125
/// Retrieves the next (row_id, score) pair and advances the iterator.

include/paimon/global_index/global_index_scan.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class PAIMON_EXPORT GlobalIndexScan {
3636
/// Creates a `GlobalIndexScan` instance for the specified table and context.
3737
///
3838
/// @param table_path Root directory of the table.
39-
/// @param snapshot_id Optional snapshot ID to read from; if not provided, uses the latest.
39+
/// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest.
4040
/// @param partitions Optional list of specific partitions to restrict the scan scope.
4141
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
4242
/// If omitted, scans all partitions.
@@ -65,23 +65,23 @@ class PAIMON_EXPORT GlobalIndexScan {
6565

6666
virtual ~GlobalIndexScan() = default;
6767

68-
/// Creates a scanner for the global index over the specified row ID range.
68+
/// Creates a scanner for the global index over the specified row id range.
6969
///
7070
/// This method instantiates a low-level scanner that can evaluate predicates and
71-
/// retrieve matching row IDs from the global index data corresponding to the given
72-
/// row ID range.
71+
/// retrieve matching row ids from the global index data corresponding to the given
72+
/// row id range.
7373
///
74-
/// @param range The inclusive row ID range [start, end] for which to create the scanner.
74+
/// @param range The inclusive row id range [start, end] for which to create the scanner.
7575
/// The range must be fully covered by existing global index data (from
7676
/// `GetRowRangeList()`).
7777
/// @return A `Result` containing a range-level scanner, or an error if parsing the index metadata fails.
7878
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
7979
const Range& range) = 0;
8080

81-
/// Returns row ID ranges covered by this global index (sorted and non-overlapping
81+
/// Returns row id ranges covered by this global index (sorted and non-overlapping
8282
/// ranges).
8383
///
84-
/// Each `Range` represents a contiguous segment of row IDs for which global index
84+
/// Each `Range` represents a contiguous segment of row ids for which global index
8585
/// data exists. This allows the query engine to parallelize scanning and be aware
8686
/// of ranges that are not covered by any global index.
8787
///

include/paimon/global_index/row_range_global_index_writer.h renamed to include/paimon/global_index/global_index_write_task.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@
2828

2929
namespace paimon {
3030
/// Writes a range-level global index for a specific data split and field.
31-
class PAIMON_EXPORT RowRangeGlobalIndexWriter {
31+
class PAIMON_EXPORT GlobalIndexWriteTask {
3232
public:
33-
RowRangeGlobalIndexWriter() = delete;
34-
~RowRangeGlobalIndexWriter() = delete;
33+
GlobalIndexWriteTask() = delete;
34+
~GlobalIndexWriteTask() = delete;
3535
/// Builds and writes a global index for the specified data range.
3636
///
3737
/// @param table_path Path to the table root directory where index files are stored.

include/paimon/global_index/row_range_global_index_scanner.h

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
#include <memory>
2020
#include <string>
2121

22-
#include "paimon/global_index/global_index_evaluator.h"
2322
#include "paimon/global_index/global_index_reader.h"
2423
#include "paimon/visibility.h"
2524

@@ -29,15 +28,6 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
2928
public:
3029
virtual ~RowRangeGlobalIndexScanner() = default;
3130

32-
/// Creates a `GlobalIndexEvaluator` tailored to this range's index layout.
33-
///
34-
/// The returned evaluator can be used to assess whether a given predicate can be
35-
/// answered using the global index data of this shard (e.g., via bitmap intersection).
36-
///
37-
/// @return A `Result` containing a shared pointer to the evaluator, or an error
38-
/// if the index metadata is invalid or unsupported.
39-
virtual Result<std::shared_ptr<GlobalIndexEvaluator>> CreateIndexEvaluator() const = 0;
40-
4131
/// Creates a `GlobalIndexReader` for a specific field and index type within this range.
4232
///
4333
/// This reader provides low-level access to the serialized index data
@@ -50,6 +40,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
5040
/// - Successful with a null pointer if no index was built for the given field and type;
5141
/// - An error only if loading fails (e.g., file corruption, I/O error, unsupported
5242
/// format).
43+
/// @note All `GlobalIndexResult` objects returned by `GlobalIndexReader` use **local row
44+
/// ids** that start from 0 — not global row ids in the entire table.
5345
virtual Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
5446
const std::string& field_name, const std::string& index_type) const = 0;
5547

include/paimon/schema/schema.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class PAIMON_EXPORT Schema {
4444
virtual std::vector<std::string> FieldNames() const = 0;
4545

4646
/// Get the unique identifier of this table schema.
47-
/// @return The schema ID
47+
/// @return The schema id
4848
virtual int64_t Id() const = 0;
4949

5050
/// Get the list of primary key field names.
@@ -65,8 +65,8 @@ class PAIMON_EXPORT Schema {
6565
/// @return The number of buckets.
6666
virtual int32_t NumBuckets() const = 0;
6767

68-
/// Get the highest field ID assigned in this schema.
69-
/// @return The maximum field ID.
68+
/// Get the highest field id assigned in this schema.
69+
/// @return The maximum field id.
7070
virtual int32_t HighestFieldId() const = 0;
7171

7272
/// Get the table-level options associated with this schema.

include/paimon/utils/bucket_id_calculator.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct ArrowArray;
2929
namespace paimon {
3030
class MemoryPool;
3131

32-
/// Calculator for determining bucket IDs based on the given bucket keys.
32+
/// Calculator for determining bucket ids based on the given bucket keys.
3333
///
3434
/// @note `BucketIdCalculator` is compatible with the Java implementation and uses
3535
/// hash-based distribution to ensure even data distribution across buckets.
@@ -47,10 +47,10 @@ class PAIMON_EXPORT BucketIdCalculator {
4747
/// @param num_buckets Number of buckets.
4848
static Result<std::unique_ptr<BucketIdCalculator>> Create(bool is_pk_table,
4949
int32_t num_buckets);
50-
/// Calculate bucket IDs for the given bucket keys.
50+
/// Calculate bucket ids for the given bucket keys.
5151
/// @param bucket_keys Arrow struct array containing the bucket key values.
5252
/// @param bucket_schema Arrow schema describing the structure of bucket_keys.
53-
/// @param bucket_ids Output array to store calculated bucket IDs.
53+
/// @param bucket_ids Output array to store calculated bucket ids.
5454
/// @note 1. bucket_keys is a struct array, the order of fields needs to be consistent with
5555
/// "bucket-key" options in table schema. 2. bucket_keys and bucket_schema match each other. 3.
5656
/// bucket_ids is allocated enough space, at least >= bucket_keys->length

0 commit comments

Comments
 (0)