Skip to content

Commit 014cbe9

Browse files
authored
feat: Add vector search support to DataEvolutionBatchScan and rename topk to vector search (#48)
1 parent 39fea58 commit 014cbe9

31 files changed

+760
-342
lines changed

include/paimon/global_index/bitmap_topk_global_index_result.h renamed to include/paimon/global_index/bitmap_vector_search_global_index_result.h

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,25 @@
2626
#include "paimon/visibility.h"
2727

2828
namespace paimon {
29-
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
30-
/// with an array of associated relevance scores.
29+
/// Represents a vector search global index result that combines a Roaring bitmap of candidate row
30+
/// ids with an array of associated relevance scores.
3131
///
32-
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
32+
/// **Important Ordering Note**: Despite inheriting from VectorSearchGlobalIndexResult, the results
33+
/// are
3334
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
3435
/// **ascending row id**. This design enables efficient merging and set operations while preserving
3536
/// row id-to-score mapping.
36-
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
37+
class PAIMON_EXPORT BitmapVectorSearchGlobalIndexResult : public VectorSearchGlobalIndexResult {
3738
public:
38-
BitmapTopKGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
39+
BitmapVectorSearchGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
3940
: bitmap_(std::move(bitmap)), scores_(std::move(scores)) {
4041
assert(static_cast<size_t>(bitmap_.Cardinality()) == scores_.size());
4142
}
4243

43-
class TopKIterator : public TopKGlobalIndexResult::TopKIterator {
44+
class VectorSearchIterator : public VectorSearchGlobalIndexResult::VectorSearchIterator {
4445
public:
45-
TopKIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
46-
const float* scores)
46+
VectorSearchIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
47+
const float* scores)
4748
: bitmap_(bitmap), iter_(std::move(iter)), scores_(scores) {}
4849

4950
bool HasNext() const override {
@@ -65,8 +66,8 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
6566

6667
Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override;
6768

68-
Result<std::unique_ptr<TopKGlobalIndexResult::TopKIterator>> CreateTopKIterator()
69-
const override;
69+
Result<std::unique_ptr<VectorSearchGlobalIndexResult::VectorSearchIterator>>
70+
CreateVectorSearchIterator() const override;
7071

7172
Result<std::shared_ptr<GlobalIndexResult>> And(
7273
const std::shared_ptr<GlobalIndexResult>& other) override;

include/paimon/global_index/global_index_reader.h

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "paimon/global_index/global_index_result.h"
2424
#include "paimon/predicate/function_visitor.h"
25+
#include "paimon/predicate/vector_search.h"
2526
#include "paimon/visibility.h"
2627

2728
namespace paimon {
@@ -36,36 +37,13 @@ namespace paimon {
3637
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
3738
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
3839
public:
39-
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
40-
/// It operates solely on **local row ids** and is typically driven by other global index, such
41-
/// as bitmap, or range index. This filter enables early pruning of irrelevant candidates (e.g.,
42-
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
43-
/// include the row in Top-K computation; false to exclude it.
44-
///
45-
/// @note Must be thread-safe.
46-
using TopKPreFilter = std::function<bool(int64_t)>;
47-
48-
/// VisitTopK performs approximate top-k similarity search.
49-
///
50-
/// @param k Number of top results to return.
51-
/// @param query The query vector (must match the dimensionality of the indexed vectors).
52-
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
53-
/// global index
54-
/// structures (e.g., bitmap index) for efficient candidate pruning.
55-
/// @param predicate A runtime filtering condition that may involve graph traversal of
56-
/// structured attributes. **Using this parameter often yields better
57-
/// filtering accuracy** because during index construction, the underlying
58-
/// graph was built with explicit consideration of field connectivity (e.g.,
59-
/// relationships between attributes). As a result, predicates can leverage
60-
/// this pre-established semantic structure to perform more meaningful and
61-
/// context-aware filtering at query time.
62-
/// @note All fields referenced in the predicate must have been materialized
63-
/// in the index during build to ensure availability.
64-
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
40+
/// VisitVectorSearch performs approximate vector similarity search.
41+
/// @note `VisitVectorSearch` is thread-safe (not coroutine-safe) while the other `VisitXXX` methods are not
6542
/// thread-safe.
66-
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
67-
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
68-
const std::shared_ptr<Predicate>& predicate) = 0;
43+
/// @warning `VisitVectorSearch` may return error status when it is incorrectly invoked (e.g.,
44+
/// calling `VisitVectorSearch` on a BitmapGlobalIndexReader).
45+
virtual Result<std::shared_ptr<VectorSearchGlobalIndexResult>> VisitVectorSearch(
46+
const std::shared_ptr<VectorSearch>& vector_search) = 0;
6947
};
7048

7149
} // namespace paimon

include/paimon/global_index/global_index_result.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
7676
/// Serializes a GlobalIndexResult object into a byte array.
7777
///
7878
/// @note This method only supports the following concrete implementations:
79-
/// - BitmapTopKGlobalIndexResult
79+
/// - BitmapVectorSearchGlobalIndexResult
8080
/// - BitmapGlobalIndexResult
8181
///
8282
/// @param global_index_result The GlobalIndexResult instance to serialize (must not be null).
@@ -91,7 +91,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
9191
///
9292
/// @note The concrete type of the deserialized object is determined by metadata
9393
/// embedded in the buffer. Currently, only the following types are supported:
94-
/// - BitmapTopKGlobalIndexResult
94+
/// - BitmapVectorSearchGlobalIndexResult
9595
/// - BitmapGlobalIndexResult
9696
///
9797
/// @param buffer Pointer to the serialized byte data (must not be null).
@@ -106,18 +106,18 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
106106
static constexpr int32_t VERSION = 1;
107107
};
108108

109-
/// Represents the result of a Top-K query against a global index.
110-
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
109+
/// Represents the result of a vector search query against a global index.
110+
/// This class encapsulates a set of search candidates (row id + score pairs) and provides
111111
/// an iterator interface to traverse them.
112-
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
112+
class PAIMON_EXPORT VectorSearchGlobalIndexResult : public GlobalIndexResult {
113113
public:
114-
/// An iterator over the top-K results, returning (row_id, score) pairs.
114+
/// An iterator over the vector search results, returning (row_id, score) pairs.
115115
///
116116
/// @note The results are **NOT sorted by score**. Instead, they are returned in **ascending
117117
/// order of row_id**.
118-
class TopKIterator {
118+
class VectorSearchIterator {
119119
public:
120-
virtual ~TopKIterator() = default;
120+
virtual ~VectorSearchIterator() = default;
121121

122122
/// Checks whether more row ids are available.
123123
virtual bool HasNext() const = 0;
@@ -132,7 +132,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
132132
virtual std::pair<int64_t, float> NextWithScore() = 0;
133133
};
134134

135-
/// Creates a new iterator for traversing the Top-K results.
136-
virtual Result<std::unique_ptr<TopKIterator>> CreateTopKIterator() const = 0;
135+
/// Creates a new iterator for traversing the vector search results.
136+
virtual Result<std::unique_ptr<VectorSearchIterator>> CreateVectorSearchIterator() const = 0;
137137
};
138138
} // namespace paimon

include/paimon/global_index/row_range_global_index_scanner.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
5252
/// - Successful with several readers if the indexes exist and load correctly;
5353
/// - Successful with an empty vector if no index was built for the given field;
5454
/// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported
55-
/// format) or the predicate method was incorrectly invoked (e.g., VisitTopK was invoked
56-
/// incorrectly).
55+
/// format).
5756
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
5857
const std::string& field_name) const = 0;
5958
};
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>
22+
23+
#include "paimon/predicate/predicate.h"
24+
#include "paimon/visibility.h"
25+
26+
namespace paimon {
27+
/// `VectorSearch` to perform vector similarity search.
28+
struct PAIMON_EXPORT VectorSearch {
29+
/// `PreFilter`: A lightweight pre-filtering function applied **before** similarity
30+
/// scoring. It operates solely on **local row ids** and is typically driven by other global
31+
/// index, such as bitmap, or range index. This filter enables early pruning of irrelevant
32+
/// candidates (e.g., "only consider rows with label X"), significantly reducing the search
33+
/// space. Returns true to include the row in vector search process; false to exclude it.
34+
///
35+
/// @note Must be thread-safe.
36+
using PreFilter = std::function<bool(int64_t)>;
37+
38+
VectorSearch(const std::string& _field_name, int32_t _limit, const std::vector<float>& _query,
39+
PreFilter _pre_filter, const std::shared_ptr<Predicate>& _predicate)
40+
: field_name(_field_name),
41+
limit(_limit),
42+
query(_query),
43+
pre_filter(_pre_filter),
44+
predicate(_predicate) {}
45+
46+
std::shared_ptr<VectorSearch> ReplacePreFilter(PreFilter _pre_filter) const {
47+
return std::make_shared<VectorSearch>(field_name, limit, query, _pre_filter, predicate);
48+
}
49+
50+
/// Search field name.
51+
std::string field_name;
52+
/// Number of top results to return.
53+
int32_t limit;
54+
/// The query vector (must match the dimensionality of the indexed vectors).
55+
std::vector<float> query;
56+
/// A pre-filter based on **local row ids**, implemented by leveraging other global index
57+
std::function<bool(int64_t)> pre_filter;
58+
/// A runtime filtering condition that may involve graph traversal of
59+
/// structured attributes. **Using this parameter often yields better
60+
/// filtering accuracy** because during index construction, the underlying
61+
/// graph was built with explicit consideration of field connectivity (e.g.,
62+
/// relationships between attributes). As a result, predicates can leverage
63+
/// this pre-established semantic structure to perform more meaningful and
64+
/// context-aware filtering at query time.
65+
/// @note All fields referenced in the predicate must have been materialized
66+
/// in the index during build to ensure availability.
67+
std::shared_ptr<Predicate> predicate;
68+
};
69+
} // namespace paimon

include/paimon/scan_context.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
#include "paimon/global_index/global_index_result.h"
2727
#include "paimon/predicate/predicate.h"
28+
#include "paimon/predicate/vector_search.h"
2829
#include "paimon/result.h"
2930
#include "paimon/type_fwd.h"
3031
#include "paimon/visibility.h"
@@ -97,14 +98,19 @@ class PAIMON_EXPORT ScanFilter {
9798
public:
9899
ScanFilter(const std::shared_ptr<Predicate>& predicate,
99100
const std::vector<std::map<std::string, std::string>>& partition_filters,
100-
const std::optional<int32_t>& bucket_filter)
101+
const std::optional<int32_t>& bucket_filter,
102+
const std::shared_ptr<VectorSearch>& vector_search)
101103
: predicates_(predicate),
104+
vector_search_(vector_search),
102105
bucket_filter_(bucket_filter),
103106
partition_filters_(partition_filters) {}
104107

105108
std::shared_ptr<Predicate> GetPredicate() const {
106109
return predicates_;
107110
}
111+
std::shared_ptr<VectorSearch> GetVectorSearch() const {
112+
return vector_search_;
113+
}
108114
std::optional<int32_t> GetBucketFilter() const {
109115
return bucket_filter_;
110116
}
@@ -114,6 +120,7 @@ class PAIMON_EXPORT ScanFilter {
114120

115121
private:
116122
std::shared_ptr<Predicate> predicates_;
123+
std::shared_ptr<VectorSearch> vector_search_;
117124
std::optional<int32_t> bucket_filter_;
118125
std::vector<std::map<std::string, std::string>> partition_filters_;
119126
};
@@ -141,6 +148,9 @@ class PAIMON_EXPORT ScanContextBuilder {
141148
/// data retrieval.
142149
ScanContextBuilder& SetGlobalIndexResult(
143150
const std::shared_ptr<GlobalIndexResult>& global_index_result);
151+
152+
/// Set vector search for similarity search.
153+
ScanContextBuilder& SetVectorSearch(const std::shared_ptr<VectorSearch>& vector_search);
144154
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
145155
/// the options in table schema.
146156
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);

include/paimon/table/source/table_read.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class PAIMON_EXPORT TableRead {
5050
/// @param splits A vector of shared pointers to `Split` instances representing the
5151
/// data to be read.
5252
/// @return A Result containing a unique pointer to the `BatchReader` instance.
53+
/// @note `BatchReader`s created by the same `TableRead` are not thread-safe for
54+
/// concurrent reading.
5355
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
5456
const std::vector<std::shared_ptr<Split>>& splits);
5557

src/paimon/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ set(PAIMON_COMMON_SRCS
4747
common/fs/resolving_file_system.cpp
4848
common/fs/file_system_factory.cpp
4949
common/global_index/complete_index_score_batch_reader.cpp
50-
common/global_index/bitmap_topk_global_index_result.cpp
50+
common/global_index/bitmap_vector_search_global_index_result.cpp
5151
common/global_index/bitmap_global_index_result.cpp
5252
common/global_index/global_index_result.cpp
5353
common/global_index/global_indexer_factory.cpp
@@ -333,7 +333,7 @@ if(PAIMON_BUILD_TESTS)
333333
common/global_index/global_index_result_test.cpp
334334
common/global_index/global_indexer_factory_test.cpp
335335
common/global_index/bitmap_global_index_result_test.cpp
336-
common/global_index/bitmap_topk_global_index_result_test.cpp
336+
common/global_index/bitmap_vector_search_global_index_result_test.cpp
337337
common/global_index/bitmap/bitmap_global_index_test.cpp
338338
common/io/byte_array_input_stream_test.cpp
339339
common/io/data_input_output_stream_test.cpp

src/paimon/common/global_index/bitmap/bitmap_global_index_test.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,10 @@ TEST_F(BitmapGlobalIndexTest, TestStringType) {
216216
// result
217217
CheckResult(reader->VisitGreaterThan(lit_c).value(), {0, 1, 2, 3, 4});
218218

219-
// test visit topk
220-
ASSERT_NOK_WITH_MSG(reader->VisitTopK(10, {1.0f, 2.0f}, nullptr, nullptr),
221-
"FileIndexReaderWrapper is not supposed to handle topk query");
219+
// test visit vector search
220+
ASSERT_NOK_WITH_MSG(reader->VisitVectorSearch(std::make_shared<VectorSearch>(
221+
"f0", 10, std::vector<float>({1.0f, 2.0f}), nullptr, nullptr)),
222+
"FileIndexReaderWrapper is not supposed to handle vector search query");
222223
};
223224

224225
{

0 commit comments

Comments
 (0)