Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,25 @@
#include "paimon/visibility.h"

namespace paimon {
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
/// with an array of associated relevance scores.
/// Represents a vector search global index result that combines a Roaring bitmap of candidate row
/// ids with an array of associated relevance scores.
///
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
/// **Important Ordering Note**: Despite inheriting from VectorSearchGlobalIndexResult, the results
/// are
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
/// **ascending row id**. This design enables efficient merging and set operations while preserving
/// row id-to-score mapping.
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
class PAIMON_EXPORT BitmapVectorSearchGlobalIndexResult : public VectorSearchGlobalIndexResult {
public:
BitmapTopKGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
BitmapVectorSearchGlobalIndexResult(RoaringBitmap64&& bitmap, std::vector<float>&& scores)
: bitmap_(std::move(bitmap)), scores_(std::move(scores)) {
assert(static_cast<size_t>(bitmap_.Cardinality()) == scores_.size());
}

class TopKIterator : public TopKGlobalIndexResult::TopKIterator {
class VectorSearchIterator : public VectorSearchGlobalIndexResult::VectorSearchIterator {
public:
TopKIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
const float* scores)
VectorSearchIterator(const RoaringBitmap64* bitmap, RoaringBitmap64::Iterator&& iter,
const float* scores)
: bitmap_(bitmap), iter_(std::move(iter)), scores_(scores) {}

bool HasNext() const override {
Expand All @@ -65,8 +66,8 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {

Result<std::unique_ptr<GlobalIndexResult::Iterator>> CreateIterator() const override;

Result<std::unique_ptr<TopKGlobalIndexResult::TopKIterator>> CreateTopKIterator()
const override;
Result<std::unique_ptr<VectorSearchGlobalIndexResult::VectorSearchIterator>>
CreateVectorSearchIterator() const override;

Result<std::shared_ptr<GlobalIndexResult>> And(
const std::shared_ptr<GlobalIndexResult>& other) override;
Expand Down
36 changes: 7 additions & 29 deletions include/paimon/global_index/global_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "paimon/global_index/global_index_result.h"
#include "paimon/predicate/function_visitor.h"
#include "paimon/predicate/vector_search.h"
#include "paimon/visibility.h"

namespace paimon {
Expand All @@ -36,36 +37,13 @@ namespace paimon {
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
public:
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
/// It operates solely on **local row ids** and is typically driven by other global index, such
/// as bitmap, or range index. This filter enables early pruning of irrelevant candidates (e.g.,
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
/// include the row in Top-K computation; false to exclude it.
///
/// @note Must be thread-safe.
using TopKPreFilter = std::function<bool(int64_t)>;

/// VisitTopK performs approximate top-k similarity search.
///
/// @param k Number of top results to return.
/// @param query The query vector (must match the dimensionality of the indexed vectors).
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
/// global index
/// structures (e.g., bitmap index) for efficient candidate pruning.
/// @param predicate A runtime filtering condition that may involve graph traversal of
/// structured attributes. **Using this parameter often yields better
/// filtering accuracy** because during index construction, the underlying
/// graph was built with explicit consideration of field connectivity (e.g.,
/// relationships between attributes). As a result, predicates can leverage
/// this pre-established semantic structure to perform more meaningful and
/// context-aware filtering at query time.
/// @note All fields referenced in the predicate must have been materialized
/// in the index during build to ensure availability.
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
/// VisitVectorSearch performs approximate vector similarity search.
/// @note `VisitVectorSearch` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
/// thread-safe.
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
const std::shared_ptr<Predicate>& predicate) = 0;
/// @warning `VisitVectorSearch` may return an error status when it is invoked incorrectly (e.g.,
/// when it is called on a BitmapGlobalIndexReader).
virtual Result<std::shared_ptr<VectorSearchGlobalIndexResult>> VisitVectorSearch(
const std::shared_ptr<VectorSearch>& vector_search) = 0;
};

} // namespace paimon
20 changes: 10 additions & 10 deletions include/paimon/global_index/global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
/// Serializes a GlobalIndexResult object into a byte array.
///
/// @note This method only supports the following concrete implementations:
/// - BitmapTopKGlobalIndexResult
/// - BitmapVectorSearchGlobalIndexResult
/// - BitmapGlobalIndexResult
///
/// @param global_index_result The GlobalIndexResult instance to serialize (must not be null).
Expand All @@ -91,7 +91,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
///
/// @note The concrete type of the deserialized object is determined by metadata
/// embedded in the buffer. Currently, only the following types are supported:
/// - BitmapTopKGlobalIndexResult
/// - BitmapVectorSearchGlobalIndexResult
/// - BitmapGlobalIndexResult
///
/// @param buffer Pointer to the serialized byte data (must not be null).
Expand All @@ -106,18 +106,18 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
static constexpr int32_t VERSION = 1;
};

/// Represents the result of a Top-K query against a global index.
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
/// Represents the result of a vector search query against a global index.
/// This class encapsulates a set of search candidates (row id + score pairs) and provides
/// an iterator interface to traverse them.
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
class PAIMON_EXPORT VectorSearchGlobalIndexResult : public GlobalIndexResult {
public:
/// An iterator over the top-K results, returning (row_id, score) pairs.
/// An iterator over the vector search results, returning (row_id, score) pairs.
///
/// @note The results are **NOT sorted by score**. Instead, they are returned in **ascending
/// order of row_id**.
class TopKIterator {
class VectorSearchIterator {
public:
virtual ~TopKIterator() = default;
virtual ~VectorSearchIterator() = default;

/// Checks whether more row ids are available.
virtual bool HasNext() const = 0;
Expand All @@ -132,7 +132,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
virtual std::pair<int64_t, float> NextWithScore() = 0;
};

/// Creates a new iterator for traversing the Top-K results.
virtual Result<std::unique_ptr<TopKIterator>> CreateTopKIterator() const = 0;
/// Creates a new iterator for traversing the vector search results.
virtual Result<std::unique_ptr<VectorSearchIterator>> CreateVectorSearchIterator() const = 0;
};
} // namespace paimon
3 changes: 1 addition & 2 deletions include/paimon/global_index/row_range_global_index_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
/// - Successful with several readers if the indexes exist and load correctly;
/// - Successful with an empty vector if no index was built for the given field;
/// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported
/// format) or the predicate method was incorrectly invoked (e.g., VisitTopK was invoked
/// incorrectly).
/// format).
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
const std::string& field_name) const = 0;
};
Expand Down
69 changes: 69 additions & 0 deletions include/paimon/predicate/vector_search.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright 2026-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "paimon/predicate/predicate.h"
#include "paimon/visibility.h"

namespace paimon {
/// `VectorSearch` to perform vector similarity search.
struct PAIMON_EXPORT VectorSearch {
/// `PreFilter`: A lightweight pre-filtering function applied **before** similarity
/// scoring. It operates solely on **local row ids** and is typically driven by other global
/// index, such as bitmap, or range index. This filter enables early pruning of irrelevant
/// candidates (e.g., "only consider rows with label X"), significantly reducing the search
/// space. Returns true to include the row in vector search process; false to exclude it.
///
/// @note Must be thread-safe.
using PreFilter = std::function<bool(int64_t)>;

VectorSearch(const std::string& _field_name, int32_t _limit, const std::vector<float>& _query,
PreFilter _pre_filter, const std::shared_ptr<Predicate>& _predicate)
: field_name(_field_name),
limit(_limit),
query(_query),
pre_filter(_pre_filter),
predicate(_predicate) {}

std::shared_ptr<VectorSearch> ReplacePreFilter(PreFilter _pre_filter) const {
return std::make_shared<VectorSearch>(field_name, limit, query, _pre_filter, predicate);
}

/// Search field name.
std::string field_name;
/// Number of top results to return.
int32_t limit;
/// The query vector (must match the dimensionality of the indexed vectors).
std::vector<float> query;
/// A pre-filter based on **local row ids**, implemented by leveraging other global index
std::function<bool(int64_t)> pre_filter;
/// A runtime filtering condition that may involve graph traversal of
/// structured attributes. **Using this parameter often yields better
/// filtering accuracy** because during index construction, the underlying
/// graph was built with explicit consideration of field connectivity (e.g.,
/// relationships between attributes). As a result, predicates can leverage
/// this pre-established semantic structure to perform more meaningful and
/// context-aware filtering at query time.
/// @note All fields referenced in the predicate must have been materialized
/// in the index during build to ensure availability.
std::shared_ptr<Predicate> predicate;
};
} // namespace paimon
12 changes: 11 additions & 1 deletion include/paimon/scan_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "paimon/global_index/global_index_result.h"
#include "paimon/predicate/predicate.h"
#include "paimon/predicate/vector_search.h"
#include "paimon/result.h"
#include "paimon/type_fwd.h"
#include "paimon/visibility.h"
Expand Down Expand Up @@ -97,14 +98,19 @@ class PAIMON_EXPORT ScanFilter {
public:
ScanFilter(const std::shared_ptr<Predicate>& predicate,
const std::vector<std::map<std::string, std::string>>& partition_filters,
const std::optional<int32_t>& bucket_filter)
const std::optional<int32_t>& bucket_filter,
const std::shared_ptr<VectorSearch>& vector_search)
: predicates_(predicate),
vector_search_(vector_search),
bucket_filter_(bucket_filter),
partition_filters_(partition_filters) {}

std::shared_ptr<Predicate> GetPredicate() const {
return predicates_;
}
std::shared_ptr<VectorSearch> GetVectorSearch() const {
return vector_search_;
}
std::optional<int32_t> GetBucketFilter() const {
return bucket_filter_;
}
Expand All @@ -114,6 +120,7 @@ class PAIMON_EXPORT ScanFilter {

private:
std::shared_ptr<Predicate> predicates_;
std::shared_ptr<VectorSearch> vector_search_;
std::optional<int32_t> bucket_filter_;
std::vector<std::map<std::string, std::string>> partition_filters_;
};
Expand Down Expand Up @@ -141,6 +148,9 @@ class PAIMON_EXPORT ScanContextBuilder {
/// data retrieval.
ScanContextBuilder& SetGlobalIndexResult(
const std::shared_ptr<GlobalIndexResult>& global_index_result);

/// Set vector search for similarity search.
ScanContextBuilder& SetVectorSearch(const std::shared_ptr<VectorSearch>& vector_search);
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
/// the options in table schema.
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);
Expand Down
2 changes: 2 additions & 0 deletions include/paimon/table/source/table_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class PAIMON_EXPORT TableRead {
/// @param splits A vector of shared pointers to `Split` instances representing the
/// data to be read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
/// @note `BatchReader`s created by the same `TableRead` are not thread-safe for
/// concurrent reading.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::vector<std::shared_ptr<Split>>& splits);

Expand Down
4 changes: 2 additions & 2 deletions src/paimon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ set(PAIMON_COMMON_SRCS
common/fs/resolving_file_system.cpp
common/fs/file_system_factory.cpp
common/global_index/complete_index_score_batch_reader.cpp
common/global_index/bitmap_topk_global_index_result.cpp
common/global_index/bitmap_vector_search_global_index_result.cpp
common/global_index/bitmap_global_index_result.cpp
common/global_index/global_index_result.cpp
common/global_index/global_indexer_factory.cpp
Expand Down Expand Up @@ -333,7 +333,7 @@ if(PAIMON_BUILD_TESTS)
common/global_index/global_index_result_test.cpp
common/global_index/global_indexer_factory_test.cpp
common/global_index/bitmap_global_index_result_test.cpp
common/global_index/bitmap_topk_global_index_result_test.cpp
common/global_index/bitmap_vector_search_global_index_result_test.cpp
common/global_index/bitmap/bitmap_global_index_test.cpp
common/io/byte_array_input_stream_test.cpp
common/io/data_input_output_stream_test.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,10 @@ TEST_F(BitmapGlobalIndexTest, TestStringType) {
// result
CheckResult(reader->VisitGreaterThan(lit_c).value(), {0, 1, 2, 3, 4});

// test visit topk
ASSERT_NOK_WITH_MSG(reader->VisitTopK(10, {1.0f, 2.0f}, nullptr, nullptr),
"FileIndexReaderWrapper is not supposed to handle topk query");
// test visit vector search
ASSERT_NOK_WITH_MSG(reader->VisitVectorSearch(std::make_shared<VectorSearch>(
"f0", 10, std::vector<float>({1.0f, 2.0f}), nullptr, nullptr)),
"FileIndexReaderWrapper is not supposed to handle vector search query");
};

{
Expand Down
Loading
Loading