Skip to content

Commit 5a97cbf

Browse files
authored
Merge branch 'alibaba:main' into fix-ci
2 parents 4520797 + b484005 commit 5a97cbf

File tree

66 files changed

+1691
-476
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+1691
-476
lines changed

cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,10 @@ macro(build_avro)
548548

549549
get_target_property(AVRO_ZSTD_INCLUDE_DIR zstd INTERFACE_INCLUDE_DIRECTORIES)
550550
get_filename_component(AVRO_ZSTD_ROOT "${AVRO_ZSTD_INCLUDE_DIR}" DIRECTORY)
551+
552+
get_target_property(AVRO_ZLIB_INCLUDE_DIR zlib INTERFACE_INCLUDE_DIRECTORIES)
553+
get_filename_component(AVRO_ZLIB_ROOT "${AVRO_ZLIB_INCLUDE_DIR}" DIRECTORY)
554+
551555
set(AVRO_CMAKE_CXX_FLAGS "${EP_CXX_FLAGS} -Wno-error")
552556
set(AVRO_CMAKE_C_FLAGS "${EP_C_FLAGS} -Wno-error")
553557

@@ -558,7 +562,7 @@ macro(build_avro)
558562
"-DCMAKE_C_FLAGS=${AVRO_CMAKE_C_FLAGS}"
559563
"-DAVRO_BUILD_TESTS=OFF"
560564
"-DAVRO_BUILD_EXECUTABLES=OFF"
561-
"-DZLIB_ROOT=${THIRDPARTY_ZLIB_ROOT}"
565+
"-DZLIB_ROOT=${AVRO_ZLIB_ROOT}"
562566
"-Dzstd_ROOT=${AVRO_ZSTD_ROOT}"
563567
"-DSnappy_ROOT=${AVRO_SNAPPY_ROOT}")
564568
externalproject_add(avro_ep

include/paimon/defs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,8 @@ struct PAIMON_EXPORT Options {
282282
/// "blob-as-descriptor" - Read and write blob field using blob descriptor rather than blob
283283
/// bytes. Default value is "false".
284284
static const char BLOB_AS_DESCRIPTOR[];
285+
/// "global-index.enabled" - Whether to enable global index for scan. Default value is "true".
286+
static const char GLOBAL_INDEX_ENABLED[];
285287
};
286288

287289
static constexpr int64_t BATCH_WRITE_COMMIT_IDENTIFIER = std::numeric_limits<int64_t>::max();

include/paimon/global_index/bitmap_global_index_result.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,9 @@ class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {
7878
/// bitmap is not actually required. **Not thread-safe**.
7979
Result<const RoaringBitmap64*> GetBitmap() const;
8080

81-
/// @return A shared pointer to a new BitmapGlobalIndexResult instance representing the given
82-
/// inclusive range [from, to].
83-
static std::shared_ptr<BitmapGlobalIndexResult> FromRange(const Range& range);
81+
/// Creates `BitmapGlobalIndexResult` for all row ids in the given ranges.
82+
/// @note Overlapping or unsorted ranges are accepted.
83+
static std::shared_ptr<BitmapGlobalIndexResult> FromRanges(const std::vector<Range>& ranges);
8484

8585
private:
8686
mutable bool initialized_ = false;

include/paimon/global_index/global_index_result.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "paimon/memory/bytes.h"
2424
#include "paimon/memory/memory_pool.h"
2525
#include "paimon/result.h"
26+
#include "paimon/utils/range.h"
2627
#include "paimon/visibility.h"
2728

2829
namespace paimon {
@@ -55,6 +56,9 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
5556
/// Creates a new iterator over the selected global row ids.
5657
virtual Result<std::unique_ptr<Iterator>> CreateIterator() const = 0;
5758

59+
/// Returns non-overlapping, sorted ranges covering all row ids in `GlobalIndexResult`.
60+
Result<std::vector<Range>> ToRanges() const;
61+
5862
/// Computes the logical AND (intersection) between current result and another.
5963
virtual Result<std::shared_ptr<GlobalIndexResult>> And(
6064
const std::shared_ptr<GlobalIndexResult>& other);

include/paimon/global_index/global_index_scan.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class PAIMON_EXPORT GlobalIndexScan {
3737
///
3838
/// @param table_path Root directory of the table.
3939
/// @param snapshot_id Optional snapshot ID to read from; if not provided, uses the latest.
40-
/// @param partitions Optional list of partition specs to restrict the scan scope.
40+
/// @param partitions Optional list of specific partitions to restrict the scan scope.
4141
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
4242
/// If omitted, scans all partitions.
4343
/// @param options Index-specific configuration.
@@ -53,6 +53,16 @@ class PAIMON_EXPORT GlobalIndexScan {
5353
const std::map<std::string, std::string>& options,
5454
const std::shared_ptr<FileSystem>& file_system, const std::shared_ptr<MemoryPool>& pool);
5555

56+
/// Creates a `GlobalIndexScan` instance for the specified table and context.
57+
///
58+
/// @param partition_filters Optional specific partition predicates.
59+
static Result<std::unique_ptr<GlobalIndexScan>> Create(
60+
const std::string& root_path, const std::optional<int64_t>& snapshot_id,
61+
const std::shared_ptr<Predicate>& partition_filters,
62+
const std::map<std::string, std::string>& options,
63+
const std::shared_ptr<FileSystem>& file_system,
64+
const std::shared_ptr<MemoryPool>& memory_pool);
65+
5666
virtual ~GlobalIndexScan() = default;
5767

5868
/// Creates a scanner for the global index over the specified row ID range.

include/paimon/global_index/row_range_global_index_scanner.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,18 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
5252
/// format).
5353
virtual Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
5454
const std::string& field_name, const std::string& index_type) const = 0;
55+
56+
/// Creates several `GlobalIndexReader`s for a specific field within this range.
57+
///
58+
/// @param field_name Name of the indexed column.
59+
/// @return A `Result` that is:
60+
/// - Successful with several readers if the indexes exist and load correctly;
61+
/// - Successful with an empty vector if no index was built for the given field;
62+
/// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported
63+
/// format) or the predicate method was incorrectly invoked (e.g., VisitTopK was invoked
64+
/// incorrectly).
65+
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
66+
const std::string& field_name) const = 0;
5567
};
5668

5769
} // namespace paimon

include/paimon/scan_context.h

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@
2323
#include <string>
2424
#include <vector>
2525

26+
#include "paimon/global_index/global_index_result.h"
2627
#include "paimon/predicate/predicate.h"
2728
#include "paimon/result.h"
2829
#include "paimon/type_fwd.h"
29-
#include "paimon/utils/range.h"
3030
#include "paimon/visibility.h"
31-
3231
namespace paimon {
3332
class ScanContextBuilder;
3433
class ScanFilter;
@@ -45,6 +44,7 @@ class PAIMON_EXPORT ScanContext {
4544
public:
4645
ScanContext(const std::string& path, bool is_streaming_mode, std::optional<int32_t> limit,
4746
const std::shared_ptr<ScanFilter>& scan_filter,
47+
const std::shared_ptr<GlobalIndexResult>& global_index_result,
4848
const std::shared_ptr<MemoryPool>& memory_pool,
4949
const std::shared_ptr<Executor>& executor,
5050
const std::map<std::string, std::string>& options);
@@ -77,12 +77,16 @@ class PAIMON_EXPORT ScanContext {
7777
std::shared_ptr<Executor> GetExecutor() const {
7878
return executor_;
7979
}
80+
std::shared_ptr<GlobalIndexResult> GetGlobalIndexResult() const {
81+
return global_index_result_;
82+
}
8083

8184
private:
8285
std::string path_;
8386
bool is_streaming_mode_;
8487
std::optional<int32_t> limit_;
8588
std::shared_ptr<ScanFilter> scan_filters_;
89+
std::shared_ptr<GlobalIndexResult> global_index_result_;
8690
std::shared_ptr<MemoryPool> memory_pool_;
8791
std::shared_ptr<Executor> executor_;
8892
std::map<std::string, std::string> options_;
@@ -93,11 +97,10 @@ class PAIMON_EXPORT ScanFilter {
9397
public:
9498
ScanFilter(const std::shared_ptr<Predicate>& predicate,
9599
const std::vector<std::map<std::string, std::string>>& partition_filters,
96-
const std::optional<int32_t>& bucket_filter, const std::vector<Range>& row_ranges)
100+
const std::optional<int32_t>& bucket_filter)
97101
: predicates_(predicate),
98102
bucket_filter_(bucket_filter),
99-
partition_filters_(partition_filters),
100-
row_ranges_(row_ranges) {}
103+
partition_filters_(partition_filters) {}
101104

102105
std::shared_ptr<Predicate> GetPredicate() const {
103106
return predicates_;
@@ -109,15 +112,10 @@ class PAIMON_EXPORT ScanFilter {
109112
return partition_filters_;
110113
}
111114

112-
const std::vector<Range>& GetRowRanges() const {
113-
return row_ranges_;
114-
}
115-
116115
private:
117116
std::shared_ptr<Predicate> predicates_;
118117
std::optional<int32_t> bucket_filter_;
119118
std::vector<std::map<std::string, std::string>> partition_filters_;
120-
std::vector<Range> row_ranges_;
121119
};
122120

123121
/// `ScanContextBuilder` used to build a `ScanContext`, has input validation.
@@ -138,10 +136,11 @@ class PAIMON_EXPORT ScanContextBuilder {
138136
/// Set a predicate for filtering data.
139137
ScanContextBuilder& SetPredicate(const std::shared_ptr<Predicate>& predicate);
140138

141-
/// Specify the row id ranges for scan. This is usually used to read specific rows in
142-
/// data-evolution mode. File ranges that do not have any intersection with range_ids will be
143-
/// filtered. If not set, all rows are returned
144-
ScanContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);
139+
/// Sets the result of a global index search (e.g., row ids (may with scores) from a distributed
140+
/// index lookup). This is used to push down index-filtered row ids into the scan for efficient
141+
/// data retrieval.
142+
ScanContextBuilder& SetGlobalIndexResult(
143+
const std::shared_ptr<GlobalIndexResult>& global_index_result);
145144
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
146145
/// the options in table schema.
147146
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);

include/paimon/utils/range.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,15 @@ struct PAIMON_EXPORT Range {
4646
/// Computes the set intersection of two collections of disjoint, sorted ranges.
4747
static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);
4848

49+
/// Excludes the given ranges from this range and returns the remaining ranges.
50+
///
51+
/// For example, if this range is [0, 10000] and ranges to exclude are [1000, 2000], [3000,
52+
/// 4000], [5000, 6000], then the result is [0, 999], [2001, 2999], [4001, 4999], [6001, 10000].
53+
///
54+
/// @param ranges The ranges to exclude (can be unsorted and overlapping).
55+
/// @return The remaining ranges after exclusion.
56+
std::vector<Range> Exclude(const std::vector<Range>& ranges) const;
57+
4958
bool operator==(const Range& other) const;
5059
bool operator<(const Range& other) const;
5160

src/paimon/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ set(PAIMON_CORE_SRCS
241241
core/table/source/startup_mode.cpp
242242
core/table/source/table_read.cpp
243243
core/table/source/table_scan.cpp
244+
core/table/source/data_evolution_batch_scan.cpp
244245
core/utils/field_mapping.cpp
245246
core/utils/fields_comparator.cpp
246247
core/utils/file_store_path_factory.cpp

src/paimon/common/defs.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,5 +79,5 @@ const char Options::ROW_TRACKING_ENABLED[] = "row-tracking.enabled";
7979
const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled";
8080
const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name";
8181
const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor";
82-
82+
const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled";
8383
} // namespace paimon

0 commit comments

Comments
 (0)