Skip to content

Commit b0de2f9

Browse files
lszskyelxy-9602
andauthored
feat: Add IndexSplit and support returning index scores in read process (#11)
* feat: add IndexedSplit * feat: support returning index scores in read process --------- Co-authored-by: lxy <[email protected]>
1 parent 7169cb9 commit b0de2f9

File tree

74 files changed

+2256
-962
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

74 files changed

+2256
-962
lines changed

include/paimon/global_index/global_index_scan.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,15 @@ class PAIMON_EXPORT GlobalIndexScan {
6363
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
6464
const Range& range) = 0;
6565

66-
/// Returns the set of row ID ranges covered by this global index.
66+
/// Returns row ID ranges covered by this global index (sorted and non-overlapping
67+
/// ranges).
6768
///
6869
/// Each `Range` represents a contiguous segment of row IDs for which global index
6970
/// data exists. This allows the query engine to parallelize scanning and be aware
7071
/// of ranges that are not covered by any global index.
7172
///
72-
/// @return A `Result` containing a set of non-overlapping `Range` objects.
73-
virtual Result<std::set<Range>> GetRowRangeList() = 0;
73+
/// @return A `Result` containing sorted and non-overlapping `Range` objects.
74+
virtual Result<std::vector<Range>> GetRowRangeList() = 0;
7475
};
7576

7677
} // namespace paimon
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cstddef>
20+
#include <cstdint>
21+
#include <memory>
22+
#include <optional>
23+
#include <string>
24+
#include <vector>
25+
26+
#include "paimon/table/source/data_split.h"
27+
#include "paimon/utils/range.h"
28+
#include "paimon/visibility.h"
29+
30+
namespace paimon {
31+
/// Indexed split for global index reading operation.
32+
class PAIMON_EXPORT IndexedSplit : public Split {
33+
public:
34+
/// @returns The underlying physical data split containing actual data file details.
35+
virtual std::shared_ptr<DataSplit> GetDataSplit() const = 0;
36+
37+
/// @returns A list of row intervals [start, end] indicating which rows
38+
/// are relevant (e.g., passed predicate pushdown).
39+
virtual const std::vector<Range>& RowRanges() const = 0;
40+
41+
/// @returns A score for **each individual row** included in `RowRanges()`,
42+
/// in the order they appear when traversing the ranges.
43+
virtual const std::vector<float>& Scores() const = 0;
44+
};
45+
} // namespace paimon

include/paimon/global_index/row_range_global_index_writer.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
#include <map>
1919
#include <string>
2020

21+
#include "paimon/global_index/indexed_split.h"
2122
#include "paimon/memory/memory_pool.h"
2223
#include "paimon/result.h"
23-
#include "paimon/table/source/data_split.h"
2424
#include "paimon/utils/range.h"
2525
#include "paimon/visibility.h"
2626

@@ -35,8 +35,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
3535
/// @param table_path Path to the table root directory where index files are stored.
3636
/// @param field_name Name of the indexed column (must be present in the table schema).
3737
/// @param index_type Type of global index to build (e.g., "bitmap", "lumina").
38-
/// @param split The data split (e.g., Parquet file) containing the actual data.
39-
/// @param range Row ID range [from, to] for data to build index.
38+
/// @param index_split The indexed split containing the actual data (e.g., Parquet file) and
39+
// row id range [from, to] for data to build index.
4040
/// The range must be fully contained within the data covered
4141
/// by the given `split`.
4242
/// @param options Index-specific configuration (e.g., false positive rate for bloom
@@ -47,7 +47,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
4747
/// or an error if indexing fails (e.g., unsupported type, I/O error).
4848
static Result<std::shared_ptr<CommitMessage>> WriteIndex(
4949
const std::string& table_path, const std::string& field_name, const std::string& index_type,
50-
const std::shared_ptr<DataSplit>& split, const Range& range,
50+
const std::shared_ptr<IndexedSplit>& indexed_split,
5151
const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
5252
};
5353

include/paimon/read_context.h

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#include "paimon/predicate/predicate.h"
2727
#include "paimon/result.h"
2828
#include "paimon/type_fwd.h"
29-
#include "paimon/utils/range.h"
3029
#include "paimon/visibility.h"
3130

3231
namespace paimon {
@@ -43,8 +42,8 @@ class PAIMON_EXPORT ReadContext {
4342
public:
4443
ReadContext(const std::string& path, const std::string& branch,
4544
const std::vector<std::string>& read_schema,
46-
const std::shared_ptr<Predicate>& predicate, const std::vector<Range>& row_ranges,
47-
bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count,
45+
const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
46+
bool enable_prefetch, uint32_t prefetch_batch_count,
4847
uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch,
4948
uint32_t row_to_batch_thread_number, const std::optional<std::string>& table_schema,
5049
const std::shared_ptr<MemoryPool>& memory_pool,
@@ -77,10 +76,6 @@ class PAIMON_EXPORT ReadContext {
7776
return predicate_;
7877
}
7978

80-
const std::vector<Range>& GetRowRanges() const {
81-
return row_ranges_;
82-
}
83-
8479
bool EnablePredicateFilter() const {
8580
return enable_predicate_filter_;
8681
}
@@ -114,7 +109,6 @@ class PAIMON_EXPORT ReadContext {
114109
std::string branch_;
115110
std::vector<std::string> read_schema_;
116111
std::shared_ptr<Predicate> predicate_;
117-
std::vector<Range> row_ranges_;
118112
bool enable_predicate_filter_;
119113
bool enable_prefetch_;
120114
uint32_t prefetch_batch_count_;
@@ -273,18 +267,6 @@ class PAIMON_EXPORT ReadContextBuilder {
273267
ReadContextBuilder& WithFileSystemSchemeToIdentifierMap(
274268
const std::map<std::string, std::string>& fs_scheme_to_identifier_map);
275269

276-
/// Set specific row ranges to read for targeted data access.
277-
///
278-
/// This is primarily used in data evolution scenarios where only specific rows
279-
/// need to be read. File ranges that do not intersect with the specified row ranges
280-
/// will be filtered out, improving performance by avoiding unnecessary I/O.
281-
///
282-
/// @param row_ranges Vector of specific row ranges to read.
283-
/// @return Reference to this builder for method chaining.
284-
/// @note If not set, all rows in the selected files will be returned.
285-
/// @note This is commonly used in data evolution mode for selective reading.
286-
ReadContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);
287-
288270
/// Build and return a `ReadContext` instance with input validation.
289271
/// @return Result containing the constructed `ReadContext` or an error status.
290272
Result<std::unique_ptr<ReadContext>> Finish();

include/paimon/table/source/data_split.h

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -26,42 +26,15 @@
2626
#include "paimon/data/timestamp.h"
2727
#include "paimon/memory/memory_pool.h"
2828
#include "paimon/result.h"
29+
#include "paimon/table/source/split.h"
2930
#include "paimon/visibility.h"
3031

3132
namespace paimon {
3233
class MemoryPool;
3334

34-
/// Input splits for read operation. Needed by most batch computation engines. Support Serialize and
35-
/// Deserialize, compatible with java version.
36-
class PAIMON_EXPORT DataSplit {
35+
/// Input data split for reading operation. Needed by most batch computation engines.
36+
class PAIMON_EXPORT DataSplit : public Split {
3737
public:
38-
virtual ~DataSplit() = default;
39-
40-
/// Deserialize a `DataSplit` from a binary buffer.
41-
///
42-
/// Creates a `DataSplit` instance from its serialized binary representation.
43-
/// This is typically used in distributed computing scenarios where splits
44-
/// are transmitted between different nodes or processes.
45-
///
46-
/// @param buffer Const pointer to the binary data containing the serialized `DataSplit`.
47-
/// @param length Size of the buffer in bytes.
48-
/// @param pool Memory pool for allocating objects during deserialization.
49-
/// @return Result containing the deserialized `DataSplit` or an error status.
50-
static Result<std::shared_ptr<DataSplit>> Deserialize(const char* buffer, size_t length,
51-
const std::shared_ptr<MemoryPool>& pool);
52-
53-
/// Serialize a `DataSplit` to a binary string.
54-
///
55-
/// Converts a `DataSplit` instance to its binary representation for storage
56-
/// or transmission. The serialized data can later be deserialized using
57-
/// the Deserialize method.
58-
///
59-
/// @param data_split The `DataSplit` instance to serialize.
60-
/// @param pool Memory pool for allocating temporary objects during serialization.
61-
/// @return Result containing the serialized binary data as a string or an error status.
62-
static Result<std::string> Serialize(const std::shared_ptr<DataSplit>& data_split,
63-
const std::shared_ptr<MemoryPool>& pool);
64-
6538
/// Metadata structure for simple data files.
6639
///
6740
/// Contains essential information about a data file including its location,
@@ -97,6 +70,7 @@ class PAIMON_EXPORT DataSplit {
9770
std::optional<int64_t> delete_row_count;
9871

9972
bool operator==(const SimpleDataFileMeta& other) const;
73+
10074
std::string ToString() const;
10175
};
10276

include/paimon/table/source/plan.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@
2020
#include <optional>
2121
#include <vector>
2222

23-
#include "paimon/table/source/data_split.h"
23+
#include "paimon/table/source/split.h"
2424

2525
namespace paimon {
2626
/// %Result plan of this `TableScan`.
2727
class PAIMON_EXPORT Plan {
2828
public:
2929
virtual ~Plan() = default;
3030
/// %Result splits.
31-
virtual const std::vector<std::shared_ptr<DataSplit>>& Splits() const = 0;
31+
virtual const std::vector<std::shared_ptr<Split>>& Splits() const = 0;
3232
/// Snapshot id of this plan, return `std::nullopt` if the table is empty.
3333
virtual std::optional<int64_t> SnapshotId() const = 0;
3434
};
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
/*
2+
* Copyright 2025-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cstddef>
20+
#include <cstdint>
21+
#include <memory>
22+
#include <optional>
23+
#include <string>
24+
#include <vector>
25+
26+
#include "paimon/memory/memory_pool.h"
27+
#include "paimon/result.h"
28+
#include "paimon/visibility.h"
29+
30+
namespace paimon {
31+
class MemoryPool;
32+
33+
/// An input split for reading operation. Needed by most batch computation engines. Support
34+
/// Serialize and Deserialize, compatible with java version.
35+
/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit`
36+
/// (for reads leveraging global indexes).
37+
class PAIMON_EXPORT Split {
38+
public:
39+
virtual ~Split() = default;
40+
41+
/// Deserialize a `Split` from a binary buffer.
42+
///
43+
/// Creates a `Split` instance from its serialized binary representation.
44+
/// This is typically used in distributed computing scenarios where splits
45+
/// are transmitted between different nodes or processes.
46+
///
47+
/// @param buffer Const pointer to the binary data containing the serialized `Split`.
48+
/// @param length Size of the buffer in bytes.
49+
/// @param pool Memory pool for allocating objects during deserialization.
50+
/// @return Result containing the deserialized `Split` or an error status.
51+
static Result<std::shared_ptr<Split>> Deserialize(const char* buffer, size_t length,
52+
const std::shared_ptr<MemoryPool>& pool);
53+
54+
/// Serialize a `Split` to a binary string.
55+
///
56+
/// Converts a `Split` instance to its binary representation for storage
57+
/// or transmission. The serialized data can later be deserialized using
58+
/// the Deserialize method.
59+
///
60+
/// @param split The `Split` instance to serialize.
61+
/// @param pool Memory pool for allocating temporary objects during serialization.
62+
/// @return Result containing the serialized binary data as a string or an error status.
63+
static Result<std::string> Serialize(const std::shared_ptr<Split>& split,
64+
const std::shared_ptr<MemoryPool>& pool);
65+
};
66+
} // namespace paimon

include/paimon/table/source/table_read.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,14 @@
2424
#include "paimon/read_context.h"
2525
#include "paimon/reader/batch_reader.h"
2626
#include "paimon/result.h"
27-
#include "paimon/table/source/data_split.h"
27+
#include "paimon/table/source/split.h"
2828
#include "paimon/visibility.h"
2929

3030
namespace paimon {
31-
class DataSplit;
3231
class MemoryPool;
3332
class ReadContext;
3433

35-
/// Given a `DataSplit` or a list of `DataSplit`, generate a reader for batch reading.
34+
/// Given a `Split` or a list of `Split`, generate a reader for batch reading.
3635
class PAIMON_EXPORT TableRead {
3736
public:
3837
virtual ~TableRead() = default;
@@ -46,21 +45,21 @@ class PAIMON_EXPORT TableRead {
4645
/// Creates a `BatchReader` instance for reading data.
4746
///
4847
/// This method creates a BatchReader that will be responsible for reading data from the
49-
/// provided data splits.
48+
/// provided splits.
5049
///
51-
/// @param data_splits A vector of shared pointers to `DataSplit` instances representing the
50+
/// @param splits A vector of shared pointers to `Split` instances representing the
5251
/// data to be read.
5352
/// @return A Result containing a unique pointer to the `BatchReader` instance.
5453
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
55-
const std::vector<std::shared_ptr<DataSplit>>& data_splits);
54+
const std::vector<std::shared_ptr<Split>>& splits);
5655

57-
/// Creates a `BatchReader` instance for a single data split.
56+
/// Creates a `BatchReader` instance for a single split.
5857
///
59-
/// @param data_split A shared pointer to the `DataSplit` instance that defines the data to be
58+
/// @param split A shared pointer to the `Split` instance that defines the data to be
6059
/// read.
6160
/// @return A Result containing a unique pointer to the `BatchReader` instance.
6261
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
63-
const std::shared_ptr<DataSplit>& data_split) = 0;
62+
const std::shared_ptr<Split>& split) = 0;
6463

6564
protected:
6665
explicit TableRead(const std::shared_ptr<MemoryPool>& memory_pool);

include/paimon/utils/range.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#pragma once
1818
#include <optional>
1919
#include <string>
20+
#include <vector>
2021

2122
#include "paimon/visibility.h"
2223

@@ -28,9 +29,23 @@ struct PAIMON_EXPORT Range {
2829
/// Returns the number of integers in the range [from, to].
2930
int64_t Count() const;
3031

32+
/// Computes the intersection of two ranges.
3133
static std::optional<Range> Intersection(const Range& left, const Range& right);
34+
35+
/// Checks whether two ranges have any overlap.
3236
static bool HasIntersection(const Range& left, const Range& right);
3337

38+
/// Sorts a list of ranges by `from`, then merges overlapping or adjacent ranges.
39+
/// @param ranges Input vector of ranges to merge.
40+
/// @param adjacent If true, also merges ranges that are adjacent (e.g., [1,3] and [4,5] →
41+
/// [1,5]).
42+
/// If false, only merges strictly overlapping ranges.
43+
/// @return A new vector of non-overlapping, sorted ranges.
44+
static std::vector<Range> SortAndMergeOverlap(const std::vector<Range>& ranges, bool adjacent);
45+
46+
/// Computes the set intersection of two collections of disjoint, sorted ranges.
47+
static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);
48+
3449
bool operator==(const Range& other) const;
3550
bool operator<(const Range& other) const;
3651

0 commit comments

Comments
 (0)