Skip to content

Commit 2c4994c

Browse files
feat: Introduce sst file format for btree global index (alibaba#49)
1 parent f3de4e7 commit 2c4994c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+3060
-0
lines changed

src/paimon/CMakeLists.txt

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,17 @@ set(PAIMON_COMMON_SRCS
5757
common/io/data_output_stream.cpp
5858
common/io/memory_segment_output_stream.cpp
5959
common/io/offset_input_stream.cpp
60+
common/io/cache/cache.cpp
61+
common/io/cache/cache_key.cpp
62+
common/io/cache/cache_manager.cpp
6063
common/logging/logging.cpp
6164
common/memory/bytes.cpp
6265
common/memory/memory_pool.cpp
6366
common/memory/memory_segment.cpp
6467
common/memory/memory_segment_utils.cpp
68+
common/memory/memory_slice.cpp
69+
common/memory/memory_slice_input.cpp
70+
common/memory/memory_slice_output.cpp
6571
common/metrics/metrics_impl.cpp
6672
common/options/memory_size.cpp
6773
common/options/time_duration.cpp
@@ -90,13 +96,23 @@ set(PAIMON_COMMON_SRCS
9096
common/reader/reader_utils.cpp
9197
common/reader/complete_row_kind_batch_reader.cpp
9298
common/reader/data_evolution_file_reader.cpp
99+
common/sst/block_handle.cpp
100+
common/sst/block_footer.cpp
101+
common/sst/block_iterator.cpp
102+
common/sst/block_trailer.cpp
103+
common/sst/block_reader.cpp
104+
common/sst/block_writer.cpp
105+
common/sst/sst_file_reader.cpp
106+
common/sst/sst_file_writer.cpp
93107
common/types/data_field.cpp
94108
common/types/data_type.cpp
95109
common/types/data_type_json_parser.cpp
96110
common/types/row_kind.cpp
97111
common/types/row_type.cpp
98112
common/utils/arrow/mem_utils.cpp
99113
common/utils/binary_row_partition_computer.cpp
114+
common/utils/bit_set.cpp
115+
common/utils/bloom_filter.cpp
100116
common/utils/bloom_filter64.cpp
101117
common/utils/bucket_id_calculator.cpp
102118
common/utils/decimal_utils.cpp
@@ -367,6 +383,8 @@ if(PAIMON_BUILD_TESTS)
367383
common/utils/concurrent_hash_map_test.cpp
368384
common/utils/projected_row_test.cpp
369385
common/utils/projected_array_test.cpp
386+
common/utils/bit_set_test.cpp
387+
common/utils/bloom_filter_test.cpp
370388
common/utils/bloom_filter64_test.cpp
371389
common/utils/xxhash_test.cpp
372390
common/utils/bucket_id_calculator_test.cpp
@@ -414,6 +432,17 @@ if(PAIMON_BUILD_TESTS)
414432
test_utils_static
415433
${GTEST_LINK_TOOLCHAIN})
416434

435+
add_paimon_test(common_sst_file_format_test
436+
SOURCES
437+
common/sst/sst_file_io_test.cpp
438+
STATIC_LINK_LIBS
439+
paimon_shared
440+
test_utils_static
441+
"-Wl,--whole-archive"
442+
paimon_local_file_system_static
443+
"-Wl,--no-whole-archive"
444+
${GTEST_LINK_TOOLCHAIN})
445+
417446
add_paimon_test(core_test
418447
SOURCES
419448
core/append/append_only_writer_test.cpp
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "paimon/common/io/cache/cache.h"
18+
19+
namespace paimon {
20+
std::shared_ptr<CacheValue> NoCache::Get(
21+
const std::shared_ptr<CacheKey>& key,
22+
std::function<std::shared_ptr<CacheValue>(const std::shared_ptr<CacheKey>&)> supplier) {
23+
return supplier(key);
24+
}
25+
26+
// Deliberately empty: this implementation discards the entry instead of storing it.
void NoCache::Put(const std::shared_ptr<CacheKey>& /*key*/,
                  const std::shared_ptr<CacheValue>& /*value*/) {}
29+
30+
// Deliberately empty: there is no stored entry that could be removed for the given key.
void NoCache::Invalidate(const std::shared_ptr<CacheKey>& /*key*/) {}
33+
34+
// Deliberately empty: there are no stored entries to drop.
void NoCache::InvalidateAll() {}
37+
38+
std::unordered_map<std::shared_ptr<CacheKey>, std::shared_ptr<CacheValue>> NoCache::AsMap() {
39+
return {};
40+
}
41+
42+
} // namespace paimon

src/paimon/common/io/cache/cache.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>

#include "paimon/common/io/cache/cache_key.h"
#include "paimon/common/memory/memory_segment.h"
#include "paimon/status.h"
26+
27+
namespace paimon {
28+
class CacheValue;
29+
30+
class Cache {
31+
public:
32+
virtual ~Cache() = default;
33+
virtual std::shared_ptr<CacheValue> Get(
34+
const std::shared_ptr<CacheKey>& key,
35+
std::function<std::shared_ptr<CacheValue>(const std::shared_ptr<CacheKey>&)> supplier) = 0;
36+
37+
virtual void Put(const std::shared_ptr<CacheKey>& key,
38+
const std::shared_ptr<CacheValue>& value) = 0;
39+
40+
virtual void Invalidate(const std::shared_ptr<CacheKey>& key) = 0;
41+
42+
virtual void InvalidateAll() = 0;
43+
44+
virtual std::unordered_map<std::shared_ptr<CacheKey>, std::shared_ptr<CacheValue>> AsMap() = 0;
45+
};
46+
47+
class NoCache : public Cache {
48+
public:
49+
std::shared_ptr<CacheValue> Get(
50+
const std::shared_ptr<CacheKey>& key,
51+
std::function<std::shared_ptr<CacheValue>(const std::shared_ptr<CacheKey>&)> supplier)
52+
override;
53+
void Put(const std::shared_ptr<CacheKey>& key,
54+
const std::shared_ptr<CacheValue>& value) override;
55+
void Invalidate(const std::shared_ptr<CacheKey>& key) override;
56+
void InvalidateAll() override;
57+
std::unordered_map<std::shared_ptr<CacheKey>, std::shared_ptr<CacheValue>> AsMap() override;
58+
};
59+
60+
class CacheValue {
61+
public:
62+
explicit CacheValue(const std::shared_ptr<MemorySegment>& segment) : segment_(segment) {}
63+
64+
std::shared_ptr<MemorySegment> GetSegment() {
65+
return segment_;
66+
}
67+
68+
private:
69+
std::shared_ptr<MemorySegment> segment_;
70+
};
71+
} // namespace paimon
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "paimon/common/io/cache/cache_key.h"
18+
19+
namespace paimon {
20+
21+
std::shared_ptr<CacheKey> CacheKey::ForPosition(const std::string& file_path, int64_t position,
22+
int32_t length, bool is_index) {
23+
return std::make_shared<PositionCacheKey>(file_path, position, length, is_index);
24+
}
25+
26+
bool PositionCacheKey::IsIndex() {
27+
return is_index_;
28+
}
29+
30+
int64_t PositionCacheKey::Position() const {
31+
return position_;
32+
}
33+
34+
int32_t PositionCacheKey::Length() const {
35+
return length_;
36+
}
37+
38+
// Two keys are equal only when file path, position, length and the index flag all match.
bool PositionCacheKey::operator==(const PositionCacheKey& other) const {
    if (file_path_ != other.file_path_) {
        return false;
    }
    if (position_ != other.position_) {
        return false;
    }
    if (length_ != other.length_) {
        return false;
    }
    return is_index_ == other.is_index_;
}
43+
44+
// Combines the std::hash of every field (path, position, length, index flag - in that
// order) into a single value, so it covers the same fields operator== compares.
size_t PositionCacheKey::HashCode() const {
    size_t seed = 0;
    // One mixing step: fold a field hash into the running seed.
    auto mix = [&seed](size_t field_hash) {
        seed ^= field_hash + HASH_CONSTANT + (seed << 6) + (seed >> 2);
    };
    mix(std::hash<std::string>{}(file_path_));
    mix(std::hash<int64_t>{}(position_));
    mix(std::hash<int32_t>{}(length_));
    mix(std::hash<bool>{}(is_index_));
    return seed;
}
52+
53+
} // namespace paimon
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
#include <cstdint>
19+
#include <functional>
20+
#include <memory>
21+
#include <string>
22+
23+
#include "paimon/status.h"
24+
25+
namespace paimon {
26+
27+
/// Polymorphic base for keys used with the cache classes.
class CacheKey {
 public:
    virtual ~CacheKey() = default;

    /// Builds a key from a file path, a position, a length and an index flag
    /// (defined in cache_key.cpp).
    static std::shared_ptr<CacheKey> ForPosition(const std::string& file_path, int64_t position,
                                                 int32_t length, bool is_index);

    /// Reports whether this key is flagged as an index entry.
    virtual bool IsIndex() = 0;
};
37+
38+
class PositionCacheKey : public CacheKey {
39+
public:
40+
PositionCacheKey(const std::string& file_path, int64_t position, int32_t length, bool is_index)
41+
: file_path_(file_path), position_(position), length_(length), is_index_(is_index) {}
42+
43+
bool IsIndex() override;
44+
45+
int64_t Position() const;
46+
int32_t Length() const;
47+
48+
bool operator==(const PositionCacheKey& other) const;
49+
size_t HashCode() const;
50+
51+
private:
52+
static constexpr uint64_t HASH_CONSTANT = 0x9e3779b97f4a7c15ULL;
53+
54+
const std::string file_path_;
55+
const int64_t position_;
56+
const int32_t length_;
57+
const bool is_index_;
58+
};
59+
} // namespace paimon
60+
61+
namespace std {
62+
template <>
63+
struct hash<paimon::PositionCacheKey> {
64+
size_t operator()(const paimon::PositionCacheKey& key) const {
65+
return key.HashCode();
66+
}
67+
};
68+
} // namespace std
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright 2026-present Alibaba Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "paimon/common/io/cache/cache_manager.h"
18+
19+
namespace paimon {
20+
21+
std::shared_ptr<MemorySegment> CacheManager::GetPage(
22+
std::shared_ptr<CacheKey>& key,
23+
std::function<Result<MemorySegment>(const std::shared_ptr<CacheKey>&)> reader) {
24+
auto& cache = key->IsIndex() ? index_cache_ : data_cache_;
25+
auto supplier = [=](const std::shared_ptr<CacheKey>& k) -> std::shared_ptr<CacheValue> {
26+
auto ret = reader(k);
27+
if (!ret.ok()) {
28+
return nullptr;
29+
}
30+
auto segment = ret.value();
31+
auto ptr = std::make_shared<MemorySegment>(segment);
32+
return std::make_shared<CacheValue>(ptr);
33+
};
34+
return cache->Get(key, supplier)->GetSegment();
35+
}
36+
37+
void CacheManager::InvalidPage(std::shared_ptr<CacheKey>& key) {
38+
if (key->IsIndex()) {
39+
index_cache_->Invalidate(key);
40+
} else {
41+
data_cache_->Invalidate(key);
42+
}
43+
}
44+
45+
} // namespace paimon

0 commit comments

Comments
 (0)