From 1f2871c48e6dc554bedc9eb2b56f649c7779ec70 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 2 Jul 2026 11:46:36 +0800 Subject: [PATCH 1/2] feat(commit): align functionality with java paimon commit --- src/paimon/CMakeLists.txt | 5 + src/paimon/common/data/binary_row.h | 13 + .../operation/commit/conflict_detection.cpp | 531 +++++++++++++++ .../operation/commit/conflict_detection.h | 119 ++++ .../commit/conflict_detection_test.cpp | 499 ++++++++++++++ .../commit/manifest_entry_changes.cpp | 137 ++++ .../operation/commit/manifest_entry_changes.h | 69 ++ .../commit/manifest_entry_changes_test.cpp | 168 +++++ .../commit/row_id_column_conflict_checker.cpp | 213 ++++++ .../commit/row_id_column_conflict_checker.h | 70 ++ .../core/operation/file_store_commit.cpp | 10 - .../core/operation/file_store_commit_impl.cpp | 389 ++++++----- .../core/operation/file_store_commit_impl.h | 39 +- .../operation/file_store_commit_impl_test.cpp | 620 ++++++++++++++++-- .../core/operation/file_store_commit_test.cpp | 70 ++ 15 files changed, 2704 insertions(+), 248 deletions(-) create mode 100644 src/paimon/core/operation/commit/conflict_detection.cpp create mode 100644 src/paimon/core/operation/commit/conflict_detection.h create mode 100644 src/paimon/core/operation/commit/conflict_detection_test.cpp create mode 100644 src/paimon/core/operation/commit/manifest_entry_changes.cpp create mode 100644 src/paimon/core/operation/commit/manifest_entry_changes.h create mode 100644 src/paimon/core/operation/commit/manifest_entry_changes_test.cpp create mode 100644 src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp create mode 100644 src/paimon/core/operation/commit/row_id_column_conflict_checker.h diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index c883ec012..b0319827e 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -288,6 +288,9 @@ set(PAIMON_CORE_SRCS core/operation/abstract_split_read.cpp core/operation/append_only_file_store_scan.cpp core/operation/append_only_file_store_write.cpp + core/operation/commit/conflict_detection.cpp + core/operation/commit/row_id_column_conflict_checker.cpp + core/operation/commit/manifest_entry_changes.cpp core/operation/commit_context.cpp core/operation/expire_snapshots.cpp core/operation/file_store_commit.cpp @@ -697,6 +700,8 @@ if(PAIMON_BUILD_TESTS) core/operation/metrics/compaction_metrics_test.cpp core/operation/data_evolution_file_store_scan_test.cpp core/operation/data_evolution_split_read_test.cpp + core/operation/commit/conflict_detection_test.cpp + core/operation/commit/manifest_entry_changes_test.cpp core/operation/key_value_file_store_write_test.cpp core/operation/internal_read_context_test.cpp core/operation/abstract_split_read_test.cpp diff --git a/src/paimon/common/data/binary_row.h b/src/paimon/common/data/binary_row.h index 71359d918..2ab0d9fa9 100644 --- a/src/paimon/common/data/binary_row.h +++ b/src/paimon/common/data/binary_row.h @@ -156,6 +156,19 @@ struct hash> { } }; +/// for std::unordered_map, ...> +template <> +struct hash> { + size_t operator()( + const std::tuple& partition_bucket_level) const { + const auto& [partition, bucket, level] = partition_bucket_level; + size_t hash = paimon::MurmurHashUtils::HashUnsafeBytes( + reinterpret_cast(&bucket), 0, sizeof(bucket), partition.HashCode()); + return paimon::MurmurHashUtils::HashUnsafeBytes(reinterpret_cast(&level), 0, + sizeof(level), hash); + } +}; + template <> struct hash { size_t operator()(const paimon::BinaryRow& row) const { diff --git a/src/paimon/core/operation/commit/conflict_detection.cpp b/src/paimon/core/operation/commit/conflict_detection.cpp new file mode 100644 index 000000000..f3e507f39 --- /dev/null +++ b/src/paimon/core/operation/commit/conflict_detection.cpp @@ -0,0 +1,531 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/operation/commit/conflict_detection.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fmt/format.h" +#include "paimon/common/data/blob_utils.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/types/data_field.h" +#include "paimon/common/utils/fields_comparator.h" +#include "paimon/core/manifest/file_entry.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/manifest_file.h" +#include "paimon/core/manifest/manifest_file_meta.h" +#include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/manifest/index_manifest_file.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/operation/commit/manifest_entry_changes.h" +#include "paimon/core/operation/commit/row_id_column_conflict_checker.h" +#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/utils/range.h" + +namespace paimon { + +namespace { + +bool IsVectorStoreFile(const std::string& file_name) { + return file_name.find(".vector.") != std::string::npos; +} + +bool IsDedicatedStorageFile(const std::string& file_name) { + return BlobUtils::IsBlobFile(file_name) || IsVectorStoreFile(file_name); +} + +} // namespace + +ConflictDetection::ConflictDetection(std::shared_ptr table_schema, + const CoreOptions& options, + std::shared_ptr snapshot_manager, + std::shared_ptr manifest_list, + std::shared_ptr manifest_file) + : table_schema_(std::move(table_schema)), + options_(options), + snapshot_manager_(std::move(snapshot_manager)), + manifest_list_(std::move(manifest_list)), + manifest_file_(std::move(manifest_file)) {} + +void ConflictDetection::SetRowIdCheckFromSnapshot( + const std::optional& row_id_check_from_snapshot) { + row_id_check_from_snapshot_ = row_id_check_from_snapshot; +} + +bool ConflictDetection::HasRowIdCheckFromSnapshot() const { + return row_id_check_from_snapshot_.has_value(); +} + +Status ConflictDetection::CheckConflicts(const Snapshot& latest_snapshot, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker, + const Snapshot::CommitKind& commit_kind) const { + std::vector all_entries = base_entries; + all_entries.insert(all_entries.end(), delta_entries.begin(), delta_entries.end()); + PAIMON_RETURN_NOT_OK(CheckBucketKeepSame(all_entries, commit_kind)); + + // check the delta, it is important not to delete and add the same file. Since scan + // relies on map for deduplication, this may result in the loss of this file + std::vector merged_delta_entries; + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(delta_entries, &merged_delta_entries)); + + std::vector merged_entries; + // merge manifest entries and also check if the files we want to delete are still there + PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(all_entries, &merged_entries)); + PAIMON_RETURN_NOT_OK(CheckFileDeletionConflicts(merged_entries)); + PAIMON_RETURN_NOT_OK(CheckKeyRange(merged_entries)); + if (!(commit_kind == Snapshot::CommitKind::Compact())) { + PAIMON_RETURN_NOT_OK( + CheckRowIdExistence(base_entries, delta_entries, latest_snapshot.NextRowId())); + } + PAIMON_RETURN_NOT_OK(CheckRowIdRangeConflicts(commit_kind, merged_entries)); + PAIMON_RETURN_NOT_OK(CheckGlobalIndexRowIdExistence(base_entries, delta_index_entries)); + PAIMON_RETURN_NOT_OK(CheckForRowIdFromSnapshot(latest_snapshot, delta_entries, + delta_index_entries, + row_id_column_conflict_checker)); + return Status::OK(); +} + +bool ConflictDetection::ShouldBeOverwriteCommit( + const std::vector& append_table_files, + const std::vector& append_index_files) const { + for (const ManifestEntry& entry : append_table_files) { + if (entry.Kind() == FileKind::Delete()) { + return true; + } + } + + for (const IndexManifestEntry& entry : append_index_files) { + if (entry.index_file->IndexType() == DeletionVectorsIndexFile::DELETION_VECTORS_INDEX) { + return true; + } + } + + return false; +} + +Status ConflictDetection::CheckBucketKeepSame(const std::vector& all_entries, + const Snapshot::CommitKind& commit_kind) const { + if (commit_kind == Snapshot::CommitKind::Overwrite()) { + return Status::OK(); + } + + // total buckets within the same partition should remain the same + std::unordered_map total_buckets; + for (const ManifestEntry& entry : all_entries) { + if (entry.TotalBuckets() <= 0) { + continue; + } + if (same_bucket_checked_partitions_.find(entry.Partition()) != + same_bucket_checked_partitions_.end()) { + continue; + } + + auto [iter, inserted] = total_buckets.emplace(entry.Partition(), entry.TotalBuckets()); + if (inserted || iter->second == entry.TotalBuckets()) { + continue; + } + + return BucketNumMismatch(entry.Partition(), entry.TotalBuckets(), iter->second); + } + + MarkBucketCheckedPartitions(total_buckets); + return Status::OK(); +} + +Status ConflictDetection::CollectUncheckedBucketPartitions( + const std::vector& delta_entries, + std::unordered_map* total_buckets) const { + total_buckets->clear(); + for (const ManifestEntry& entry : delta_entries) { + if (!(entry.Kind() == FileKind::Add()) || entry.TotalBuckets() <= 0 || + same_bucket_checked_partitions_.find(entry.Partition()) != + same_bucket_checked_partitions_.end()) { + continue; + } + + auto [iter, inserted] = total_buckets->emplace(entry.Partition(), entry.TotalBuckets()); + if (!inserted && iter->second != entry.TotalBuckets()) { + return BucketNumMismatch(entry.Partition(), entry.TotalBuckets(), iter->second); + } + } + + return Status::OK(); +} + +Status ConflictDetection::CheckSameBucketByTotalBuckets( + const std::unordered_map& expected_total_buckets, + const std::unordered_map& previous_total_buckets) const { + for (const auto& [partition, total_buckets] : expected_total_buckets) { + auto iter = previous_total_buckets.find(partition); + if (iter != previous_total_buckets.end() && iter->second != total_buckets) { + return BucketNumMismatch(partition, total_buckets, iter->second); + } + } + + MarkBucketCheckedPartitions(expected_total_buckets); + return Status::OK(); +} + +Status ConflictDetection::BucketNumMismatch(const BinaryRow& partition, int32_t num_buckets, + int32_t previous_num_buckets) const { + return Status::Invalid(fmt::format( + "Total buckets of partition {} changed from {} to {} without overwrite. Give up " + "committing.", + partition.ToString(), previous_num_buckets, num_buckets)); +} + +void ConflictDetection::MarkBucketCheckedPartitions( + const std::unordered_map& total_buckets) const { + if (total_buckets.empty()) { + return; + } + + for (const auto& [partition, _] : total_buckets) { + same_bucket_checked_partitions_.insert_or_assign(partition, true); + while (same_bucket_checked_partitions_.size() > kSameBucketCheckCacheMaxSize) { + same_bucket_checked_partitions_.erase(same_bucket_checked_partitions_.begin()->first); + } + } +} + +Status ConflictDetection::CheckFileDeletionConflicts( + const std::vector& merged_entries) const { + for (const auto& entry : merged_entries) { + if (entry.Kind() == FileKind::Delete()) { + return Status::Invalid(fmt::format( + "Trying to delete file {} which is not previously added.", entry.FileName())); + } + } + + return Status::OK(); +} + +Status ConflictDetection::CheckKeyRange(const std::vector& merged_entries) const { + if (table_schema_->PrimaryKeys().empty()) { + return Status::OK(); + } + + PAIMON_ASSIGN_OR_RAISE(std::vector trimmed_primary_key_fields, + table_schema_->TrimmedPrimaryKeyFields()); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr key_comparator, + FieldsComparator::Create(trimmed_primary_key_fields, + options_.SequenceFieldSortOrderIsAscending())); + + // group entries by partitions, buckets and levels + std::unordered_map, std::vector> levels; + for (const auto& entry : merged_entries) { + if (!(entry.Kind() == FileKind::Add())) { + continue; + } + int32_t level = entry.Level(); + if (level < 1) { + continue; + } + + levels[std::make_tuple(entry.Partition(), entry.Bucket(), level)].push_back(entry); + } + + // check for all LSM level >= 1, key ranges of files do not intersect + for (auto& [_, entries] : levels) { + std::sort(entries.begin(), entries.end(), + [&key_comparator](const ManifestEntry& a, const ManifestEntry& b) { + return key_comparator->CompareTo(a.MinKey(), b.MinKey()) < 0; + }); + for (size_t i = 0; i + 1 < entries.size(); ++i) { + const ManifestEntry& a = entries[i]; + const ManifestEntry& b = entries[i + 1]; + if (key_comparator->CompareTo(a.MaxKey(), b.MinKey()) >= 0) { + return Status::Invalid(fmt::format( + "LSM conflicts detected! Give up committing. Conflict files are {} and {}.", + a.FileName(), b.FileName())); + } + } + } + return Status::OK(); +} + +Status ConflictDetection::CheckRowIdExistence( + const std::vector& base_entries, + const std::vector& delta_entries, + const std::optional& next_row_id) const { + if (!options_.DataEvolutionEnabled() || !next_row_id) { + return Status::OK(); + } + + std::vector existing_ranges; + std::set> exact_ranges; + for (const ManifestEntry& base_entry : base_entries) { + if (!(base_entry.Kind() == FileKind::Add()) || !base_entry.File()->first_row_id || + IsDedicatedStorageFile(base_entry.FileName())) { + continue; + } + int64_t range_from = base_entry.File()->first_row_id.value(); + int64_t range_to = range_from + base_entry.File()->row_count - 1; + existing_ranges.emplace_back(range_from, range_to); + exact_ranges.emplace(range_from, range_to); + } + std::vector merged_ranges = Range::SortAndMergeOverlap(existing_ranges, + /*adjacent=*/false); + + for (const ManifestEntry& delta_entry : delta_entries) { + if (!(delta_entry.Kind() == FileKind::Add()) || !delta_entry.File()->first_row_id) { + continue; + } + int64_t first_row_id = delta_entry.File()->first_row_id.value(); + if (first_row_id >= next_row_id.value()) { + continue; + } + int64_t range_from = first_row_id; + int64_t range_to = first_row_id + delta_entry.File()->row_count - 1; + Range row_range(range_from, range_to); + + bool exists = false; + if (IsDedicatedStorageFile(delta_entry.FileName())) { + for (const Range& existing_range : merged_ranges) { + if (existing_range.from <= row_range.from && existing_range.to >= row_range.to) { + exists = true; + break; + } + } + } else { + exists = exact_ranges.find({row_range.from, row_range.to}) != exact_ranges.end(); + } + + if (!exists) { + return Status::Invalid(fmt::format( + "Row ID existence conflict: file '{}' references firstRowId={}, rowCount={} in " + "bucket {}, but no matching file exists in the current snapshot.", + delta_entry.FileName(), first_row_id, delta_entry.File()->row_count, + delta_entry.Bucket())); + } + } + + return Status::OK(); +} + +Status ConflictDetection::CheckRowIdRangeConflicts( + const Snapshot::CommitKind& commit_kind, + const std::vector& merged_entries) const { + if (!options_.DataEvolutionEnabled()) { + return Status::OK(); + } + if (!row_id_check_from_snapshot_ && + !(commit_kind == Snapshot::CommitKind::Compact())) { + return Status::OK(); + } + + std::vector> entries_with_ranges; + entries_with_ranges.reserve(merged_entries.size()); + for (const ManifestEntry& entry : merged_entries) { + if (!entry.File()->first_row_id) { + continue; + } + int64_t range_from = entry.File()->first_row_id.value(); + int64_t range_to = range_from + entry.File()->row_count - 1; + entries_with_ranges.emplace_back(Range(range_from, range_to), entry); + } + if (entries_with_ranges.empty()) { + return Status::OK(); + } + + std::sort(entries_with_ranges.begin(), entries_with_ranges.end(), + [](const auto& lhs, const auto& rhs) { + return lhs.first.from < rhs.first.from; + }); + + size_t group_start = 0; + int64_t group_max_to = entries_with_ranges[0].first.to; + for (size_t i = 1; i <= entries_with_ranges.size(); ++i) { + bool overlap_group_end = (i == entries_with_ranges.size()) || + (entries_with_ranges[i].first.from > group_max_to); + if (!overlap_group_end) { + group_max_to = std::max(group_max_to, entries_with_ranges[i].first.to); + continue; + } + + std::optional> expected_range; + for (size_t j = group_start; j < i; ++j) { + const ManifestEntry& entry = entries_with_ranges[j].second; + if (IsDedicatedStorageFile(entry.FileName())) { + continue; + } + std::pair current = {entries_with_ranges[j].first.from, + entries_with_ranges[j].first.to}; + if (!expected_range) { + expected_range = current; + } else if (expected_range.value() != current) { + return Status::Invalid( + "For Data Evolution table, multiple MERGE INTO/COMPACT operations have " + "encountered row-id range conflicts."); + } + } + + if (i < entries_with_ranges.size()) { + group_start = i; + group_max_to = entries_with_ranges[i].first.to; + } + } + + return Status::OK(); +} + +Status ConflictDetection::CheckForRowIdFromSnapshot( + const Snapshot& latest_snapshot, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker) const { + if (!options_.DataEvolutionEnabled() || !row_id_check_from_snapshot_ || !snapshot_manager_ || + !manifest_list_ || !manifest_file_ || !row_id_column_conflict_checker || + !row_id_column_conflict_checker.value() || + row_id_column_conflict_checker.value()->IsEmpty()) { + return Status::OK(); + } + + if (row_id_check_from_snapshot_.value() > latest_snapshot.Id()) { + return Status::OK(); + } + + PAIMON_ASSIGN_OR_RAISE(Snapshot check_snapshot, + snapshot_manager_->LoadSnapshot(row_id_check_from_snapshot_.value())); + if (!check_snapshot.NextRowId()) { + return Status::OK(); + } + int64_t check_next_row_id = check_snapshot.NextRowId().value(); + + int64_t from_snapshot_id = row_id_check_from_snapshot_.value() + 1; + if (from_snapshot_id < Snapshot::FIRST_SNAPSHOT_ID) { + from_snapshot_id = Snapshot::FIRST_SNAPSHOT_ID; + } + if (from_snapshot_id > latest_snapshot.Id()) { + return Status::OK(); + } + + std::vector changed_partitions = + ManifestEntryChanges::ChangedPartitions(delta_entries, delta_index_entries); + if (changed_partitions.empty()) { + return Status::OK(); + } + std::unordered_set changed_partition_set(changed_partitions.begin(), + changed_partitions.end()); + + for (int64_t snapshot_id = from_snapshot_id; snapshot_id <= latest_snapshot.Id(); + ++snapshot_id) { + PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, snapshot_manager_->LoadSnapshot(snapshot_id)); + if (snapshot.GetCommitKind() == Snapshot::CommitKind::Compact()) { + continue; + } + + std::vector delta_manifests; + PAIMON_RETURN_NOT_OK(manifest_list_->ReadDeltaManifests(snapshot, &delta_manifests)); + for (const ManifestFileMeta& manifest_meta : delta_manifests) { + std::vector history_entries; + PAIMON_RETURN_NOT_OK( + manifest_file_->Read(manifest_meta.FileName(), /*filter=*/nullptr, + &history_entries)); + for (const ManifestEntry& history_entry : history_entries) { + if (!(history_entry.Kind() == FileKind::Add()) || + !history_entry.File()->first_row_id || + changed_partition_set.find(history_entry.Partition()) == + changed_partition_set.end()) { + continue; + } + int64_t history_first_row_id = history_entry.File()->first_row_id.value(); + if (history_first_row_id >= check_next_row_id) { + continue; + } + if (row_id_column_conflict_checker.value()->ConflictsWith(*history_entry.File())) { + return Status::Invalid( + "For Data Evolution table, multiple MERGE INTO operations have " + "encountered conflicts while checking row-id history from " + "snapshot."); + } + } + } + } + + return Status::OK(); +} + +Status ConflictDetection::CheckGlobalIndexRowIdExistence( + const std::vector& base_entries, + const std::vector& delta_index_entries) const { + if (!options_.DataEvolutionEnabled()) { + return Status::OK(); + } + + std::vector indexes_to_check; + for (const IndexManifestEntry& index_entry : delta_index_entries) { + if (!(index_entry.kind == FileKind::Add()) || + !index_entry.index_file->GetGlobalIndexMeta()) { + continue; + } + indexes_to_check.push_back(index_entry); + } + if (indexes_to_check.empty()) { + return Status::OK(); + } + + for (const IndexManifestEntry& index_entry : indexes_to_check) { + std::vector data_ranges; + for (const ManifestEntry& base_entry : base_entries) { + if (!(base_entry.Kind() == FileKind::Add()) || + !(base_entry.Partition() == index_entry.partition) || + base_entry.Bucket() != index_entry.bucket || !base_entry.File()->first_row_id) { + continue; + } + + int64_t first_row_id = base_entry.File()->first_row_id.value(); + data_ranges.emplace_back(first_row_id, first_row_id + base_entry.File()->row_count - 1); + } + + const GlobalIndexMeta& global_index = index_entry.index_file->GetGlobalIndexMeta().value(); + Range index_range(global_index.row_range_start, global_index.row_range_end); + std::vector merged_ranges = Range::SortAndMergeOverlap(data_ranges, + /*adjacent=*/true); + bool covered = false; + for (const Range& data_range : merged_ranges) { + if (data_range.from <= index_range.from && data_range.to >= index_range.to) { + covered = true; + break; + } + } + if (!covered) { + return Status::Invalid(fmt::format( + "Global index row ID existence conflict: index file '{}' references row range {}, " + "but this range is not fully covered by current data files.", + index_entry.index_file->FileName(), index_range.ToString())); + } + } + + return Status::OK(); +} + +} // namespace paimon diff --git a/src/paimon/core/operation/commit/conflict_detection.h b/src/paimon/core/operation/commit/conflict_detection.h new file mode 100644 index 000000000..493fdc038 --- /dev/null +++ b/src/paimon/core/operation/commit/conflict_detection.h @@ -0,0 +1,119 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paimon/common/data/binary_row.h" +#include "paimon/common/utils/linked_hash_map.h" +#include "paimon/core/core_options.h" +#include "paimon/core/snapshot.h" +#include "paimon/status.h" + +namespace paimon { + +class ManifestEntry; +class IndexManifestEntry; +class ManifestFile; +class ManifestList; +class RowIdColumnConflictChecker; +class SnapshotManager; +class TableSchema; + +class ConflictDetection { + public: + ConflictDetection(std::shared_ptr table_schema, const CoreOptions& options, + std::shared_ptr snapshot_manager, + std::shared_ptr manifest_list, + std::shared_ptr manifest_file); + + Status CheckConflicts( + const Snapshot& latest_snapshot, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker, + const Snapshot::CommitKind& commit_kind) const; + + void SetRowIdCheckFromSnapshot(const std::optional& row_id_check_from_snapshot); + + bool HasRowIdCheckFromSnapshot() const; + + bool ShouldBeOverwriteCommit( + const std::vector& append_table_files, + const std::vector& append_index_files) const; + + Status CollectUncheckedBucketPartitions( + const std::vector& delta_entries, + std::unordered_map* total_buckets) const; + + Status CheckSameBucketByTotalBuckets( + const std::unordered_map& expected_total_buckets, + const std::unordered_map& previous_total_buckets) const; + + private: + Status CheckBucketKeepSame(const std::vector& all_entries, + const Snapshot::CommitKind& commit_kind) const; + + Status BucketNumMismatch(const BinaryRow& partition, int32_t num_buckets, + int32_t previous_num_buckets) const; + + void MarkBucketCheckedPartitions( + const std::unordered_map& total_buckets) const; + + Status CheckFileDeletionConflicts(const std::vector& merged_entries) const; + + Status CheckKeyRange(const std::vector& merged_entries) const; + + Status CheckRowIdExistence(const std::vector& base_entries, + const std::vector& delta_entries, + const std::optional& next_row_id) const; + + Status CheckRowIdRangeConflicts( + const Snapshot::CommitKind& commit_kind, + const std::vector& merged_entries) const; + + Status CheckForRowIdFromSnapshot( + const Snapshot& latest_snapshot, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker) const; + + Status CheckGlobalIndexRowIdExistence( + const std::vector& base_entries, + const std::vector& delta_index_entries) const; + + private: + static constexpr size_t kSameBucketCheckCacheMaxSize = 1000; + + std::shared_ptr table_schema_; + CoreOptions options_; + std::optional row_id_check_from_snapshot_; + std::shared_ptr snapshot_manager_; + std::shared_ptr manifest_list_; + std::shared_ptr manifest_file_; + mutable LinkedHashMap same_bucket_checked_partitions_; +}; + +} // namespace paimon diff --git a/src/paimon/core/operation/commit/conflict_detection_test.cpp b/src/paimon/core/operation/commit/conflict_detection_test.cpp new file mode 100644 index 000000000..5a0c27917 --- /dev/null +++ b/src/paimon/core/operation/commit/conflict_detection_test.cpp @@ -0,0 +1,499 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/operation/commit/conflict_detection.h" + +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" +#include "arrow/type.h" +#include "gtest/gtest.h" +#include "paimon/catalog/catalog.h" +#include "paimon/catalog/identifier.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/data/binary_row_writer.h" +#include "paimon/common/utils/path_util.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/index/global_index_meta.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/schema/table_schema.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/data/timestamp.h" +#include "paimon/defs.h" +#include "paimon/memory/bytes.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +namespace { + +Snapshot MakeSnapshot(const Snapshot::CommitKind& commit_kind) { + return Snapshot( + /*id=*/1, + /*schema_id=*/1, + /*base_manifest_list=*/"base-manifest-list", + /*base_manifest_list_size=*/std::nullopt, + /*delta_manifest_list=*/"delta-manifest-list", + /*delta_manifest_list_size=*/std::nullopt, + /*changelog_manifest_list=*/std::nullopt, + /*changelog_manifest_list_size=*/std::nullopt, + /*index_manifest=*/std::nullopt, + /*commit_user=*/"test-user", + /*commit_identifier=*/1, + commit_kind, + /*time_millis=*/0, + /*log_offsets=*/std::nullopt, + /*total_record_count=*/std::nullopt, + /*delta_record_count=*/std::nullopt, + /*changelog_record_count=*/std::nullopt, + /*watermark=*/std::nullopt, + /*statistics=*/std::nullopt, + /*properties=*/std::nullopt, + /*next_row_id=*/std::nullopt); +} + +Status CheckConflicts( + const ConflictDetection& detection, + const std::vector& base_entries, + const std::vector& delta_entries, + const Snapshot::CommitKind& commit_kind) { + return detection.CheckConflicts(MakeSnapshot(commit_kind), base_entries, delta_entries, + /*delta_index_entries=*/{}, + /*row_id_column_conflict_checker=*/std::nullopt, + commit_kind); +} + +Status CheckConflicts( + const ConflictDetection& detection, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const Snapshot::CommitKind& commit_kind) { + return detection.CheckConflicts(MakeSnapshot(commit_kind), base_entries, delta_entries, + delta_index_entries, + /*row_id_column_conflict_checker=*/std::nullopt, + commit_kind); +} + +} // namespace + +class ConflictDetectionTest : public testing::Test { + public: + void SetUp() override { + fields_ = {arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::int32()), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; + } + + protected: + ManifestEntry CreateManifestEntry(const std::string& file_name, const FileKind& kind) const { + int32_t arity = 1; + BinaryRow row(arity); + BinaryRowWriter writer(&row, 20, GetDefaultPool().get()); + writer.WriteInt(0, 10); + writer.Complete(); + return CreateManifestEntry(file_name, row, kind); + } + + ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, + const FileKind& kind) const { + return CreateManifestEntry(file_name, partition, kind, DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/2, /*bucket=*/0); + } + + ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, + const FileKind& kind, const BinaryRow& min_key, + const BinaryRow& max_key, int32_t level, + int32_t bucket = 0, int32_t total_buckets = 2) const { + auto data_file_meta = std::make_shared( + file_name, 1024, 8, min_key, max_key, SimpleStats::EmptyStats(), + SimpleStats::EmptyStats(), /*min_seq_no=*/16, /*max_seq_no=*/32, + /*schema_id=*/1, level, + /*extra_files=*/std::vector>(), + /*creation_time=*/Timestamp(0, 0), + /*delete_row_count=*/3, + /*embedded_index=*/nullptr, /*file_source=*/std::nullopt, + /*external_path=*/std::nullopt, + /*value_stats_cols=*/std::nullopt, /*first_row_id=*/std::nullopt, + /*write_cols=*/std::nullopt); + return ManifestEntry(kind, partition, bucket, total_buckets, data_file_meta); + } + + ManifestEntry CreateManifestEntryWithFirstRowId(const std::string& file_name, + const BinaryRow& partition, + const FileKind& kind, int32_t bucket, + int64_t first_row_id, + int64_t row_count) const { + auto data_file_meta = std::make_shared( + file_name, 1024, row_count, DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, + /*max_seq_no=*/32, + /*schema_id=*/1, /*level=*/2, + /*extra_files=*/std::vector>(), + /*creation_time=*/Timestamp(0, 0), + /*delete_row_count=*/std::nullopt, + /*embedded_index=*/nullptr, /*file_source=*/std::nullopt, + /*external_path=*/std::nullopt, + /*value_stats_cols=*/std::nullopt, first_row_id, + /*write_cols=*/std::nullopt); + return ManifestEntry(kind, partition, bucket, /*total_buckets=*/2, data_file_meta); + } + + IndexManifestEntry CreateGlobalIndexEntry(const std::string& file_name, + const BinaryRow& partition, int32_t bucket, + int64_t row_range_start, + int64_t row_range_end) const { + GlobalIndexMeta global_index_meta(row_range_start, row_range_end, /*index_field_id=*/1, + /*extra_field_ids=*/std::nullopt, + std::make_shared("meta", GetDefaultPool().get())); + auto index_file_meta = std::make_shared( + "HASH", file_name, /*file_size=*/100, /*row_count=*/5, + /*dv_ranges=*/std::nullopt, /*external_path=*/std::nullopt, global_index_meta); + return IndexManifestEntry(FileKind::Add(), partition, bucket, index_file_meta); + } + + BinaryRow CreateIntRow(int32_t value) const { + BinaryRow row(1); + BinaryRowWriter writer(&row, 20, GetDefaultPool().get()); + writer.WriteInt(0, value); + writer.Complete(); + return row; + } + + arrow::FieldVector fields_; +}; + +TEST_F(ConflictDetectionTest, TestFileDeletionConflicts) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("f1", FileKind::Add())); + + std::vector changes; + changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); + + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("f2", FileKind::Add())); + + std::vector changes; + changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); + changes.push_back(CreateManifestEntry("f3", FileKind::Add())); + + ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append()), + "Trying to delete file f1"); + } + { + std::vector base_entries; + std::vector changes; + changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); + + ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append()), + "Trying to delete file f1"); + } +} + +TEST_F(ConflictDetectionTest, TestGlobalIndexRowIdExistenceConflicts) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, + CoreOptions::FromMap({{Options::DATA_EVOLUTION_ENABLED, "true"}})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + const BinaryRow partition = CreateIntRow(10); + std::vector base_entries; + base_entries.push_back(CreateManifestEntryWithFirstRowId( + "base-1", partition, FileKind::Add(), /*bucket=*/0, /*first_row_id=*/0, + /*row_count=*/10)); + base_entries.push_back(CreateManifestEntryWithFirstRowId( + "base-2", partition, FileKind::Add(), /*bucket=*/0, /*first_row_id=*/10, + /*row_count=*/10)); + std::vector changes; + + ASSERT_OK(CheckConflicts(detection, + base_entries, changes, + {CreateGlobalIndexEntry("global-index-covered", partition, /*bucket=*/0, + /*row_range_start=*/0, /*row_range_end=*/19)}, + Snapshot::CommitKind::Append())); + + ASSERT_NOK_WITH_MSG( + CheckConflicts(detection, + base_entries, changes, + {CreateGlobalIndexEntry("global-index-missing", partition, /*bucket=*/0, + /*row_range_start=*/0, /*row_range_end=*/20)}, + Snapshot::CommitKind::Append()), + "Global index row ID existence conflict"); +} + +TEST_F(ConflictDetectionTest, TestBucketKeepSame) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({})); + + const BinaryRow partition = CreateIntRow(10); + { + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/2)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + + ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append()), + "Total buckets of partition"); + } + { + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/2)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", CreateIntRow(20), FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/2)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Overwrite())); + } +} + +TEST_F(ConflictDetectionTest, TestBucketKeepSameHelpers) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + const BinaryRow partition = CreateIntRow(10); + std::vector changes; + changes.push_back(CreateManifestEntry("delta-1", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/4)); + changes.push_back(CreateManifestEntry("delta-2", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/1, /*total_buckets=*/4)); + + std::unordered_map expected_total_buckets; + ASSERT_OK(detection.CollectUncheckedBucketPartitions(changes, &expected_total_buckets)); + ASSERT_EQ(1U, expected_total_buckets.size()); + ASSERT_EQ(4, expected_total_buckets.at(partition)); + + std::unordered_map previous_total_buckets; + previous_total_buckets.emplace(partition, 4); + ASSERT_OK( + detection.CheckSameBucketByTotalBuckets(expected_total_buckets, previous_total_buckets)); + + std::unordered_map cached_total_buckets; + ASSERT_OK(detection.CollectUncheckedBucketPartitions(changes, &cached_total_buckets)); + ASSERT_TRUE(cached_total_buckets.empty()); + + ConflictDetection mismatch_detection(table_schema, core_options, nullptr, nullptr, nullptr); + ASSERT_NOK_WITH_MSG( + mismatch_detection.CheckSameBucketByTotalBuckets(expected_total_buckets, + {{partition, 2}}), + "Total buckets of partition"); +} + +TEST_F(ConflictDetectionTest, TestCollectUncheckedBucketPartitionsMismatch) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + const BinaryRow partition = CreateIntRow(10); + std::vector changes; + changes.push_back(CreateManifestEntry("delta-1", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, /*total_buckets=*/2)); + changes.push_back(CreateManifestEntry("delta-2", partition, FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/1, /*total_buckets=*/4)); + + std::unordered_map total_buckets; + ASSERT_NOK_WITH_MSG(detection.CollectUncheckedBucketPartitions(changes, &total_buckets), + "Total buckets of partition"); +} + +TEST_F(ConflictDetectionTest, TestBucketKeepSameCacheEviction) { + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields_), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, /*options=*/{})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + const int32_t total_buckets = 4; + for (int32_t value = 0; value <= 1000; ++value) { + std::vector changes; + changes.push_back(CreateManifestEntry("delta", CreateIntRow(value), FileKind::Add(), + DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, + /*bucket=*/0, total_buckets)); + + std::unordered_map expected_total_buckets; + ASSERT_OK(detection.CollectUncheckedBucketPartitions(changes, &expected_total_buckets)); + ASSERT_EQ(1U, expected_total_buckets.size()); + ASSERT_OK(detection.CheckSameBucketByTotalBuckets(expected_total_buckets, + expected_total_buckets)); + } + + std::vector evicted_partition_changes; + evicted_partition_changes.push_back(CreateManifestEntry( + "delta", CreateIntRow(0), FileKind::Add(), DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, total_buckets)); + std::unordered_map evicted_partition_buckets; + ASSERT_OK(detection.CollectUncheckedBucketPartitions(evicted_partition_changes, + &evicted_partition_buckets)); + ASSERT_EQ(1U, evicted_partition_buckets.size()); +} + +TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { + auto fields = {arrow::field("f0", arrow::int32(), /*nullable=*/false), + arrow::field("f1", arrow::int32(), /*nullable=*/false), + arrow::field("f2", arrow::int32())}; + ASSERT_OK_AND_ASSIGN( + std::shared_ptr table_schema, + TableSchema::Create(/*schema_id=*/0, arrow::schema(fields), /*partition_keys=*/{"f1"}, + /*primary_keys=*/{"f1", "f0"}, + {{Options::BUCKET, "4"}})); + ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({{Options::BUCKET, "4"}})); + ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); + + const BinaryRow partition = CreateIntRow(10); + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + CreateIntRow(1), CreateIntRow(3), + /*level=*/1)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + CreateIntRow(3), CreateIntRow(5), /*level=*/1)); + ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append()), + "LSM conflicts detected"); + } + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + CreateIntRow(1), CreateIntRow(3), + /*level=*/1)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + CreateIntRow(4), CreateIntRow(5), /*level=*/1)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + CreateIntRow(1), CreateIntRow(3), + /*level=*/0)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + CreateIntRow(2), CreateIntRow(5), /*level=*/0)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + CreateIntRow(1), CreateIntRow(3), + /*level=*/1, /*bucket=*/0)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), + CreateIntRow(2), CreateIntRow(5), /*level=*/1, + /*bucket=*/1)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } + { + std::vector base_entries; + base_entries.push_back(CreateManifestEntry("base", partition, FileKind::Add(), + CreateIntRow(1), CreateIntRow(3), + /*level=*/1)); + std::vector changes; + changes.push_back(CreateManifestEntry("delta", CreateIntRow(20), FileKind::Add(), + CreateIntRow(2), CreateIntRow(5), /*level=*/1)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, + Snapshot::CommitKind::Append())); + } +} + +} // namespace paimon::test diff --git a/src/paimon/core/operation/commit/manifest_entry_changes.cpp b/src/paimon/core/operation/commit/manifest_entry_changes.cpp new file mode 100644 index 000000000..9e7381275 --- /dev/null +++ b/src/paimon/core/operation/commit/manifest_entry_changes.cpp @@ -0,0 +1,137 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/operation/commit/manifest_entry_changes.h" + +#include + +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" +#include "paimon/core/io/compact_increment.h" +#include "paimon/core/io/data_increment.h" + +namespace paimon { + +ManifestEntryChanges::ManifestEntryChanges(int32_t default_num_bucket) + : default_num_bucket_(default_num_bucket) {} + +Status ManifestEntryChanges::Collect(const std::shared_ptr& message) { + auto commit_message = std::dynamic_pointer_cast(message); + if (!commit_message) { + return Status::Invalid("fail to cast commit message to commit message impl"); + } + + DataIncrement new_files_increment = commit_message->GetNewFilesIncrement(); + for (const std::shared_ptr& file : new_files_increment.NewFiles()) { + append_table_files.push_back(MakeEntry(FileKind::Add(), commit_message, file)); + } + for (const std::shared_ptr& file : new_files_increment.DeletedFiles()) { + append_table_files.push_back(MakeEntry(FileKind::Delete(), commit_message, file)); + } + for (const std::shared_ptr& file : new_files_increment.ChangelogFiles()) { + append_changelog.push_back(MakeEntry(FileKind::Add(), commit_message, file)); + } + for (const std::shared_ptr& file : new_files_increment.DeletedIndexFiles()) { + append_index_files.emplace_back(FileKind::Delete(), commit_message->Partition(), + commit_message->Bucket(), file); + } + for (const std::shared_ptr& file : new_files_increment.NewIndexFiles()) { + append_index_files.emplace_back(FileKind::Add(), commit_message->Partition(), + commit_message->Bucket(), file); + } + + CompactIncrement compact_increment = commit_message->GetCompactIncrement(); + for (const std::shared_ptr& file : compact_increment.CompactBefore()) { + compact_table_files.push_back(MakeEntry(FileKind::Delete(), commit_message, file)); + } + for (const std::shared_ptr& file : compact_increment.CompactAfter()) { + compact_table_files.push_back(MakeEntry(FileKind::Add(), commit_message, file)); + } + for (const std::shared_ptr& file : compact_increment.ChangelogFiles()) { + compact_changelog.push_back(MakeEntry(FileKind::Add(), commit_message, file)); + } + for (const std::shared_ptr& file : compact_increment.DeletedIndexFiles()) { + compact_index_files.emplace_back(FileKind::Delete(), commit_message->Partition(), + commit_message->Bucket(), file); + } + for (const std::shared_ptr& file : compact_increment.NewIndexFiles()) { + compact_index_files.emplace_back(FileKind::Add(), commit_message->Partition(), + commit_message->Bucket(), file); + } + + return Status::OK(); +} + +bool ManifestEntryChanges::HasAppendChanges() const { + return !append_table_files.empty() || !append_changelog.empty() || !append_index_files.empty(); +} + +bool ManifestEntryChanges::HasCompactChanges() const { + return !compact_table_files.empty() || !compact_changelog.empty() || + !compact_index_files.empty(); +} + +std::string ManifestEntryChanges::ToString() const { + std::vector parts; + if (!append_table_files.empty()) { + parts.push_back(fmt::format("{} append table files", append_table_files.size())); + } + if (!append_changelog.empty()) { + parts.push_back(fmt::format("{} append Changelogs", append_changelog.size())); + } + if (!append_index_files.empty()) { + parts.push_back(fmt::format("{} append index files", append_index_files.size())); + } + if (!compact_table_files.empty()) { + parts.push_back(fmt::format("{} compact table files", compact_table_files.size())); + } + if (!compact_changelog.empty()) { + parts.push_back(fmt::format("{} compact Changelogs", compact_changelog.size())); + } + if (!compact_index_files.empty()) { + parts.push_back(fmt::format("{} compact index files", compact_index_files.size())); + } + return fmt::format("{}", fmt::join(parts, ", ")); +} + +std::vector ManifestEntryChanges::ChangedPartitions( + const std::vector& data_file_changes, + const std::vector& index_file_changes) { + std::unordered_set changed_partitions; + for (const ManifestEntry& file : data_file_changes) { + changed_partitions.insert(file.Partition()); + } + for (const IndexManifestEntry& file : index_file_changes) { + if (file.index_file->IndexType() == DeletionVectorsIndexFile::DELETION_VECTORS_INDEX || + file.index_file->GetGlobalIndexMeta()) { + changed_partitions.insert(file.partition); + } + } + return std::vector(changed_partitions.begin(), changed_partitions.end()); +} + +ManifestEntry ManifestEntryChanges::MakeEntry( + const FileKind& kind, const std::shared_ptr& commit_message, + const std::shared_ptr& file) const { + int32_t total_buckets = commit_message->TotalBuckets() == std::nullopt + ? default_num_bucket_ + : commit_message->TotalBuckets().value(); + return ManifestEntry(kind, commit_message->Partition(), commit_message->Bucket(), + total_buckets, file); +} + +} // namespace paimon diff --git a/src/paimon/core/operation/commit/manifest_entry_changes.h b/src/paimon/core/operation/commit/manifest_entry_changes.h new file mode 100644 index 000000000..84e9b2b91 --- /dev/null +++ b/src/paimon/core/operation/commit/manifest_entry_changes.h @@ -0,0 +1,69 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paimon/commit_message.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/table/sink/commit_message_impl.h" +#include "paimon/status.h" + +namespace paimon { + +class ManifestEntryChanges { + public: + explicit ManifestEntryChanges(int32_t default_num_bucket); + + Status Collect(const std::shared_ptr& message); + + bool HasAppendChanges() const; + + bool HasCompactChanges() const; + + std::string ToString() const; + + static std::vector ChangedPartitions( + const std::vector& data_file_changes, + const std::vector& index_file_changes); + + public: + std::vector append_table_files; + std::vector append_changelog; + std::vector append_index_files; + std::vector compact_table_files; + std::vector compact_changelog; + std::vector compact_index_files; + + private: + ManifestEntry MakeEntry(const FileKind& kind, + const std::shared_ptr& commit_message, + const std::shared_ptr& file) const; + + private: + int32_t default_num_bucket_; +}; + +} // namespace paimon diff --git a/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp b/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp new file mode 100644 index 000000000..c1f224e8f --- /dev/null +++ b/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp @@ -0,0 +1,168 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/operation/commit/manifest_entry_changes.h" + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/data/binary_row_writer.h" +#include "paimon/core/index/global_index_meta.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/io/compact_increment.h" +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/io/data_increment.h" +#include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/core/table/sink/commit_message_impl.h" +#include "paimon/core/stats/simple_stats.h" +#include "paimon/data/timestamp.h" +#include "paimon/defs.h" +#include "paimon/memory/bytes.h" +#include "paimon/testing/utils/testharness.h" + +#include + +namespace paimon::test { + +class ManifestEntryChangesTest : public testing::Test { + protected: + BinaryRow CreateIntRow(int32_t value) const { + BinaryRow row(1); + BinaryRowWriter writer(&row, 20, GetDefaultPool().get()); + writer.WriteInt(0, value); + writer.Complete(); + return row; + } + + std::shared_ptr CreateDataFileMeta(const std::string& file_name) const { + return std::make_shared( + file_name, 1024, 8, DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, + /*max_seq_no=*/32, + /*schema_id=*/1, /*level=*/2, + /*extra_files=*/std::vector>(), + /*creation_time=*/Timestamp(0, 0), + /*delete_row_count=*/std::nullopt, + /*embedded_index=*/nullptr, /*file_source=*/std::nullopt, + /*external_path=*/std::nullopt, + /*value_stats_cols=*/std::nullopt, /*first_row_id=*/std::nullopt, + /*write_cols=*/std::nullopt); + } + + std::shared_ptr CreateIndexFileMeta(const std::string& file_name, + const std::string& index_type = "bitmap") const { + return std::make_shared( + index_type, file_name, /*file_size=*/100, /*row_count=*/5, + /*dv_ranges=*/std::nullopt, /*external_path=*/std::nullopt, + /*global_index_meta=*/std::nullopt); + } + + std::shared_ptr CreateGlobalIndexFileMeta(const std::string& file_name) const { + GlobalIndexMeta global_index(/*row_range_start=*/0, /*row_range_end=*/3, + /*index_field_id=*/1, /*extra_field_ids=*/std::nullopt, + std::make_shared("meta", GetDefaultPool().get())); + return std::make_shared( + "HASH", file_name, /*file_size=*/100, /*row_count=*/5, + /*dv_ranges=*/std::nullopt, /*external_path=*/std::nullopt, global_index); + } +}; + +TEST_F(ManifestEntryChangesTest, TestCollectAndSummary) { + const BinaryRow partition = CreateIntRow(10); + + DataIncrement data_increment( + {CreateDataFileMeta("append-add")}, + {CreateDataFileMeta("append-del")}, + {CreateDataFileMeta("append-changelog")}, + {CreateIndexFileMeta("append-index-add")}, + {CreateIndexFileMeta("append-index-del")}); + CompactIncrement compact_increment( + {CreateDataFileMeta("compact-before")}, + {CreateDataFileMeta("compact-after")}, + {CreateDataFileMeta("compact-changelog")}, + {CreateIndexFileMeta("compact-index-add")}, + {CreateIndexFileMeta("compact-index-del")}); + + std::shared_ptr message = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/4, data_increment, compact_increment); + + ManifestEntryChanges changes(/*default_num_bucket=*/8); + ASSERT_OK(changes.Collect(message)); + + ASSERT_EQ(2u, changes.append_table_files.size()); + ASSERT_EQ(1u, changes.append_changelog.size()); + ASSERT_EQ(2u, changes.append_index_files.size()); + ASSERT_EQ(2u, changes.compact_table_files.size()); + ASSERT_EQ(1u, changes.compact_changelog.size()); + ASSERT_EQ(2u, changes.compact_index_files.size()); + + EXPECT_TRUE(changes.HasAppendChanges()); + EXPECT_TRUE(changes.HasCompactChanges()); + + EXPECT_EQ(FileKind::Add(), changes.append_table_files[0].Kind()); + EXPECT_EQ(FileKind::Delete(), changes.append_table_files[1].Kind()); + EXPECT_EQ(4, changes.append_table_files[0].TotalBuckets()); + + std::string summary = changes.ToString(); + EXPECT_NE(std::string::npos, summary.find("2 append table files")); + EXPECT_NE(std::string::npos, summary.find("1 append Changelogs")); + EXPECT_NE(std::string::npos, summary.find("2 compact index files")); +} + +TEST_F(ManifestEntryChangesTest, TestCollectInvalidCommitMessageType) { + ManifestEntryChanges changes(/*default_num_bucket=*/8); + std::shared_ptr invalid_message = std::make_shared(); + ASSERT_NOK_WITH_MSG(changes.Collect(invalid_message), + "fail to cast commit message to commit message impl"); +} + +TEST_F(ManifestEntryChangesTest, TestChangedPartitionsIncludesDvAndGlobalIndex) { + const BinaryRow partition_data = CreateIntRow(10); + const BinaryRow partition_dv = CreateIntRow(20); + const BinaryRow partition_global = CreateIntRow(30); + const BinaryRow partition_plain_index = CreateIntRow(40); + + std::vector data_changes; + data_changes.emplace_back(FileKind::Add(), partition_data, /*bucket=*/0, /*total_buckets=*/2, + CreateDataFileMeta("data-file")); + + std::vector index_changes; + index_changes.emplace_back(FileKind::Add(), partition_dv, /*bucket=*/0, + CreateIndexFileMeta("dv-file", "DELETION_VECTORS")); + index_changes.emplace_back(FileKind::Add(), partition_global, /*bucket=*/0, + CreateGlobalIndexFileMeta("global-index")); + index_changes.emplace_back(FileKind::Add(), partition_plain_index, /*bucket=*/0, + CreateIndexFileMeta("plain-index", "bitmap")); + + std::vector changed = + ManifestEntryChanges::ChangedPartitions(data_changes, index_changes); + + auto contains = [&changed](const BinaryRow& target) { + return std::find(changed.begin(), changed.end(), target) != changed.end(); + }; + + EXPECT_TRUE(contains(partition_data)); + EXPECT_TRUE(contains(partition_dv)); + EXPECT_TRUE(contains(partition_global)); + EXPECT_FALSE(contains(partition_plain_index)); +} + +} // namespace paimon::test diff --git a/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp b/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp new file mode 100644 index 000000000..1acb0ea0f --- /dev/null +++ b/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp @@ -0,0 +1,213 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/core/operation/commit/row_id_column_conflict_checker.h" + +#include +#include + +#include "paimon/common/table/special_fields.h" +#include "paimon/core/manifest/manifest_entry.h" +#include "paimon/status.h" + +namespace paimon { + +namespace { + +struct DataFileRangeItem { + Range range; + const DataFileMeta* file; +}; + +} // namespace + +Result> RowIdColumnConflictChecker::FromManifestEntries( + const std::shared_ptr& schema_manager, + const std::vector& delta_entries) { + auto checker = std::shared_ptr( + new RowIdColumnConflictChecker(schema_manager)); + + std::vector row_id_files; + row_id_files.reserve(delta_entries.size()); + for (const auto& entry : delta_entries) { + if (entry.File()->first_row_id.has_value()) { + row_id_files.push_back(entry.File().get()); + } + } + + PAIMON_RETURN_NOT_OK(checker->BuildWriteRanges(row_id_files)); + return checker; +} + +Status RowIdColumnConflictChecker::BuildWriteRanges( + const std::vector& row_id_files) { + if (row_id_files.empty()) { + write_ranges_.clear(); + return Status::OK(); + } + + std::vector sorted_files; + sorted_files.reserve(row_id_files.size()); + for (const auto* file : row_id_files) { + const int64_t from = file->first_row_id.value(); + sorted_files.push_back(DataFileRangeItem{Range(from, from + file->row_count - 1), file}); + } + + std::sort(sorted_files.begin(), sorted_files.end(), + [](const DataFileRangeItem& a, const DataFileRangeItem& b) { + if (a.range.from != b.range.from) { + return a.range.from < b.range.from; + } + return a.range.to < b.range.to; + }); + + write_ranges_.clear(); + + size_t i = 0; + while (i < sorted_files.size()) { + int64_t merged_from = sorted_files[i].range.from; + int64_t merged_to = sorted_files[i].range.to; + std::unordered_set field_ids; + PAIMON_RETURN_NOT_OK(AddWriteFieldIds(&field_ids, *sorted_files[i].file)); + + size_t j = i + 1; + while (j < sorted_files.size() && sorted_files[j].range.from <= merged_to) { + merged_to = std::max(merged_to, sorted_files[j].range.to); + PAIMON_RETURN_NOT_OK(AddWriteFieldIds(&field_ids, *sorted_files[j].file)); + ++j; + } + + write_ranges_.push_back(WriteRange{Range(merged_from, merged_to), std::move(field_ids)}); + i = j; + } + + std::sort(write_ranges_.begin(), write_ranges_.end(), + [](const WriteRange& a, const WriteRange& b) { + if (a.range.from != b.range.from) { + return a.range.from < b.range.from; + } + return a.range.to < b.range.to; + }); + + return Status::OK(); +} + +Status RowIdColumnConflictChecker::AddWriteFieldIds( + std::unordered_set* field_ids, const DataFileMeta& file) { + if (!file.write_cols.has_value()) { + const std::map* field_id_by_name = nullptr; + PAIMON_RETURN_NOT_OK(FieldIdByName(file.schema_id, &field_id_by_name)); + for (const auto& entry : *field_id_by_name) { + field_ids->insert(entry.second); + } + return Status::OK(); + } + + for (const auto& write_col : file.write_cols.value()) { + PAIMON_ASSIGN_OR_RAISE(std::optional field_id, ResolveFieldId(file, write_col)); + if (field_id.has_value()) { + field_ids->insert(field_id.value()); + } + } + + return Status::OK(); +} + +bool RowIdColumnConflictChecker::ConflictsWith(const DataFileMeta& file) const { + if (!file.first_row_id.has_value()) { + return false; + } + + Range target(file.first_row_id.value(), file.first_row_id.value() + file.row_count - 1); + + int low = 0; + int high = static_cast(write_ranges_.size()); + while (low < high) { + const int mid = (low + high) >> 1; + if (write_ranges_[mid].range.to < target.from) { + low = mid + 1; + } else { + high = mid; + } + } + + for (int index = low; index < static_cast(write_ranges_.size()); ++index) { + const auto& write_range = write_ranges_[index]; + if (write_range.range.from > target.to) { + return false; + } + if (!Range::HasIntersection(write_range.range, target)) { + continue; + } + + if (!file.write_cols.has_value()) { + return true; + } + + for (const auto& write_col : file.write_cols.value()) { + auto field_id_result = ResolveFieldId(file, write_col); + if (!field_id_result.ok()) { + return false; + } + const auto& field_id = field_id_result.value(); + if (field_id.has_value() && write_range.field_ids.count(field_id.value()) > 0) { + return true; + } + } + } + + return false; +} + +Result> RowIdColumnConflictChecker::ResolveFieldId( + const DataFileMeta& file, const std::string& write_col) const { + const std::map* field_id_by_name = nullptr; + PAIMON_RETURN_NOT_OK(FieldIdByName(file.schema_id, &field_id_by_name)); + auto it = field_id_by_name->find(write_col); + if (it != field_id_by_name->end()) { + return std::optional(it->second); + } + + if (SpecialFields::IsSpecialFieldName(write_col)) { + return std::optional(); + } + + return Status::Invalid( + "Cannot find write column '" + write_col + "' in schema " + + std::to_string(file.schema_id) + "."); +} + +Status RowIdColumnConflictChecker::FieldIdByName( + int64_t schema_id, const std::map** field_id_by_name) const { + auto cache_it = field_id_by_name_cache_.find(schema_id); + if (cache_it != field_id_by_name_cache_.end()) { + *field_id_by_name = &cache_it->second; + return Status::OK(); + } + + PAIMON_ASSIGN_OR_RAISE(auto schema, schema_manager_->ReadSchema(schema_id)); + std::map mapping; + for (const auto& field : schema->Fields()) { + mapping[field.Name()] = field.Id(); + } + + auto [it, inserted] = field_id_by_name_cache_.emplace(schema_id, std::move(mapping)); + (void)inserted; + *field_id_by_name = &it->second; + return Status::OK(); +} + +} // namespace paimon diff --git a/src/paimon/core/operation/commit/row_id_column_conflict_checker.h b/src/paimon/core/operation/commit/row_id_column_conflict_checker.h new file mode 100644 index 000000000..0a7c3ea29 --- /dev/null +++ b/src/paimon/core/operation/commit/row_id_column_conflict_checker.h @@ -0,0 +1,70 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paimon/core/io/data_file_meta.h" +#include "paimon/core/schema/schema_manager.h" +#include "paimon/utils/range.h" + +namespace paimon { + +class ManifestEntry; + +// Java-aligned row-id conflict checker: overlap in row-id range AND write columns. +class RowIdColumnConflictChecker { + public: + static Result> FromManifestEntries( + const std::shared_ptr& schema_manager, + const std::vector& delta_entries); + + bool IsEmpty() const { + return write_ranges_.empty(); + } + + bool ConflictsWith(const DataFileMeta& file) const; + + private: + struct WriteRange { + Range range; + std::unordered_set field_ids; + }; + + explicit RowIdColumnConflictChecker(const std::shared_ptr& schema_manager) + : schema_manager_(schema_manager) {} + + Status BuildWriteRanges(const std::vector& row_id_files); + Status AddWriteFieldIds(std::unordered_set* field_ids, const DataFileMeta& file); + Result> ResolveFieldId(const DataFileMeta& file, + const std::string& write_col) const; + Status FieldIdByName(int64_t schema_id, + const std::map** field_id_by_name) const; + + private: + std::shared_ptr schema_manager_; + mutable std::map> field_id_by_name_cache_; + std::vector write_ranges_; +}; + +} // namespace paimon diff --git a/src/paimon/core/operation/file_store_commit.cpp b/src/paimon/core/operation/file_store_commit.cpp index 82f1b6416..71f80a6fa 100644 --- a/src/paimon/core/operation/file_store_commit.cpp +++ b/src/paimon/core/operation/file_store_commit.cpp @@ -31,7 +31,6 @@ #include "paimon/core/operation/file_store_commit_impl.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" -#include "paimon/core/table/bucket_mode.h" #include "paimon/core/utils/field_mapping.h" #include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" @@ -67,15 +66,6 @@ Result> FileStoreCommit::Create( return Status::Invalid("not found latest schema"); } const auto& schema = table_schema.value(); - if (!schema->PrimaryKeys().empty() && - ctx->GetOptions().find("enable-pk-commit-in-inte-test") == ctx->GetOptions().end()) { - // Postpone bucket mode (bucket=-2) writes all data files to the bucket-postpone/ directory. - // A compaction job will later redistribute files into real buckets. The commit logic - // (manifest and snapshot generation) is the same as append tables, so we allow it. - if (schema->NumBuckets() != BucketModeDefine::POSTPONE_BUCKET) { - return Status::NotImplemented("not support pk table commit yet"); - } - } auto opts = schema->Options(); for (const auto& [key, value] : ctx->GetOptions()) { opts[key] = value; diff --git a/src/paimon/core/operation/file_store_commit_impl.cpp b/src/paimon/core/operation/file_store_commit_impl.cpp index 3f263ab19..5bb914e8d 100644 --- a/src/paimon/core/operation/file_store_commit_impl.cpp +++ b/src/paimon/core/operation/file_store_commit_impl.cpp @@ -55,6 +55,7 @@ #include "paimon/core/manifest/manifest_list.h" #include "paimon/core/manifest/partition_entry.h" #include "paimon/core/operation/append_only_file_store_scan.h" +#include "paimon/core/operation/commit/conflict_detection.h" #include "paimon/core/operation/expire_snapshots.h" #include "paimon/core/operation/file_store_scan.h" #include "paimon/core/operation/manifest_file_merger.h" @@ -75,6 +76,30 @@ namespace paimon { class Executor; class MemoryPool; +namespace { + +bool MatchPartitionSpec(const std::map& partition, + const std::map& partition_spec) { + for (const auto& [key, value] : partition_spec) { + auto iter = partition.find(key); + if (iter == partition.end() || iter->second != value) { + return false; + } + } + return true; +} + +bool HasGlobalIndexFileAdditions(const std::vector& index_entries) { + for (const IndexManifestEntry& index_entry : index_entries) { + if (index_entry.kind == FileKind::Add() && index_entry.index_file->GetGlobalIndexMeta()) { + return true; + } + } + return false; +} + +} // namespace + FileStoreCommitImpl::FileStoreCommitImpl( const std::shared_ptr& pool, const std::shared_ptr& executor, const std::shared_ptr& schema, const std::string& root_path, @@ -101,6 +126,8 @@ FileStoreCommitImpl::FileStoreCommitImpl( ignore_empty_commit_(ignore_empty_commit), num_bucket_(options.GetBucket()), table_schema_(table_schema), + conflict_detection_(table_schema, options, snapshot_manager_, manifest_list, + manifest_file), manifest_file_(manifest_file), manifest_list_(manifest_list), index_manifest_file_(index_manifest_file), @@ -128,7 +155,7 @@ Status FileStoreCommitImpl::DropPartition( } std::string log_msg = fmt::format("Ready to drop partitions {}", partitions); PAIMON_LOG_DEBUG(logger_, "%s", log_msg.c_str()); - return TryOverwrite(partitions, {}, commit_identifier, std::nullopt); + return TryOverwrite(partitions, {}, /*index_entries=*/{}, commit_identifier, std::nullopt); } Result FileStoreCommitImpl::FilterAndCommit( @@ -263,20 +290,18 @@ Status FileStoreCommitImpl::Overwrite( std::optional watermark) { std::shared_ptr committable = CreateManifestCommittable(identifier, commit_messages, watermark); - std::vector append_table_files; - std::vector append_changelog_files; - std::vector compact_table_files; - std::vector compact_changelog_files; - std::vector append_table_index_files; - std::vector compact_table_index_files; - PAIMON_RETURN_NOT_OK(CollectChanges(committable->FileCommittables(), &append_table_files, - &append_changelog_files, &compact_table_files, - &compact_changelog_files, &append_table_index_files, - &compact_table_index_files)); - if (!append_table_index_files.empty()) { - return Status::NotImplemented("Overwrite not support index for now"); - } - return TryOverwrite(partitions, append_table_files, identifier, watermark); + PAIMON_ASSIGN_OR_RAISE(ManifestEntryChanges changes, + CollectManifestEntryChanges(committable->FileCommittables())); + PAIMON_RETURN_NOT_OK(TryOverwrite(partitions, changes.append_table_files, + changes.append_index_files, identifier, watermark)); + if (changes.HasCompactChanges()) { + PAIMON_RETURN_NOT_OK(TryCommit(changes.compact_table_files, + changes.compact_index_files, identifier, watermark, + /*log_offsets=*/{}, /*properties=*/{}, + Snapshot::CommitKind::Compact(), + /*check_append_files=*/true)); + } + return Status::OK(); } Result FileStoreCommitImpl::FilterAndOverwrite( @@ -290,20 +315,18 @@ Result FileStoreCommitImpl::FilterAndOverwrite( PAIMON_ASSIGN_OR_RAISE(std::vector> actual_committables, FilterCommitted(committables)); if (!actual_committables.empty()) { - std::vector append_table_files; - std::vector append_changelog_files; - std::vector compact_table_files; - std::vector compact_changelog_files; - std::vector append_table_index_files; - std::vector compact_table_index_files; - PAIMON_RETURN_NOT_OK(CollectChanges(actual_committables[0]->FileCommittables(), - &append_table_files, &append_changelog_files, - &compact_table_files, &compact_changelog_files, - &append_table_index_files, &compact_table_index_files)); - if (!append_table_index_files.empty()) { - return Status::NotImplemented("FilterAndOverwrite not support index for now"); - } - PAIMON_RETURN_NOT_OK(TryOverwrite(partitions, append_table_files, identifier, watermark)); + PAIMON_ASSIGN_OR_RAISE( + ManifestEntryChanges changes, + CollectManifestEntryChanges(actual_committables[0]->FileCommittables())); + PAIMON_RETURN_NOT_OK(TryOverwrite(partitions, changes.append_table_files, + changes.append_index_files, identifier, watermark)); + if (changes.HasCompactChanges()) { + PAIMON_RETURN_NOT_OK(TryCommit(changes.compact_table_files, + changes.compact_index_files, identifier, + watermark, /*log_offsets=*/{}, + /*properties=*/{}, Snapshot::CommitKind::Compact(), + /*check_append_files=*/true)); + } } return actual_committables.size(); } @@ -326,15 +349,55 @@ Result> FileStoreCommitImpl::GetAllFiles( return plan->Files(); } +Result> FileStoreCommitImpl::GetAllIndexFiles( + const Snapshot& snapshot, + const std::vector>& partitions) const { + std::vector index_entries; + if (!snapshot.IndexManifest()) { + return index_entries; + } + + auto filter = [this, &partitions](const IndexManifestEntry& entry) -> Result { + if (partitions.empty()) { + return true; + } + + Result>> part_values_result = + partition_computer_->GeneratePartitionVector(entry.partition); + if (!part_values_result.ok()) { + return part_values_result.status(); + } + const auto& part_values = part_values_result.value(); + std::map partition; + for (const auto& [key, value] : part_values) { + partition[key] = value; + } + + for (const auto& partition_spec : partitions) { + if (MatchPartitionSpec(partition, partition_spec)) { + return true; + } + } + return false; + }; + + PAIMON_RETURN_NOT_OK( + index_manifest_file_->Read(snapshot.IndexManifest().value(), filter, &index_entries)); + return index_entries; +} + Status FileStoreCommitImpl::TryOverwrite( const std::vector>& partitions, - const std::vector& changes, int64_t commit_identifier, + const std::vector& changes, + const std::vector& index_entries, int64_t commit_identifier, std::optional watermark) { int32_t retry_count = 0; + std::optional retry_start_snapshot_id; while (true) { PAIMON_ASSIGN_OR_RAISE(std::optional latest_snapshot, snapshot_manager_->LatestSnapshot()); std::vector changes_with_overwrite; + std::vector index_entries_with_overwrite = index_entries; if (latest_snapshot) { PAIMON_ASSIGN_OR_RAISE(std::vector entries, GetAllFiles(latest_snapshot.value(), partitions)); @@ -343,17 +406,29 @@ Status FileStoreCommitImpl::TryOverwrite( entry.Bucket(), entry.TotalBuckets(), entry.File()); } + + PAIMON_ASSIGN_OR_RAISE(std::vector previous_index_entries, + GetAllIndexFiles(latest_snapshot.value(), partitions)); + for (const auto& entry : previous_index_entries) { + index_entries_with_overwrite.emplace_back(FileKind::Delete(), entry.partition, + entry.bucket, entry.index_file); + } } changes_with_overwrite.insert(changes_with_overwrite.end(), changes.begin(), changes.end()); PAIMON_ASSIGN_OR_RAISE(bool commit_success, - TryCommitOnce(changes_with_overwrite, /*index_entries=*/{}, + TryCommitOnce(changes_with_overwrite, index_entries_with_overwrite, commit_identifier, watermark, /*log_offsets=*/{}, /*properties=*/{}, Snapshot::CommitKind::Overwrite(), latest_snapshot, - /*need_conflict_check=*/true)); + /*need_conflict_check=*/true, + retry_start_snapshot_id)); if (commit_success) { break; } + retry_start_snapshot_id = latest_snapshot + ? std::optional(latest_snapshot.value().Id() + 1) + : std::optional(Snapshot::FIRST_SNAPSHOT_ID); + conflict_detection_.SetRowIdCheckFromSnapshot(retry_start_snapshot_id); if (retry_count >= options_.GetCommitMaxRetries()) { return Status::Invalid( fmt::format("Commit failed after {} attempts, there maybe exist commit conflicts " @@ -367,44 +442,48 @@ Status FileStoreCommitImpl::TryOverwrite( Status FileStoreCommitImpl::Commit(const std::shared_ptr& committable, bool check_append_files) { - std::vector append_table_files; - std::vector append_changelog_files; - std::vector compact_table_files; - std::vector compact_changelog_files; - std::vector append_table_index_files; - std::vector compact_table_index_files; - PAIMON_RETURN_NOT_OK(CollectChanges(committable->FileCommittables(), &append_table_files, - &append_changelog_files, &compact_table_files, - &compact_changelog_files, &append_table_index_files, - &compact_table_index_files)); + PAIMON_ASSIGN_OR_RAISE(ManifestEntryChanges changes, + CollectManifestEntryChanges(committable->FileCommittables())); int32_t attempt = 0; int32_t generated_snapshot = 0; Duration duration; - if (!ignore_empty_commit_ || !append_table_files.empty() || !append_table_index_files.empty()) { + if (!ignore_empty_commit_ || changes.HasAppendChanges()) { + Snapshot::CommitKind append_commit_kind = Snapshot::CommitKind::Append(); + if (conflict_detection_.ShouldBeOverwriteCommit(changes.append_table_files, + changes.append_index_files)) { + append_commit_kind = Snapshot::CommitKind::Overwrite(); + check_append_files = true; + } + if (HasGlobalIndexFileAdditions(changes.append_index_files)) { + check_append_files = true; + } + PAIMON_ASSIGN_OR_RAISE(int32_t cnt, - TryCommit(append_table_files, append_table_index_files, + TryCommit(changes.append_table_files, + changes.append_index_files, committable->Identifier(), committable->Watermark(), committable->LogOffsets(), committable->Properties(), - Snapshot::CommitKind::Append(), check_append_files)); + append_commit_kind, check_append_files)); attempt += cnt; ++generated_snapshot; } - if (!compact_table_files.empty() || !compact_table_index_files.empty()) { + if (changes.HasCompactChanges()) { PAIMON_ASSIGN_OR_RAISE( - int32_t cnt, TryCommit(compact_table_files, compact_table_index_files, + int32_t cnt, TryCommit(changes.compact_table_files, + changes.compact_index_files, committable->Identifier(), committable->Watermark(), committable->LogOffsets(), committable->Properties(), Snapshot::CommitKind::Compact(), /*check_append_files=*/true)); attempt += cnt; ++generated_snapshot; } - auto table_files_added = static_cast(append_table_files.size()); + auto table_files_added = static_cast(changes.append_table_files.size()); int32_t table_files_deleted = 0; int64_t compaction_input_file_size = 0; int64_t compaction_output_file_size = 0; - for (const auto& entry : compact_table_files) { + for (const auto& entry : changes.compact_table_files) { const auto& kind = entry.Kind(); if (kind == FileKind::Add()) { ++table_files_added; @@ -418,25 +497,29 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c metrics_->SetCounter(CommitMetrics::LAST_COMMIT_ATTEMPTS, attempt); metrics_->SetCounter(CommitMetrics::LAST_TABLE_FILES_ADDED, table_files_added); metrics_->SetCounter(CommitMetrics::LAST_TABLE_FILES_DELETED, table_files_deleted); - metrics_->SetCounter(CommitMetrics::LAST_TABLE_FILES_APPENDED, append_table_files.size()); + metrics_->SetCounter(CommitMetrics::LAST_TABLE_FILES_APPENDED, + changes.append_table_files.size()); metrics_->SetCounter(CommitMetrics::LAST_TABLE_FILES_COMMIT_COMPACTED, - compact_table_files.size()); + changes.compact_table_files.size()); metrics_->SetCounter(CommitMetrics::LAST_CHANGELOG_FILES_APPENDED, - append_changelog_files.size()); + changes.append_changelog.size()); metrics_->SetCounter(CommitMetrics::LAST_CHANGELOG_FILES_COMMIT_COMPACTED, - compact_changelog_files.size()); + changes.compact_changelog.size()); metrics_->SetCounter(CommitMetrics::LAST_GENERATED_SNAPSHOTS, generated_snapshot); - metrics_->SetCounter(CommitMetrics::LAST_DELTA_RECORDS_APPENDED, RowCounts(append_table_files)); + metrics_->SetCounter(CommitMetrics::LAST_DELTA_RECORDS_APPENDED, + RowCounts(changes.append_table_files)); metrics_->SetCounter(CommitMetrics::LAST_CHANGELOG_RECORDS_APPENDED, - RowCounts(append_changelog_files)); + RowCounts(changes.append_changelog)); metrics_->SetCounter(CommitMetrics::LAST_DELTA_RECORDS_COMMIT_COMPACTED, - RowCounts(compact_table_files)); + RowCounts(changes.compact_table_files)); metrics_->SetCounter(CommitMetrics::LAST_CHANGELOG_RECORDS_COMMIT_COMPACTED, - RowCounts(compact_changelog_files)); + RowCounts(changes.compact_changelog)); metrics_->SetCounter(CommitMetrics::LAST_PARTITIONS_WRITTEN, - NumChangedPartitions({append_table_files, compact_table_files})); + NumChangedPartitions( + {changes.append_table_files, changes.compact_table_files})); metrics_->SetCounter(CommitMetrics::LAST_BUCKETS_WRITTEN, - NumChangedBuckets({append_table_files, compact_table_files})); + NumChangedBuckets({changes.append_table_files, + changes.compact_table_files})); metrics_->SetCounter(CommitMetrics::LAST_COMPACTION_INPUT_FILE_SIZE, compaction_input_file_size); metrics_->SetCounter(CommitMetrics::LAST_COMPACTION_OUTPUT_FILE_SIZE, @@ -461,16 +544,22 @@ Result FileStoreCommitImpl::TryCommit(const std::vector& bool check_append_files) { int32_t retry_count = 0; int64_t start_millis = DateTimeUtils::GetCurrentUTCTimeUs() / 1000; + std::optional retry_start_snapshot_id; while (true) { PAIMON_ASSIGN_OR_RAISE(std::optional latest_snapshot, snapshot_manager_->LatestSnapshot()); PAIMON_ASSIGN_OR_RAISE( bool commit_success, TryCommitOnce(delta_files, index_entries, identifier, watermark, log_offsets, - properties, commit_kind, latest_snapshot, check_append_files)); + properties, commit_kind, latest_snapshot, check_append_files, + retry_start_snapshot_id)); if (commit_success) { break; } + retry_start_snapshot_id = latest_snapshot + ? std::optional(latest_snapshot.value().Id() + 1) + : std::optional(Snapshot::FIRST_SNAPSHOT_ID); + conflict_detection_.SetRowIdCheckFromSnapshot(retry_start_snapshot_id); int64_t current_millis = DateTimeUtils::GetCurrentUTCTimeUs() / 1000; if (current_millis - start_millis > options_.GetCommitTimeout() || retry_count >= options_.GetCommitMaxRetries()) { @@ -488,6 +577,8 @@ Result>> FileStoreCommitImpl::Change const std::vector& data_files, const std::vector& index_files) const { std::set> partitions; + std::vector changed_partitions = + ManifestEntryChanges::ChangedPartitions(data_files, index_files); auto add_partition = [&, this](const BinaryRow& partition_row) -> Status { std::vector> part_values; PAIMON_ASSIGN_OR_RAISE(part_values, @@ -503,13 +594,8 @@ Result>> FileStoreCommitImpl::Change return Status::OK(); }; - for (const ManifestEntry& entry : data_files) { - PAIMON_RETURN_NOT_OK(add_partition(entry.Partition())); - } - for (const IndexManifestEntry& entry : index_files) { - if (entry.index_file->IndexType() == DeletionVectorsIndexFile::DELETION_VECTORS_INDEX) { - PAIMON_RETURN_NOT_OK(add_partition(entry.partition)); - } + for (const BinaryRow& changed_partition : changed_partitions) { + PAIMON_RETURN_NOT_OK(add_partition(changed_partition)); } return partitions; } @@ -531,34 +617,76 @@ Result> FileStoreCommitImpl::ReadAllEntriesFromChange return plan->Files(); } +Result FileStoreCommitImpl::CheckCommitted( + const std::optional& latest_snapshot, + std::optional retry_start_snapshot_id, int64_t identifier, + const Snapshot::CommitKind& commit_kind) const { + if (!latest_snapshot || !retry_start_snapshot_id || + retry_start_snapshot_id.value() > latest_snapshot.value().Id()) { + return false; + } + + for (int64_t snapshot_id = retry_start_snapshot_id.value(); + snapshot_id <= latest_snapshot.value().Id(); ++snapshot_id) { + PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, snapshot_manager_->LoadSnapshot(snapshot_id)); + if (snapshot.CommitUser() == commit_user_ && + snapshot.CommitIdentifier() == identifier && + snapshot.GetCommitKind() == commit_kind) { + return true; + } + } + return false; +} + Status FileStoreCommitImpl::NoConflictsOrFail(const std::string& base_commit_user, const std::vector& base_entries, - const std::vector& changes) const { + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const Snapshot& latest_snapshot, + const Snapshot::CommitKind& commit_kind) const { ScopeGuard guard([&]() { PAIMON_LOG_WARN(logger_, "File deletion conflicts detected! Give up committing. %s", base_commit_user.c_str()); }); - std::vector all_entries = base_entries; - all_entries.insert(all_entries.end(), changes.begin(), changes.end()); - std::vector merged_entries; - PAIMON_RETURN_NOT_OK(FileEntry::MergeEntries(all_entries, &merged_entries)); - for (const auto& entry : merged_entries) { - if (entry.Kind() == FileKind::Delete()) { - return Status::Invalid(fmt::format( - "Trying to delete file {} which is not previously added.", entry.FileName())); - } - } - // TODO(yonghao.fyh): check for all LSM level >= 1, key ranges of files do not intersect + + std::optional> row_id_column_conflict_checker = + std::nullopt; + PAIMON_ASSIGN_OR_RAISE(auto checker, + RowIdColumnConflictChecker::FromManifestEntries(schema_manager_, + delta_entries)); + row_id_column_conflict_checker = checker; + + PAIMON_RETURN_NOT_OK(conflict_detection_.CheckConflicts( + latest_snapshot, base_entries, delta_entries, delta_index_entries, + row_id_column_conflict_checker, commit_kind)); guard.Release(); return Status::OK(); } +Status FileStoreCommitImpl::NoConflictsOrFail(const std::string& base_commit_user, + const std::vector& base_entries, + const std::vector& delta_entries, + const Snapshot& latest_snapshot, + const Snapshot::CommitKind& commit_kind) const { + return NoConflictsOrFail(base_commit_user, base_entries, delta_entries, + /*delta_index_entries=*/{}, latest_snapshot, + commit_kind); +} + Result FileStoreCommitImpl::TryCommitOnce( const std::vector& delta_entries, const std::vector& index_entries, int64_t identifier, std::optional watermark, std::map log_offsets, const std::map& properties, Snapshot::CommitKind commit_kind, - const std::optional& latest_snapshot, bool need_conflict_check) { + const std::optional& latest_snapshot, bool need_conflict_check, + std::optional retry_start_snapshot_id) { + PAIMON_ASSIGN_OR_RAISE(bool committed, + CheckCommitted(latest_snapshot, retry_start_snapshot_id, identifier, + commit_kind)); + if (committed) { + return true; + } + std::vector delta_files = delta_entries; int64_t start_millis = DateTimeUtils::GetCurrentUTCTimeUs() / 1000; int64_t new_snapshot_id = Snapshot::FIRST_SNAPSHOT_ID; @@ -583,7 +711,8 @@ Result FileStoreCommitImpl::TryCommitOnce( std::vector base_data_files, ReadAllEntriesFromChangedPartitions(latest_snapshot.value(), changed_partitions)); PAIMON_RETURN_NOT_OK( - NoConflictsOrFail(latest_snapshot.value().CommitUser(), base_data_files, delta_files)); + NoConflictsOrFail(latest_snapshot.value().CommitUser(), base_data_files, delta_files, + index_entries, latest_snapshot.value(), commit_kind)); } std::vector merge_before_manifests; @@ -719,20 +848,11 @@ Result FileStoreCommitImpl::TryCommitOnce( Result commit_result = CommitSnapshotImpl(new_snapshot, delta_statistics); if (!commit_result.ok()) { - // commit exception, not sure about the situation and should not clean up the files. - PAIMON_LOG_WARN(logger_, "You need call FilterAndCommit to retry commit for exception. %s", + // commit exception is uncertain; retry after checking whether this commit already exists. + PAIMON_LOG_WARN(logger_, "Retry commit for exception. %s", commit_result.status().ToString().c_str()); - - // To prevent the case where an atomic write times out but actually succeeds, - // retrying the commit could lead to the snapshot file being committed multiple times. - // Therefore, retries should be handled by the upper layer, - // which should call FilterAndCommit to avoid duplicate commits. - // Therefore, we should not trigger cleanup here, - // as it may delete meta files from a snapshot that was just written by ourselves, - // leading to an incomplete or corrupted snapshot. guard.Release(); - return Status::Invalid("You need call FilterAndCommit to retry commit for exception. ", - commit_result.status().ToString()); + return false; } bool commit_success = commit_result.value(); if (commit_success) { @@ -890,77 +1010,24 @@ Status FileStoreCommitImpl::CollectChanges( std::vector* compact_changelog_files, std::vector* append_table_index_files, std::vector* compact_table_index_files) { - for (const auto& message : commit_messages) { - auto commit_message = std::dynamic_pointer_cast(message); - if (commit_message) { - DataIncrement new_files_increment = commit_message->GetNewFilesIncrement(); - for (const std::shared_ptr& new_file : new_files_increment.NewFiles()) { - append_table_files->push_back(MakeEntry(FileKind::Add(), commit_message, new_file)); - } - for (const std::shared_ptr& deleted_file : - new_files_increment.DeletedFiles()) { - append_table_files->push_back( - MakeEntry(FileKind::Delete(), commit_message, deleted_file)); - } - for (const std::shared_ptr& changelog_file : - new_files_increment.ChangelogFiles()) { - append_changelog_files->push_back( - MakeEntry(FileKind::Add(), commit_message, changelog_file)); - } - for (const std::shared_ptr& deleted_index_file : - new_files_increment.DeletedIndexFiles()) { - append_table_index_files->emplace_back( - FileKind::Delete(), commit_message->Partition(), commit_message->Bucket(), - deleted_index_file); - } - for (const std::shared_ptr& new_index_file : - new_files_increment.NewIndexFiles()) { - append_table_index_files->emplace_back(FileKind::Add(), commit_message->Partition(), - commit_message->Bucket(), new_index_file); - } - CompactIncrement compact_increment = commit_message->GetCompactIncrement(); - for (const std::shared_ptr& compact_before : - compact_increment.CompactBefore()) { - compact_table_files->push_back( - MakeEntry(FileKind::Delete(), commit_message, compact_before)); - } - for (const std::shared_ptr& compact_after : - compact_increment.CompactAfter()) { - compact_table_files->push_back( - MakeEntry(FileKind::Add(), commit_message, compact_after)); - } - for (const std::shared_ptr& changelog_file : - compact_increment.ChangelogFiles()) { - compact_changelog_files->push_back( - MakeEntry(FileKind::Add(), commit_message, changelog_file)); - } - for (const std::shared_ptr& deleted_index_file : - compact_increment.DeletedIndexFiles()) { - compact_table_index_files->emplace_back( - FileKind::Delete(), commit_message->Partition(), commit_message->Bucket(), - deleted_index_file); - } - for (const std::shared_ptr& new_index_file : - compact_increment.NewIndexFiles()) { - compact_table_index_files->emplace_back(FileKind::Add(), - commit_message->Partition(), - commit_message->Bucket(), new_index_file); - } - } else { - return Status::Invalid("fail to cast commit message to commit message impl"); - } - } + PAIMON_ASSIGN_OR_RAISE(ManifestEntryChanges changes, + CollectManifestEntryChanges(commit_messages)); + *append_table_files = std::move(changes.append_table_files); + *append_changelog_files = std::move(changes.append_changelog); + *compact_table_files = std::move(changes.compact_table_files); + *compact_changelog_files = std::move(changes.compact_changelog); + *append_table_index_files = std::move(changes.append_index_files); + *compact_table_index_files = std::move(changes.compact_index_files); return Status::OK(); } -ManifestEntry FileStoreCommitImpl::MakeEntry( - const FileKind& kind, const std::shared_ptr& commit_message, - const std::shared_ptr& file) const { - int32_t total_buckets = commit_message->TotalBuckets() == std::nullopt - ? num_bucket_ - : commit_message->TotalBuckets().value(); - return ManifestEntry(kind, commit_message->Partition(), commit_message->Bucket(), total_buckets, - file); +Result FileStoreCommitImpl::CollectManifestEntryChanges( + const std::vector>& commit_messages) { + ManifestEntryChanges changes(num_bucket_); + for (const auto& message : commit_messages) { + PAIMON_RETURN_NOT_OK(changes.Collect(message)); + } + return changes; } int64_t FileStoreCommitImpl::RowCounts(const std::vector& files) { diff --git a/src/paimon/core/operation/file_store_commit_impl.h b/src/paimon/core/operation/file_store_commit_impl.h index f54c85ac8..9ad82ef37 100644 --- a/src/paimon/core/operation/file_store_commit_impl.h +++ b/src/paimon/core/operation/file_store_commit_impl.h @@ -28,6 +28,9 @@ #include "paimon/core/catalog/snapshot_commit.h" #include "paimon/core/core_options.h" #include "paimon/core/manifest/partition_entry.h" +#include "paimon/core/operation/commit/manifest_entry_changes.h" +#include "paimon/core/operation/commit/conflict_detection.h" +#include "paimon/core/operation/commit/row_id_column_conflict_checker.h" #include "paimon/core/snapshot.h" #include "paimon/file_store_commit.h" #include "paimon/logging.h" @@ -125,13 +128,18 @@ class FileStoreCommitImpl : public FileStoreCommit { bool check_append_files); Status TryOverwrite(const std::vector>& partition, - const std::vector& changes, int64_t commit_identifier, - std::optional watermark); + const std::vector& changes, + const std::vector& index_entries, + int64_t commit_identifier, std::optional watermark); Result> GetAllFiles( const Snapshot& snapshot, const std::vector>& partitions); + Result> GetAllIndexFiles( + const Snapshot& snapshot, + const std::vector>& partitions) const; + Result>> FilterCommitted( const std::vector>& committables); @@ -139,10 +147,6 @@ class FileStoreCommitImpl : public FileStoreCommit { int64_t identifier, const std::vector>& commit_messages, std::optional watermark); - ManifestEntry MakeEntry(const FileKind& kind, - const std::shared_ptr& commit_message, - const std::shared_ptr& file) const; - Status CollectChanges(const std::vector>& commit_messages, std::vector* append_table_files, std::vector* append_changelog_files, @@ -151,6 +155,9 @@ class FileStoreCommitImpl : public FileStoreCommit { std::vector* append_table_index_files, std::vector* compact_table_index_files); + Result CollectManifestEntryChanges( + const std::vector>& commit_messages); + Result TryCommit(const std::vector& delta_files, const std::vector& index_entries, int64_t identifier, std::optional watermark, @@ -164,7 +171,8 @@ class FileStoreCommitImpl : public FileStoreCommit { const std::map& properties, Snapshot::CommitKind commit_kind, const std::optional& latest_snapshot, - bool need_conflict_check); + bool need_conflict_check, + std::optional retry_start_snapshot_id); Result CommitSnapshotImpl(const Snapshot& new_snapshot, const std::vector& delta_statistics); @@ -180,9 +188,23 @@ class FileStoreCommitImpl : public FileStoreCommit { const Snapshot& latest_snapshot, const std::set>& partitions) const; + Result CheckCommitted(const std::optional& latest_snapshot, + std::optional retry_start_snapshot_id, + int64_t identifier, + const Snapshot::CommitKind& commit_kind) const; + + Status NoConflictsOrFail(const std::string& base_commit_user, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const Snapshot& latest_snapshot, + const Snapshot::CommitKind& commit_kind) const; + Status NoConflictsOrFail(const std::string& base_commit_user, const std::vector& base_entries, - const std::vector& changes) const; + const std::vector& delta_entries, + const Snapshot& latest_snapshot, + const Snapshot::CommitKind& commit_kind) const; Status CheckFilesExistence( const std::vector>& committables) const; @@ -218,6 +240,7 @@ class FileStoreCommitImpl : public FileStoreCommit { bool ignore_empty_commit_ = true; int32_t num_bucket_ = 0; std::shared_ptr table_schema_; + ConflictDetection conflict_detection_; std::shared_ptr manifest_file_; std::shared_ptr manifest_list_; diff --git a/src/paimon/core/operation/file_store_commit_impl_test.cpp b/src/paimon/core/operation/file_store_commit_impl_test.cpp index 2ba008de3..3b0699d7a 100644 --- a/src/paimon/core/operation/file_store_commit_impl_test.cpp +++ b/src/paimon/core/operation/file_store_commit_impl_test.cpp @@ -40,9 +40,12 @@ #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/scope_guard.h" #include "paimon/core/catalog/commit_table_request.h" +#include "paimon/core/index/global_index_meta.h" #include "paimon/core/io/data_file_meta.h" #include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/file_source.h" #include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/manifest/index_manifest_file.h" #include "paimon/core/manifest/manifest_committable.h" #include "paimon/core/manifest/manifest_entry.h" #include "paimon/core/manifest/manifest_file_meta.h" @@ -50,12 +53,14 @@ #include "paimon/core/operation/metrics/commit_metrics.h" #include "paimon/core/partition/partition_statistics.h" #include "paimon/core/stats/simple_stats.h" +#include "paimon/core/schema/table_schema.h" #include "paimon/core/table/sink/commit_message_impl.h" #include "paimon/core/utils/file_utils.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/factories/factory_creator.h" +#include "paimon/memory/bytes.h" #include "paimon/fs/file_system.h" #include "paimon/fs/local/local_file_system.h" #include "paimon/fs/local/local_file_system_factory.h" @@ -67,6 +72,38 @@ #include "paimon/testing/utils/timezone_guard.h" namespace paimon::test { + +namespace { + +Snapshot MakeSnapshot(const std::optional& next_row_id, + const Snapshot::CommitKind& commit_kind = + Snapshot::CommitKind::Append()) { + return Snapshot( + /*id=*/1, + /*schema_id=*/1, + /*base_manifest_list=*/"base-manifest-list", + /*base_manifest_list_size=*/std::nullopt, + /*delta_manifest_list=*/"delta-manifest-list", + /*delta_manifest_list_size=*/std::nullopt, + /*changelog_manifest_list=*/std::nullopt, + /*changelog_manifest_list_size=*/std::nullopt, + /*index_manifest=*/std::nullopt, + /*commit_user=*/"test-user", + /*commit_identifier=*/1, + commit_kind, + /*time_millis=*/0, + /*log_offsets=*/std::nullopt, + /*total_record_count=*/std::nullopt, + /*delta_record_count=*/std::nullopt, + /*changelog_record_count=*/std::nullopt, + /*watermark=*/std::nullopt, + /*statistics=*/std::nullopt, + /*properties=*/std::nullopt, + next_row_id); +} + +} // namespace + class GmockFileSystem : public LocalFileSystem { public: MOCK_METHOD(Status, ReadFile, (const std::string& path, std::string* content), (override)); @@ -158,11 +195,18 @@ class FileStoreCommitImplTest : public testing::Test { ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, const FileKind& kind) const { + return CreateManifestEntry(file_name, partition, kind, DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/2, /*bucket=*/0); + } + + ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, + const FileKind& kind, const BinaryRow& min_key, + const BinaryRow& max_key, int32_t level, + int32_t bucket = 0, int32_t total_buckets = 2) const { auto data_file_meta = std::make_shared( - file_name, 1024, 8, DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), - SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, - /*max_seq_no=*/32, - /*schema_id=*/1, /*level=*/2, + file_name, 1024, 8, min_key, max_key, SimpleStats::EmptyStats(), + SimpleStats::EmptyStats(), /*min_seq_no=*/16, /*max_seq_no=*/32, + /*schema_id=*/1, level, /*extra_files=*/std::vector>(), /*creation_time=*/Timestamp(0, 0), /*delete_row_count=*/3, @@ -170,7 +214,15 @@ class FileStoreCommitImplTest : public testing::Test { /*external_path=*/std::nullopt, /*value_stats_cols=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); - return ManifestEntry(kind, partition, 0, 2, data_file_meta); + return ManifestEntry(kind, partition, bucket, total_buckets, data_file_meta); + } + + BinaryRow CreateIntRow(int32_t value) const { + BinaryRow row(1); + BinaryRowWriter writer(&row, 20, GetDefaultPool().get()); + writer.WriteInt(0, value); + writer.Complete(); + return row; } ManifestEntry CreateManifestEntryWithNoPartition(const std::string& file_name, @@ -198,6 +250,41 @@ class FileStoreCommitImplTest : public testing::Test { return result; } + std::shared_ptr CreateIndexFileMeta(const std::string& file_name, + const std::string& index_type = "bitmap") { + return std::make_shared(index_type, file_name, /*file_size=*/100, + /*row_count=*/5, /*dv_ranges=*/std::nullopt, + /*external_path=*/std::nullopt, + /*global_index_meta=*/std::nullopt); + } + + std::shared_ptr CreateGlobalIndexFileMeta(const std::string& file_name, + int64_t row_range_start, + int64_t row_range_end) { + GlobalIndexMeta global_index(row_range_start, row_range_end, /*index_field_id=*/1, + /*extra_field_ids=*/std::nullopt, + std::make_shared("meta", GetDefaultPool().get())); + return std::make_shared( + "HASH", file_name, /*file_size=*/100, /*row_count=*/5, + /*dv_ranges=*/std::nullopt, /*external_path=*/std::nullopt, global_index); + } + + std::shared_ptr CreateAppendDataFileMeta(const std::string& file_name, + int64_t row_count) { + return std::make_shared( + file_name, 1024, row_count, DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, + /*max_seq_no=*/32, + /*schema_id=*/1, /*level=*/2, + /*extra_files=*/std::vector>(), + /*creation_time=*/Timestamp(0, 0), + /*delete_row_count=*/std::nullopt, + /*embedded_index=*/nullptr, FileSource::Append(), + /*external_path=*/std::nullopt, + /*value_stats_cols=*/std::nullopt, /*first_row_id=*/std::nullopt, + /*write_cols=*/std::nullopt); + } + bool IsStringInSet(const std::set& strSet, const std::string& target) { return strSet.find(target) != strSet.end(); } @@ -422,19 +509,17 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithConflictSnapshotAndRetryOnce) { ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); std::string latest_hint = PathUtil::JoinPath(table_path, "snapshot/LATEST"); auto* mock_fs = dynamic_cast(fs.get()); + EXPECT_CALL(*mock_fs, ReadFile(testing::_, testing::_)) + .Times(testing::AnyNumber()) + .WillRepeatedly(testing::Invoke([&](const std::string& path, std::string* content) { + return mock_fs->FileSystem::ReadFile(path, content); + })); EXPECT_CALL(*mock_fs, ReadFile(testing::StrEq(latest_hint), testing::_)) .WillRepeatedly(testing::Invoke([](const std::string& path, std::string* content) { *content = "-1"; return Status::OK(); })); - EXPECT_CALL( - *mock_fs, - ReadFile(testing::StrEq(PathUtil::JoinPath(table_path, "snapshot/snapshot-5")), testing::_)) - .WillOnce(testing::Invoke([&](const std::string& path, std::string* content) { - return mock_fs->FileSystem::ReadFile(path, content); - })); - EXPECT_CALL(*mock_fs, ListDir(testing::_, testing::_)).Times(testing::AnyNumber()); EXPECT_CALL(*mock_fs, ListDir(testing::StrEq(PathUtil::JoinPath(table_path, "snapshot")), testing::_)) @@ -495,7 +580,12 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithAtomicWriteSnapshotTimeoutAndActua "/orc/append_09.db/append_09/commit_messages/commit_messages-01", /*version=*/3); ASSERT_GT(msgs.size(), 0); - ASSERT_NOK(commit->Commit(msgs, /*commit_identifier=*/1)); + ASSERT_OK(commit->Commit(msgs, /*commit_identifier=*/1)); + std::shared_ptr metrics = commit->GetCommitMetrics(); + ASSERT_TRUE(metrics); + ASSERT_OK_AND_ASSIGN(uint64_t counter, + metrics->GetCounter(CommitMetrics::LAST_COMMIT_ATTEMPTS)); + ASSERT_EQ(2u, counter); ASSERT_OK_AND_ASSIGN( bool exist, file_system_->Exists(PathUtil::JoinPath(table_path, "snapshot/snapshot-6"))); ASSERT_TRUE(exist); @@ -508,8 +598,6 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithAtomicWriteSnapshotTimeoutAndActua .Finish()); ASSERT_OK_AND_ASSIGN(auto commit_2, FileStoreCommit::Create(std::move(commit_context_2))); - ASSERT_OK_AND_ASSIGN(int32_t num_committed, commit_2->FilterAndCommit({{1, msgs}})); - ASSERT_EQ(0, num_committed); std::string new_snapshot_7 = PathUtil::JoinPath(table_path, "snapshot/snapshot-7"); EXPECT_CALL(*mock_fs, AtomicStore(testing::StrEq(new_snapshot_7), testing::_)) .WillOnce(testing::Invoke([&](const std::string& path, const std::string& content) { @@ -738,18 +826,17 @@ TEST_F(FileStoreCommitImplTest, TestCommitSuccessAfterIOException) { io_hook->Reset(i, IOHook::Mode::RETURN_ERROR); auto status = commit->Commit(msgs); io_hook->Clear(); + ASSERT_OK_AND_ASSIGN(bool exist2, file_system_->Exists(PathUtil::JoinPath( + table_path_, "snapshot/snapshot-2"))); // termination conditions: // 1. status does not hit IOHook, already touch all IO operation // 2. hit IOHook in commit hint, at this point, the snapshot is already committed - if (!HitIOHook(status) || HitIOHookInCommitHint(status)) { + // 3. snapshot file exists, which means atomic store succeeded even if a later IO failed + if (!HitIOHook(status) || HitIOHookInCommitHint(status) || exist2) { scanned_all_io_hook = true; - ASSERT_OK_AND_ASSIGN(bool exist2, file_system_->Exists(PathUtil::JoinPath( - table_path_, "snapshot/snapshot-2"))); ASSERT_TRUE(exist2); break; } - ASSERT_OK_AND_ASSIGN(bool exist2, file_system_->Exists(PathUtil::JoinPath( - table_path_, "snapshot/snapshot-2"))); ASSERT_FALSE(exist2); std::vector actual_snapshots; ASSERT_OK( @@ -868,7 +955,6 @@ TEST_F(FileStoreCommitImplTest, TestCleanUpTmpManifests) { ASSERT_OK_AND_ASSIGN(exist, file_system_->Exists(PathUtil::JoinPath( table_path_, "manifest/" + index_manifest.value()))); ASSERT_FALSE(exist); - commit_impl->CleanUpTmpManifests( snapshot.value().BaseManifestList(), snapshot.value().DeltaManifestList(), /*old_metas=*/{}, /*new_metas=*/previous_manifests, /*old_index_manifest=*/std::nullopt, @@ -925,7 +1011,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Delete())); changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); - ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_OK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -942,7 +1030,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Delete())); changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); - ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_OK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -954,7 +1044,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_NOK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -970,7 +1062,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_NOK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -986,7 +1080,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_NOK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -994,7 +1090,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { base_entries.push_back(CreateManifestEntry("file1", FileKind::Delete())); std::vector changes; - ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_OK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1005,7 +1103,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { base_entries.push_back(CreateManifestEntry("file5", FileKind::Delete())); std::vector changes; - ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_NOK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1022,7 +1122,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); changes.push_back(CreateManifestEntry("file6", FileKind::Delete())); - ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes)); + ASSERT_NOK(commit_impl->NoConflictsOrFail( + "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } } @@ -1052,7 +1154,7 @@ TEST_F(FileStoreCommitImplTest, TestTryOverwrite) { std::vector changes; changes.push_back(CreateManifestEntry("new_file_1", FileKind::Add())); std::vector> partitions = {{{"f1", "10"}}, {{"f1", "20"}}}; - ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, + ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, /*index_entries=*/{}, /*commit_identifier=*/1, std::nullopt)); } @@ -1071,7 +1173,7 @@ TEST_F(FileStoreCommitImplTest, TestTryOverwriteFromNothing) { std::vector changes; changes.push_back(CreateManifestEntry("new_file_1", FileKind::Add())); std::vector> partitions = {{{"f1", "10"}}, {{"f1", "20"}}}; - ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, + ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, /*index_entries=*/{}, /*commit_identifier=*/0, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto snapshot1, commit_impl->snapshot_manager_->LatestSnapshot()); ASSERT_OK_AND_ASSIGN(auto entries1, commit_impl->GetAllFiles(snapshot1.value(), {})); @@ -1080,7 +1182,7 @@ TEST_F(FileStoreCommitImplTest, TestTryOverwriteFromNothing) { ASSERT_EQ(FileKind::Add(), entries1[0].Kind()); std::vector changes2; changes2.push_back(CreateManifestEntry("new_file_2", FileKind::Add())); - ASSERT_OK(commit_impl->TryOverwrite(partitions, changes2, + ASSERT_OK(commit_impl->TryOverwrite(partitions, changes2, /*index_entries=*/{}, /*commit_identifier=*/1, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto snapshot2, commit_impl->snapshot_manager_->LatestSnapshot()); ASSERT_OK_AND_ASSIGN(auto entries2, commit_impl->GetAllFiles(snapshot2.value(), {})); @@ -1104,7 +1206,7 @@ TEST_F(FileStoreCommitImplTest, TestTryOverwriteThenCommit) { std::vector changes; changes.push_back(CreateManifestEntry("new_file_1", FileKind::Add())); std::vector> partitions = {{{"f1", "10"}}, {{"f1", "20"}}}; - ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, + ASSERT_OK(commit_impl->TryOverwrite(partitions, changes, /*index_entries=*/{}, /*commit_identifier=*/0, std::nullopt)); std::vector> msgs = GetCommitMessages(paimon::test::GetDataDir() + @@ -1130,7 +1232,7 @@ TEST_F(FileStoreCommitImplTest, TestTryOverwriteThenCommit) { std::vector changes2; changes2.push_back(CreateManifestEntry("new_file_2", FileKind::Add())); - ASSERT_OK(commit_impl->TryOverwrite(partitions, changes2, + ASSERT_OK(commit_impl->TryOverwrite(partitions, changes2, /*index_entries=*/{}, /*commit_identifier=*/2, std::nullopt)); ASSERT_OK_AND_ASSIGN(auto snapshot2, commit_impl->snapshot_manager_->LatestSnapshot()); ASSERT_OK_AND_ASSIGN(auto entries2, commit_impl->GetAllFiles(snapshot2.value(), {})); @@ -1484,6 +1586,268 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteNonSpecifyPartition) { ASSERT_TRUE(IsStringInSet(file_names, "data-7b3f4cc7-116b-4d2f-9c62-5dadc1f11bcb-0.orc")); } +TEST_F(FileStoreCommitImplTest, TestCommitWithIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> new_index_files; + new_index_files.push_back(CreateIndexFileMeta("bitmap-index-commit-1")); + DataIncrement data_increment({}, {}, {}, std::move(new_index_files), {}); + std::shared_ptr msg = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, + CompactIncrement({}, {}, {})); + + ASSERT_OK(commit_impl->Commit({msg}, 1)); + ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_EQ(Snapshot::CommitKind::Append(), snapshot.GetCommitKind()); + ASSERT_TRUE(snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(1u, index_entries.size()); + ASSERT_EQ("bitmap-index-commit-1", index_entries[0].index_file->FileName()); +} + +TEST_F(FileStoreCommitImplTest, TestCommitWithGlobalIndexFilesChecksConflicts) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .AddOption(Options::ROW_TRACKING_ENABLED, "true") + .AddOption(Options::DATA_EVOLUTION_ENABLED, "true") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + DataIncrement data_increment_1({CreateAppendDataFileMeta("data-with-row-id", 10)}, {}, {}); + std::shared_ptr msg_1 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, + CompactIncrement({}, {}, {})); + ASSERT_OK(commit_impl->Commit({msg_1}, 1)); + + std::vector> global_index_files; + global_index_files.push_back(CreateGlobalIndexFileMeta("global-index-out-of-range", + /*row_range_start=*/0, + /*row_range_end=*/10)); + DataIncrement data_increment_2({}, {}, {}, std::move(global_index_files), {}); + std::shared_ptr msg_2 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, + CompactIncrement({}, {}, {})); + + ASSERT_NOK_WITH_MSG(commit_impl->Commit({msg_2}, 2), + "Global index row ID existence conflict"); +} + +TEST_F(FileStoreCommitImplTest, TestCommitWithCompactIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .IgnoreEmptyCommit(true) + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> compact_index_files; + compact_index_files.push_back(CreateIndexFileMeta("bitmap-index-commit-compact-1")); + DataIncrement data_increment({}, {}, {}); + CompactIncrement compact_increment({}, {}, {}, std::move(compact_index_files), {}); + std::shared_ptr msg = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, compact_increment); + + ASSERT_OK(commit_impl->Commit({msg}, 1)); + ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_EQ(Snapshot::CommitKind::Compact(), snapshot.GetCommitKind()); + ASSERT_TRUE(snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(1u, index_entries.size()); + ASSERT_EQ("bitmap-index-commit-compact-1", index_entries[0].index_file->FileName()); +} + +TEST_F(FileStoreCommitImplTest, TestCommitWithDeletedIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> new_index_files; + new_index_files.push_back(CreateIndexFileMeta("bitmap-index-delete-1")); + DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files), {}); + std::shared_ptr msg_1 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, + CompactIncrement({}, {}, {})); + ASSERT_OK(commit_impl->Commit({msg_1}, 1)); + + std::vector> deleted_index_files; + deleted_index_files.push_back(CreateIndexFileMeta("bitmap-index-delete-1")); + DataIncrement data_increment_2({}, {}, {}, {}, std::move(deleted_index_files)); + std::shared_ptr msg_2 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, + CompactIncrement({}, {}, {})); + ASSERT_OK(commit_impl->Commit({msg_2}, 2)); + + ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_TRUE(snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_TRUE(index_entries.empty()); +} + +TEST_F(FileStoreCommitImplTest, TestCommitWithCompactDeletedIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .IgnoreEmptyCommit(true) + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> new_index_files; + new_index_files.push_back(CreateIndexFileMeta("bitmap-index-compact-delete-1")); + DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files), {}); + std::shared_ptr msg_1 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, + CompactIncrement({}, {}, {})); + ASSERT_OK(commit_impl->Commit({msg_1}, 1)); + + std::vector> deleted_index_files; + deleted_index_files.push_back(CreateIndexFileMeta("bitmap-index-compact-delete-1")); + DataIncrement data_increment_2({}, {}, {}); + CompactIncrement compact_increment({}, {}, {}, {}, std::move(deleted_index_files)); + std::shared_ptr msg_2 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, compact_increment); + ASSERT_OK(commit_impl->Commit({msg_2}, 2)); + + ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_EQ(Snapshot::CommitKind::Compact(), snapshot.GetCommitKind()); + ASSERT_TRUE(snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_TRUE(index_entries.empty()); +} + +TEST_F(FileStoreCommitImplTest, TestOverwriteWithIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> new_index_files_1; + new_index_files_1.push_back(CreateIndexFileMeta("bitmap-index-1")); + DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files_1), {}); + std::shared_ptr msg_1 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, + CompactIncrement({}, {}, {})); + + ASSERT_OK(commit_impl->Overwrite({}, {msg_1}, 1)); + ASSERT_OK_AND_ASSIGN(auto snapshot1, commit_impl->snapshot_manager_->LatestSnapshot()); + ASSERT_TRUE(snapshot1.value().IndexManifest()); + std::vector index_entries_1; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot1.value().IndexManifest().value(), + /*filter=*/nullptr, + &index_entries_1)); + ASSERT_EQ(1u, index_entries_1.size()); + ASSERT_EQ("bitmap-index-1", index_entries_1[0].index_file->FileName()); + + std::vector> new_index_files_2; + new_index_files_2.push_back(CreateIndexFileMeta("bitmap-index-2")); + DataIncrement data_increment_2({}, {}, {}, std::move(new_index_files_2), {}); + std::shared_ptr msg_2 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, + CompactIncrement({}, {}, {})); + + ASSERT_OK(commit_impl->Overwrite({}, {msg_2}, 2)); + ASSERT_OK_AND_ASSIGN(auto snapshot2, commit_impl->snapshot_manager_->LatestSnapshot()); + ASSERT_TRUE(snapshot2.value().IndexManifest()); + std::vector index_entries_2; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot2.value().IndexManifest().value(), + /*filter=*/nullptr, + &index_entries_2)); + ASSERT_EQ(1u, index_entries_2.size()); + ASSERT_EQ("bitmap-index-2", index_entries_2[0].index_file->FileName()); +} + +TEST_F(FileStoreCommitImplTest, TestOverwriteWithCompactIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> compact_index_files; + compact_index_files.push_back(CreateIndexFileMeta("bitmap-index-compact-1")); + DataIncrement data_increment({}, {}, {}); + CompactIncrement compact_increment({}, {}, {}, std::move(compact_index_files), {}); + std::shared_ptr msg = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, compact_increment); + + ASSERT_OK(commit_impl->Overwrite({}, {msg}, 1)); + + ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_EQ(Snapshot::CommitKind::Overwrite(), overwrite_snapshot.GetCommitKind()); + ASSERT_FALSE(overwrite_snapshot.IndexManifest()); + + ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_EQ(Snapshot::CommitKind::Compact(), compact_snapshot.GetCommitKind()); + ASSERT_TRUE(compact_snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(compact_snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(1u, index_entries.size()); + ASSERT_EQ("bitmap-index-compact-1", index_entries[0].index_file->FileName()); +} + TEST_F(FileStoreCommitImplTest, TestFilterAndOverwrite) { CommitContextBuilder context_builder(table_path_, "commit_user_1"); ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, @@ -1540,6 +1904,95 @@ TEST_F(FileStoreCommitImplTest, TestFilterAndOverwrite) { ASSERT_TRUE(IsStringInSet(file_names, "data-7b3f4cc7-116b-4d2f-9c62-5dadc1f11bcb-0.orc")); } +TEST_F(FileStoreCommitImplTest, TestFilterAndOverwriteWithIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> new_index_files_1; + new_index_files_1.push_back(CreateIndexFileMeta("bitmap-index-filter-1")); + DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files_1), {}); + std::shared_ptr msg_1 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, + CompactIncrement({}, {}, {})); + + ASSERT_OK_AND_ASSIGN(int32_t actual_commit, + commit_impl->FilterAndOverwrite({}, {msg_1}, 1, 10)); + ASSERT_EQ(1, actual_commit); + ASSERT_OK_AND_ASSIGN(actual_commit, commit_impl->FilterAndOverwrite({}, {msg_1}, 1, 5)); + ASSERT_EQ(0, actual_commit); + + std::vector> new_index_files_2; + new_index_files_2.push_back(CreateIndexFileMeta("bitmap-index-filter-2")); + DataIncrement data_increment_2({}, {}, {}, std::move(new_index_files_2), {}); + std::shared_ptr msg_2 = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, + CompactIncrement({}, {}, {})); + + ASSERT_OK_AND_ASSIGN(actual_commit, commit_impl->FilterAndOverwrite({}, {msg_2}, 2, 20)); + ASSERT_EQ(1, actual_commit); + ASSERT_OK_AND_ASSIGN(auto snapshot, commit_impl->snapshot_manager_->LatestSnapshot()); + ASSERT_TRUE(snapshot.value().IndexManifest()); + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.value().IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(1u, index_entries.size()); + ASSERT_EQ("bitmap-index-filter-2", index_entries[0].index_file->FileName()); +} + +TEST_F(FileStoreCommitImplTest, TestFilterAndOverwriteWithCompactIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + const BinaryRow partition = CreateIntRow(10); + + std::vector> compact_index_files; + compact_index_files.push_back(CreateIndexFileMeta("bitmap-index-filter-compact-1")); + DataIncrement data_increment({}, {}, {}); + CompactIncrement compact_increment({}, {}, {}, std::move(compact_index_files), {}); + std::shared_ptr msg = std::make_shared( + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, compact_increment); + + ASSERT_OK_AND_ASSIGN(int32_t actual_commit, + commit_impl->FilterAndOverwrite({}, {msg}, 1, 10)); + ASSERT_EQ(1, actual_commit); + ASSERT_OK_AND_ASSIGN(actual_commit, commit_impl->FilterAndOverwrite({}, {msg}, 1, 5)); + ASSERT_EQ(0, actual_commit); + + ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_EQ(Snapshot::CommitKind::Overwrite(), overwrite_snapshot.GetCommitKind()); + ASSERT_TRUE(overwrite_snapshot.Watermark()); + ASSERT_EQ(10, overwrite_snapshot.Watermark().value()); + ASSERT_FALSE(overwrite_snapshot.IndexManifest()); + + ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_EQ(Snapshot::CommitKind::Compact(), compact_snapshot.GetCommitKind()); + ASSERT_TRUE(compact_snapshot.Watermark()); + ASSERT_EQ(10, compact_snapshot.Watermark().value()); + ASSERT_TRUE(compact_snapshot.IndexManifest()); + + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(compact_snapshot.IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(1u, index_entries.size()); + ASSERT_EQ("bitmap-index-filter-compact-1", index_entries[0].index_file->FileName()); +} + TEST_F(FileStoreCommitImplTest, TestOverwriteWithSpecifyPartition) { CommitContextBuilder context_builder(table_path_, "commit_user_1"); ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, @@ -1576,6 +2029,68 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteWithSpecifyPartition) { ASSERT_TRUE(IsStringInSet(file_names, "data-8dc7f04c-3c98-48b2-9d56-834d746c4a40-0.orc")); } +TEST_F(FileStoreCommitImplTest, TestOverwriteWithSpecifyPartitionIndexFiles) { + CommitContextBuilder context_builder(table_path_, "commit_user_1"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + auto commit_impl = std::dynamic_pointer_cast( + std::shared_ptr(std::move(commit))); + + const BinaryRow partition_10 = CreateIntRow(10); + const BinaryRow partition_20 = CreateIntRow(20); + + std::vector> index_files_partition_10_v1; + index_files_partition_10_v1.push_back(CreateIndexFileMeta("bitmap-index-partition-10-v1")); + DataIncrement data_increment_partition_10_v1( + {}, {}, {}, std::move(index_files_partition_10_v1), {}); + std::shared_ptr msg_partition_10_v1 = std::make_shared( + partition_10, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_10_v1, + CompactIncrement({}, {}, {})); + + std::vector> index_files_partition_20_v1; + index_files_partition_20_v1.push_back(CreateIndexFileMeta("bitmap-index-partition-20-v1")); + DataIncrement data_increment_partition_20_v1( + {}, {}, {}, std::move(index_files_partition_20_v1), {}); + std::shared_ptr msg_partition_20_v1 = std::make_shared( + partition_20, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_20_v1, + CompactIncrement({}, {}, {})); + + ASSERT_OK(commit_impl->Overwrite({}, {msg_partition_10_v1, msg_partition_20_v1}, 1)); + + std::vector> index_files_partition_10_v2; + index_files_partition_10_v2.push_back(CreateIndexFileMeta("bitmap-index-partition-10-v2")); + DataIncrement data_increment_partition_10_v2( + {}, {}, {}, std::move(index_files_partition_10_v2), {}); + std::shared_ptr msg_partition_10_v2 = std::make_shared( + partition_10, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_10_v2, + CompactIncrement({}, {}, {})); + + std::map partition_spec; + partition_spec["f1"] = "10"; + ASSERT_OK(commit_impl->Overwrite({partition_spec}, {msg_partition_10_v2}, 2)); + + ASSERT_OK_AND_ASSIGN(auto snapshot, commit_impl->snapshot_manager_->LatestSnapshot()); + ASSERT_TRUE(snapshot.value().IndexManifest()); + std::vector index_entries; + ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot.value().IndexManifest().value(), + /*filter=*/nullptr, &index_entries)); + ASSERT_EQ(2u, index_entries.size()); + + std::set index_file_names; + for (const auto& entry : index_entries) { + index_file_names.insert(entry.index_file->FileName()); + } + + ASSERT_TRUE(IsStringInSet(index_file_names, "bitmap-index-partition-10-v2")); + ASSERT_TRUE(IsStringInSet(index_file_names, "bitmap-index-partition-20-v1")); + ASSERT_FALSE(IsStringInSet(index_file_names, "bitmap-index-partition-10-v1")); +} + TEST_F(FileStoreCommitImplTest, TestOverwriteWithSameFile) { CommitContextBuilder context_builder(table_path_, "commit_user_1"); ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, @@ -1689,9 +2204,7 @@ TEST_F(FileStoreCommitImplTest, TestObjectStoreAllowedWithRESTCatalogCommit) { ASSERT_FALSE(json.empty()); } -// Verify that FileStoreCommit::Create succeeds for PK tables with postpone bucket mode (bucket=-2) -// without requiring the enable-pk-commit-in-inte-test workaround flag. -TEST_F(FileStoreCommitImplTest, TestPostponeBucketPKTableCommitAllowed) { +TEST_F(FileStoreCommitImplTest, TestFixedBucketPKTableCommitAllowed) { auto pk_dir = UniqueTestDirectory::Create(); ASSERT_TRUE(pk_dir); std::string pk_root = pk_dir->Str(); @@ -1702,14 +2215,13 @@ TEST_F(FileStoreCommitImplTest, TestPostponeBucketPKTableCommitAllowed) { {arrow::field("pk", arrow::int32()), arrow::field("val", arrow::utf8())}); ::ArrowSchema arrow_schema; ASSERT_TRUE(arrow::ExportSchema(pk_schema, &arrow_schema).ok()); - std::map table_options = {{Options::BUCKET, "-2"}}; + std::map table_options = {{Options::BUCKET, "4"}}; ASSERT_OK(catalog->CreateTable(Identifier("db", "pk_tbl"), &arrow_schema, /*partition_keys=*/{}, /*primary_keys=*/{"pk"}, table_options, /*ignore_if_exists=*/false)); std::string pk_table_path = PathUtil::JoinPath(pk_root, "db.db/pk_tbl"); - // Create FileStoreCommit WITHOUT the workaround flag — should succeed for postpone bucket CommitContextBuilder builder(pk_table_path, "test_user"); builder.AddOption(Options::FILE_SYSTEM, "local").UseRESTCatalogCommit(true); ASSERT_OK_AND_ASSIGN(auto commit_context, builder.Finish()); @@ -1717,34 +2229,4 @@ TEST_F(FileStoreCommitImplTest, TestPostponeBucketPKTableCommitAllowed) { ASSERT_TRUE(committer != nullptr); } -// Verify that FileStoreCommit::Create still rejects PK tables with fixed bucket (bucket > 0) -// when the workaround flag is not set. -TEST_F(FileStoreCommitImplTest, TestFixedBucketPKTableCommitRejected) { - auto pk_dir = UniqueTestDirectory::Create(); - ASSERT_TRUE(pk_dir); - std::string pk_root = pk_dir->Str(); - ASSERT_OK_AND_ASSIGN(auto catalog, Catalog::Create(pk_root, {})); - ASSERT_OK(catalog->CreateDatabase("db", {}, false)); - - arrow::Schema pk_schema( - {arrow::field("pk", arrow::int32()), arrow::field("val", arrow::utf8())}); - ::ArrowSchema arrow_schema; - ASSERT_TRUE(arrow::ExportSchema(pk_schema, &arrow_schema).ok()); - std::map table_options = {{Options::BUCKET, "4"}}; - ASSERT_OK(catalog->CreateTable(Identifier("db", "pk_tbl_fixed"), &arrow_schema, - /*partition_keys=*/{}, /*primary_keys=*/{"pk"}, table_options, - /*ignore_if_exists=*/false)); - - std::string pk_table_path = PathUtil::JoinPath(pk_root, "db.db/pk_tbl_fixed"); - - CommitContextBuilder builder(pk_table_path, "test_user"); - builder.AddOption(Options::FILE_SYSTEM, "local").UseRESTCatalogCommit(true); - ASSERT_OK_AND_ASSIGN(auto commit_context, builder.Finish()); - auto result = FileStoreCommit::Create(std::move(commit_context)); - ASSERT_FALSE(result.ok()); - ASSERT_TRUE(result.status().IsNotImplemented()); - ASSERT_TRUE(result.status().ToString().find("not support pk table commit") != - std::string::npos); -} - } // namespace paimon::test diff --git a/src/paimon/core/operation/file_store_commit_test.cpp b/src/paimon/core/operation/file_store_commit_test.cpp index 4a6fcd45e..6a39670ad 100644 --- a/src/paimon/core/operation/file_store_commit_test.cpp +++ b/src/paimon/core/operation/file_store_commit_test.cpp @@ -26,10 +26,21 @@ #include "paimon/catalog/catalog.h" #include "paimon/catalog/identifier.h" #include "paimon/commit_context.h" +#include "paimon/common/utils/linked_hash_map.h" #include "paimon/common/utils/path_util.h" +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" +#include "paimon/core/deletionvectors/deletion_vector.h" +#include "paimon/core/index/deletion_vector_meta.h" +#include "paimon/core/index/index_file_meta.h" #include "paimon/core/operation/file_store_commit_impl.h" +#include "paimon/core/table/sink/commit_message_impl.h" +#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/core/io/data_increment.h" +#include "paimon/core/io/compact_increment.h" +#include "paimon/fs/local/local_file_system.h" #include "paimon/defs.h" #include "paimon/result.h" +#include "paimon/testing/utils/binary_row_generator.h" #include "paimon/testing/utils/testharness.h" namespace paimon::test { @@ -72,4 +83,63 @@ TEST(FileStoreCommitTest, TestCreate) { ASSERT_TRUE(commit_impl); } +TEST(FileStoreCommitTest, TestAppendDvIndexShouldUseOverwriteCommitKind) { + auto string_field = arrow::field("f0", arrow::utf8()); + auto int_field = arrow::field("f1", arrow::int32()); + auto int_field1 = arrow::field("f2", arrow::int32()); + auto double_field = arrow::field("f3", arrow::float64()); + auto schema = + arrow::schema(arrow::FieldVector({string_field, int_field, int_field1, double_field})); + + ::ArrowSchema arrow_schema; + ASSERT_TRUE(arrow::ExportSchema(*schema, &arrow_schema).ok()); + auto dir = UniqueTestDirectory::Create(); + ASSERT_TRUE(dir); + + std::map options = {{Options::FILE_FORMAT, "orc"}, + {Options::TARGET_FILE_SIZE, "1024"}, + {Options::FILE_SYSTEM, "local"}, + {Options::BUCKET, "2"}, + {Options::BUCKET_KEY, "f2"}}; + + ASSERT_OK_AND_ASSIGN(auto catalog, Catalog::Create(dir->Str(), options)); + ASSERT_OK(catalog->CreateDatabase("foo", options, /*ignore_if_exists=*/false)); + ASSERT_OK(catalog->CreateTable(Identifier("foo", "bar"), &arrow_schema, + /*partition_keys=*/{"f1"}, + /*primary_keys=*/{}, options, + /*ignore_if_exists=*/false)); + std::string table_path = PathUtil::JoinPath(dir->Str(), "foo.db/bar"); + + CommitContextBuilder context_builder(table_path, "commit_user"); + ASSERT_OK_AND_ASSIGN(std::unique_ptr commit_context, + context_builder.AddOption(Options::MANIFEST_FORMAT, "orc") + .AddOption(Options::MANIFEST_TARGET_FILE_SIZE, "8mb") + .AddOption(Options::FILE_SYSTEM, "local") + .Finish()); + ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); + + LinkedHashMap dv_ranges; + dv_ranges.insert_or_assign( + "data-file-1", + DeletionVectorMeta( + /*data_file_name=*/"data-file-1", /*offset=*/0, /*length=*/10, + /*cardinality=*/1)); + std::vector> new_index_files; + new_index_files.push_back(std::make_shared( + DeletionVectorsIndexFile::DELETION_VECTORS_INDEX, "dv-index-1", 100, 1, + /*dv_ranges=*/dv_ranges, /*external_path=*/std::nullopt)); + DataIncrement data_increment({}, {}, {}, std::move(new_index_files), {}); + std::shared_ptr msg = std::make_shared( + BinaryRowGenerator::GenerateRow({10}, GetDefaultPool().get()), /*bucket=*/0, + /*total_bucket=*/2, data_increment, CompactIncrement({}, {}, {})); + + ASSERT_OK(commit->Commit({msg}, /*commit_identifier=*/1)); + + auto fs = std::make_shared(); + SnapshotManager snapshot_manager(fs, table_path); + ASSERT_OK_AND_ASSIGN(std::optional snapshot, snapshot_manager.LatestSnapshot()); + ASSERT_TRUE(snapshot); + ASSERT_EQ(Snapshot::CommitKind::Overwrite(), snapshot.value().GetCommitKind()); +} + } // namespace paimon::test From 12d886914e45ddf87ca85a0acc76b1940c02ad9e Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 2 Jul 2026 11:49:26 +0800 Subject: [PATCH 2/2] fix --- .../operation/commit/conflict_detection.cpp | 65 +++---- .../operation/commit/conflict_detection.h | 92 +++++----- .../commit/conflict_detection_test.cpp | 168 ++++++++---------- .../commit/manifest_entry_changes.cpp | 4 +- .../operation/commit/manifest_entry_changes.h | 6 +- .../commit/manifest_entry_changes_test.cpp | 21 +-- .../commit/row_id_column_conflict_checker.cpp | 13 +- .../core/operation/file_store_commit_impl.cpp | 106 +++++------ .../core/operation/file_store_commit_impl.h | 5 +- .../operation/file_store_commit_impl_test.cpp | 158 ++++++++-------- .../core/operation/file_store_commit_test.cpp | 17 +- 11 files changed, 302 insertions(+), 353 deletions(-) diff --git a/src/paimon/core/operation/commit/conflict_detection.cpp b/src/paimon/core/operation/commit/conflict_detection.cpp index f3e507f39..35047bc6c 100644 --- a/src/paimon/core/operation/commit/conflict_detection.cpp +++ b/src/paimon/core/operation/commit/conflict_detection.cpp @@ -22,27 +22,27 @@ #include #include #include -#include #include +#include #include #include "fmt/format.h" -#include "paimon/common/data/blob_utils.h" #include "paimon/common/data/binary_row.h" +#include "paimon/common/data/blob_utils.h" #include "paimon/common/types/data_field.h" #include "paimon/common/utils/fields_comparator.h" +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" #include "paimon/core/manifest/file_entry.h" #include "paimon/core/manifest/file_kind.h" +#include "paimon/core/manifest/index_manifest_file.h" +#include "paimon/core/manifest/manifest_entry.h" #include "paimon/core/manifest/manifest_file.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/manifest/manifest_list.h" -#include "paimon/core/manifest/index_manifest_file.h" -#include "paimon/core/manifest/manifest_entry.h" #include "paimon/core/operation/commit/manifest_entry_changes.h" #include "paimon/core/operation/commit/row_id_column_conflict_checker.h" -#include "paimon/core/utils/snapshot_manager.h" -#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" #include "paimon/core/schema/table_schema.h" +#include "paimon/core/utils/snapshot_manager.h" #include "paimon/utils/range.h" namespace paimon { @@ -79,13 +79,13 @@ bool ConflictDetection::HasRowIdCheckFromSnapshot() const { return row_id_check_from_snapshot_.has_value(); } -Status ConflictDetection::CheckConflicts(const Snapshot& latest_snapshot, - const std::vector& base_entries, - const std::vector& delta_entries, - const std::vector& delta_index_entries, - const std::optional>& - row_id_column_conflict_checker, - const Snapshot::CommitKind& commit_kind) const { +Status ConflictDetection::CheckConflicts( + const Snapshot& latest_snapshot, const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker, + const Snapshot::CommitKind& commit_kind) const { std::vector all_entries = base_entries; all_entries.insert(all_entries.end(), delta_entries.begin(), delta_entries.end()); PAIMON_RETURN_NOT_OK(CheckBucketKeepSame(all_entries, commit_kind)); @@ -106,9 +106,8 @@ Status ConflictDetection::CheckConflicts(const Snapshot& latest_snapshot, } PAIMON_RETURN_NOT_OK(CheckRowIdRangeConflicts(commit_kind, merged_entries)); PAIMON_RETURN_NOT_OK(CheckGlobalIndexRowIdExistence(base_entries, delta_index_entries)); - PAIMON_RETURN_NOT_OK(CheckForRowIdFromSnapshot(latest_snapshot, delta_entries, - delta_index_entries, - row_id_column_conflict_checker)); + PAIMON_RETURN_NOT_OK(CheckForRowIdFromSnapshot( + latest_snapshot, delta_entries, delta_index_entries, row_id_column_conflict_checker)); return Status::OK(); } @@ -196,9 +195,9 @@ Status ConflictDetection::CheckSameBucketByTotalBuckets( Status ConflictDetection::BucketNumMismatch(const BinaryRow& partition, int32_t num_buckets, int32_t previous_num_buckets) const { return Status::Invalid(fmt::format( - "Total buckets of partition {} changed from {} to {} without overwrite. Give up " - "committing.", - partition.ToString(), previous_num_buckets, num_buckets)); + "Total buckets of partition {} changed from {} to {} without overwrite. Give up " + "committing.", + partition.ToString(), previous_num_buckets, num_buckets)); } void ConflictDetection::MarkBucketCheckedPartitions( @@ -271,10 +270,9 @@ Status ConflictDetection::CheckKeyRange(const std::vector& merged return Status::OK(); } -Status ConflictDetection::CheckRowIdExistence( - const std::vector& base_entries, - const std::vector& delta_entries, - const std::optional& next_row_id) const { +Status ConflictDetection::CheckRowIdExistence(const std::vector& base_entries, + const std::vector& delta_entries, + const std::optional& next_row_id) const { if (!options_.DataEvolutionEnabled() || !next_row_id) { return Status::OK(); } @@ -292,7 +290,7 @@ Status ConflictDetection::CheckRowIdExistence( exact_ranges.emplace(range_from, range_to); } std::vector merged_ranges = Range::SortAndMergeOverlap(existing_ranges, - /*adjacent=*/false); + /*adjacent=*/false); for (const ManifestEntry& delta_entry : delta_entries) { if (!(delta_entry.Kind() == FileKind::Add()) || !delta_entry.File()->first_row_id) { @@ -336,8 +334,7 @@ Status ConflictDetection::CheckRowIdRangeConflicts( if (!options_.DataEvolutionEnabled()) { return Status::OK(); } - if (!row_id_check_from_snapshot_ && - !(commit_kind == Snapshot::CommitKind::Compact())) { + if (!row_id_check_from_snapshot_ && !(commit_kind == Snapshot::CommitKind::Compact())) { return Status::OK(); } @@ -356,15 +353,13 @@ Status ConflictDetection::CheckRowIdRangeConflicts( } std::sort(entries_with_ranges.begin(), entries_with_ranges.end(), - [](const auto& lhs, const auto& rhs) { - return lhs.first.from < rhs.first.from; - }); + [](const auto& lhs, const auto& rhs) { return lhs.first.from < rhs.first.from; }); size_t group_start = 0; int64_t group_max_to = entries_with_ranges[0].first.to; for (size_t i = 1; i <= entries_with_ranges.size(); ++i) { - bool overlap_group_end = (i == entries_with_ranges.size()) || - (entries_with_ranges[i].first.from > group_max_to); + bool overlap_group_end = + (i == entries_with_ranges.size()) || (entries_with_ranges[i].first.from > group_max_to); if (!overlap_group_end) { group_max_to = std::max(group_max_to, entries_with_ranges[i].first.to); continue; @@ -397,8 +392,7 @@ Status ConflictDetection::CheckRowIdRangeConflicts( } Status ConflictDetection::CheckForRowIdFromSnapshot( - const Snapshot& latest_snapshot, - const std::vector& delta_entries, + const Snapshot& latest_snapshot, const std::vector& delta_entries, const std::vector& delta_index_entries, const std::optional>& row_id_column_conflict_checker) const { @@ -447,9 +441,8 @@ Status ConflictDetection::CheckForRowIdFromSnapshot( PAIMON_RETURN_NOT_OK(manifest_list_->ReadDeltaManifests(snapshot, &delta_manifests)); for (const ManifestFileMeta& manifest_meta : delta_manifests) { std::vector history_entries; - PAIMON_RETURN_NOT_OK( - manifest_file_->Read(manifest_meta.FileName(), /*filter=*/nullptr, - &history_entries)); + PAIMON_RETURN_NOT_OK(manifest_file_->Read(manifest_meta.FileName(), /*filter=*/nullptr, + &history_entries)); for (const ManifestEntry& history_entry : history_entries) { if (!(history_entry.Kind() == FileKind::Add()) || !history_entry.File()->first_row_id || diff --git a/src/paimon/core/operation/commit/conflict_detection.h b/src/paimon/core/operation/commit/conflict_detection.h index 493fdc038..b1b5bfd29 100644 --- a/src/paimon/core/operation/commit/conflict_detection.h +++ b/src/paimon/core/operation/commit/conflict_detection.h @@ -16,9 +16,9 @@ #pragma once -#include #include #include +#include #include #include #include @@ -41,78 +41,74 @@ class TableSchema; class ConflictDetection { public: - ConflictDetection(std::shared_ptr table_schema, const CoreOptions& options, - std::shared_ptr snapshot_manager, - std::shared_ptr manifest_list, - std::shared_ptr manifest_file); - - Status CheckConflicts( - const Snapshot& latest_snapshot, - const std::vector& base_entries, - const std::vector& delta_entries, - const std::vector& delta_index_entries, - const std::optional>& - row_id_column_conflict_checker, - const Snapshot::CommitKind& commit_kind) const; + ConflictDetection(std::shared_ptr table_schema, const CoreOptions& options, + std::shared_ptr snapshot_manager, + std::shared_ptr manifest_list, + std::shared_ptr manifest_file); + + Status CheckConflicts(const Snapshot& latest_snapshot, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const std::optional>& + row_id_column_conflict_checker, + const Snapshot::CommitKind& commit_kind) const; void SetRowIdCheckFromSnapshot(const std::optional& row_id_check_from_snapshot); bool HasRowIdCheckFromSnapshot() const; - bool ShouldBeOverwriteCommit( - const std::vector& append_table_files, - const std::vector& append_index_files) const; + bool ShouldBeOverwriteCommit(const std::vector& append_table_files, + const std::vector& append_index_files) const; - Status CollectUncheckedBucketPartitions( - const std::vector& delta_entries, - std::unordered_map* total_buckets) const; + Status CollectUncheckedBucketPartitions( + const std::vector& delta_entries, + std::unordered_map* total_buckets) const; - Status CheckSameBucketByTotalBuckets( - const std::unordered_map& expected_total_buckets, - const std::unordered_map& previous_total_buckets) const; + Status CheckSameBucketByTotalBuckets( + const std::unordered_map& expected_total_buckets, + const std::unordered_map& previous_total_buckets) const; private: - Status CheckBucketKeepSame(const std::vector& all_entries, - const Snapshot::CommitKind& commit_kind) const; + Status CheckBucketKeepSame(const std::vector& all_entries, + const Snapshot::CommitKind& commit_kind) const; - Status BucketNumMismatch(const BinaryRow& partition, int32_t num_buckets, - int32_t previous_num_buckets) const; + Status BucketNumMismatch(const BinaryRow& partition, int32_t num_buckets, + int32_t previous_num_buckets) const; - void MarkBucketCheckedPartitions( - const std::unordered_map& total_buckets) const; + void MarkBucketCheckedPartitions( + const std::unordered_map& total_buckets) const; Status CheckFileDeletionConflicts(const std::vector& merged_entries) const; Status CheckKeyRange(const std::vector& merged_entries) const; - Status CheckRowIdExistence(const std::vector& base_entries, - const std::vector& delta_entries, - const std::optional& next_row_id) const; + Status CheckRowIdExistence(const std::vector& base_entries, + const std::vector& delta_entries, + const std::optional& next_row_id) const; - Status CheckRowIdRangeConflicts( - const Snapshot::CommitKind& commit_kind, - const std::vector& merged_entries) const; + Status CheckRowIdRangeConflicts(const Snapshot::CommitKind& commit_kind, + const std::vector& merged_entries) const; - Status CheckForRowIdFromSnapshot( - const Snapshot& latest_snapshot, - const std::vector& delta_entries, - const std::vector& delta_index_entries, + Status CheckForRowIdFromSnapshot( + const Snapshot& latest_snapshot, const std::vector& delta_entries, + const std::vector& delta_index_entries, const std::optional>& - row_id_column_conflict_checker) const; + row_id_column_conflict_checker) const; - Status CheckGlobalIndexRowIdExistence( - const std::vector& base_entries, - const std::vector& delta_index_entries) const; + Status CheckGlobalIndexRowIdExistence( + const std::vector& base_entries, + const std::vector& delta_index_entries) const; private: - static constexpr size_t kSameBucketCheckCacheMaxSize = 1000; + static constexpr size_t kSameBucketCheckCacheMaxSize = 1000; std::shared_ptr table_schema_; CoreOptions options_; - std::optional row_id_check_from_snapshot_; - std::shared_ptr snapshot_manager_; - std::shared_ptr manifest_list_; - std::shared_ptr manifest_file_; + std::optional row_id_check_from_snapshot_; + std::shared_ptr snapshot_manager_; + std::shared_ptr manifest_list_; + std::shared_ptr manifest_file_; mutable LinkedHashMap same_bucket_checked_partitions_; }; diff --git a/src/paimon/core/operation/commit/conflict_detection_test.cpp b/src/paimon/core/operation/commit/conflict_detection_test.cpp index 5a0c27917..87c5ee403 100644 --- a/src/paimon/core/operation/commit/conflict_detection_test.cpp +++ b/src/paimon/core/operation/commit/conflict_detection_test.cpp @@ -25,9 +25,9 @@ #include "paimon/common/data/binary_row.h" #include "paimon/common/data/binary_row_writer.h" #include "paimon/common/utils/path_util.h" -#include "paimon/core/io/data_file_meta.h" #include "paimon/core/index/global_index_meta.h" #include "paimon/core/index/index_file_meta.h" +#include "paimon/core/io/data_file_meta.h" #include "paimon/core/manifest/file_kind.h" #include "paimon/core/manifest/index_manifest_entry.h" #include "paimon/core/manifest/manifest_entry.h" @@ -54,8 +54,7 @@ Snapshot MakeSnapshot(const Snapshot::CommitKind& commit_kind) { /*changelog_manifest_list_size=*/std::nullopt, /*index_manifest=*/std::nullopt, /*commit_user=*/"test-user", - /*commit_identifier=*/1, - commit_kind, + /*commit_identifier=*/1, commit_kind, /*time_millis=*/0, /*log_offsets=*/std::nullopt, /*total_record_count=*/std::nullopt, @@ -67,27 +66,23 @@ Snapshot MakeSnapshot(const Snapshot::CommitKind& commit_kind) { /*next_row_id=*/std::nullopt); } -Status CheckConflicts( - const ConflictDetection& detection, - const std::vector& base_entries, - const std::vector& delta_entries, - const Snapshot::CommitKind& commit_kind) { +Status CheckConflicts(const ConflictDetection& detection, + const std::vector& base_entries, + const std::vector& delta_entries, + const Snapshot::CommitKind& commit_kind) { return detection.CheckConflicts(MakeSnapshot(commit_kind), base_entries, delta_entries, /*delta_index_entries=*/{}, - /*row_id_column_conflict_checker=*/std::nullopt, - commit_kind); + /*row_id_column_conflict_checker=*/std::nullopt, commit_kind); } -Status CheckConflicts( - const ConflictDetection& detection, - const std::vector& base_entries, - const std::vector& delta_entries, - const std::vector& delta_index_entries, - const Snapshot::CommitKind& commit_kind) { +Status CheckConflicts(const ConflictDetection& detection, + const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, + const Snapshot::CommitKind& commit_kind) { return detection.CheckConflicts(MakeSnapshot(commit_kind), base_entries, delta_entries, delta_index_entries, - /*row_id_column_conflict_checker=*/std::nullopt, - commit_kind); + /*row_id_column_conflict_checker=*/std::nullopt, commit_kind); } } // namespace @@ -117,8 +112,8 @@ class ConflictDetectionTest : public testing::Test { ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, const FileKind& kind, const BinaryRow& min_key, - const BinaryRow& max_key, int32_t level, - int32_t bucket = 0, int32_t total_buckets = 2) const { + const BinaryRow& max_key, int32_t level, int32_t bucket = 0, + int32_t total_buckets = 2) const { auto data_file_meta = std::make_shared( file_name, 1024, 8, min_key, max_key, SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, /*max_seq_no=*/32, @@ -136,8 +131,7 @@ class ConflictDetectionTest : public testing::Test { ManifestEntry CreateManifestEntryWithFirstRowId(const std::string& file_name, const BinaryRow& partition, const FileKind& kind, int32_t bucket, - int64_t first_row_id, - int64_t row_count) const { + int64_t first_row_id, int64_t row_count) const { auto data_file_meta = std::make_shared( file_name, 1024, row_count, DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, @@ -154,9 +148,9 @@ class ConflictDetectionTest : public testing::Test { } IndexManifestEntry CreateGlobalIndexEntry(const std::string& file_name, - const BinaryRow& partition, int32_t bucket, - int64_t row_range_start, - int64_t row_range_end) const { + const BinaryRow& partition, int32_t bucket, + int64_t row_range_start, + int64_t row_range_end) const { GlobalIndexMeta global_index_meta(row_range_start, row_range_end, /*index_field_id=*/1, /*extra_field_ids=*/std::nullopt, std::make_shared("meta", GetDefaultPool().get())); @@ -192,8 +186,7 @@ TEST_F(ConflictDetectionTest, TestFileDeletionConflicts) { std::vector changes; changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -203,18 +196,18 @@ TEST_F(ConflictDetectionTest, TestFileDeletionConflicts) { changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); changes.push_back(CreateManifestEntry("f3", FileKind::Add())); - ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append()), - "Trying to delete file f1"); + ASSERT_NOK_WITH_MSG( + CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append()), + "Trying to delete file f1"); } { std::vector base_entries; std::vector changes; changes.push_back(CreateManifestEntry("f1", FileKind::Delete())); - ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append()), - "Trying to delete file f1"); + ASSERT_NOK_WITH_MSG( + CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append()), + "Trying to delete file f1"); } } @@ -229,26 +222,25 @@ TEST_F(ConflictDetectionTest, TestGlobalIndexRowIdExistenceConflicts) { const BinaryRow partition = CreateIntRow(10); std::vector base_entries; - base_entries.push_back(CreateManifestEntryWithFirstRowId( - "base-1", partition, FileKind::Add(), /*bucket=*/0, /*first_row_id=*/0, - /*row_count=*/10)); - base_entries.push_back(CreateManifestEntryWithFirstRowId( - "base-2", partition, FileKind::Add(), /*bucket=*/0, /*first_row_id=*/10, - /*row_count=*/10)); + base_entries.push_back(CreateManifestEntryWithFirstRowId("base-1", partition, FileKind::Add(), + /*bucket=*/0, /*first_row_id=*/0, + /*row_count=*/10)); + base_entries.push_back(CreateManifestEntryWithFirstRowId("base-2", partition, FileKind::Add(), + /*bucket=*/0, /*first_row_id=*/10, + /*row_count=*/10)); std::vector changes; - ASSERT_OK(CheckConflicts(detection, - base_entries, changes, - {CreateGlobalIndexEntry("global-index-covered", partition, /*bucket=*/0, - /*row_range_start=*/0, /*row_range_end=*/19)}, - Snapshot::CommitKind::Append())); + ASSERT_OK( + CheckConflicts(detection, base_entries, changes, + {CreateGlobalIndexEntry("global-index-covered", partition, /*bucket=*/0, + /*row_range_start=*/0, /*row_range_end=*/19)}, + Snapshot::CommitKind::Append())); ASSERT_NOK_WITH_MSG( - CheckConflicts(detection, - base_entries, changes, - {CreateGlobalIndexEntry("global-index-missing", partition, /*bucket=*/0, - /*row_range_start=*/0, /*row_range_end=*/20)}, - Snapshot::CommitKind::Append()), + CheckConflicts(detection, base_entries, changes, + {CreateGlobalIndexEntry("global-index-missing", partition, /*bucket=*/0, + /*row_range_start=*/0, /*row_range_end=*/20)}, + Snapshot::CommitKind::Append()), "Global index row ID existence conflict"); } @@ -273,8 +265,7 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSame) { DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, /*total_buckets=*/4)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); @@ -289,9 +280,9 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSame) { DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, /*total_buckets=*/4)); - ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append()), - "Total buckets of partition"); + ASSERT_NOK_WITH_MSG( + CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append()), + "Total buckets of partition"); } { ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); @@ -306,8 +297,7 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSame) { DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, /*total_buckets=*/4)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); @@ -322,8 +312,8 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSame) { DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, /*total_buckets=*/4)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Overwrite())); + ASSERT_OK( + CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Overwrite())); } } @@ -338,12 +328,12 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSameHelpers) { const BinaryRow partition = CreateIntRow(10); std::vector changes; changes.push_back(CreateManifestEntry("delta-1", partition, FileKind::Add(), - DataFileMeta::EmptyMinKey(), - DataFileMeta::EmptyMaxKey(), /*level=*/1, + DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + /*level=*/1, /*bucket=*/0, /*total_buckets=*/4)); changes.push_back(CreateManifestEntry("delta-2", partition, FileKind::Add(), - DataFileMeta::EmptyMinKey(), - DataFileMeta::EmptyMaxKey(), /*level=*/1, + DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + /*level=*/1, /*bucket=*/1, /*total_buckets=*/4)); std::unordered_map expected_total_buckets; @@ -362,8 +352,7 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSameHelpers) { ConflictDetection mismatch_detection(table_schema, core_options, nullptr, nullptr, nullptr); ASSERT_NOK_WITH_MSG( - mismatch_detection.CheckSameBucketByTotalBuckets(expected_total_buckets, - {{partition, 2}}), + mismatch_detection.CheckSameBucketByTotalBuckets(expected_total_buckets, {{partition, 2}}), "Total buckets of partition"); } @@ -378,12 +367,12 @@ TEST_F(ConflictDetectionTest, TestCollectUncheckedBucketPartitionsMismatch) { const BinaryRow partition = CreateIntRow(10); std::vector changes; changes.push_back(CreateManifestEntry("delta-1", partition, FileKind::Add(), - DataFileMeta::EmptyMinKey(), - DataFileMeta::EmptyMaxKey(), /*level=*/1, + DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + /*level=*/1, /*bucket=*/0, /*total_buckets=*/2)); changes.push_back(CreateManifestEntry("delta-2", partition, FileKind::Add(), - DataFileMeta::EmptyMinKey(), - DataFileMeta::EmptyMaxKey(), /*level=*/1, + DataFileMeta::EmptyMinKey(), DataFileMeta::EmptyMaxKey(), + /*level=*/1, /*bucket=*/1, /*total_buckets=*/4)); std::unordered_map total_buckets; @@ -415,9 +404,9 @@ TEST_F(ConflictDetectionTest, TestBucketKeepSameCacheEviction) { } std::vector evicted_partition_changes; - evicted_partition_changes.push_back(CreateManifestEntry( - "delta", CreateIntRow(0), FileKind::Add(), DataFileMeta::EmptyMinKey(), - DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, total_buckets)); + evicted_partition_changes.push_back( + CreateManifestEntry("delta", CreateIntRow(0), FileKind::Add(), DataFileMeta::EmptyMinKey(), + DataFileMeta::EmptyMaxKey(), /*level=*/1, /*bucket=*/0, total_buckets)); std::unordered_map evicted_partition_buckets; ASSERT_OK(detection.CollectUncheckedBucketPartitions(evicted_partition_changes, &evicted_partition_buckets)); @@ -431,8 +420,7 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { ASSERT_OK_AND_ASSIGN( std::shared_ptr table_schema, TableSchema::Create(/*schema_id=*/0, arrow::schema(fields), /*partition_keys=*/{"f1"}, - /*primary_keys=*/{"f1", "f0"}, - {{Options::BUCKET, "4"}})); + /*primary_keys=*/{"f1", "f0"}, {{Options::BUCKET, "4"}})); ASSERT_OK_AND_ASSIGN(CoreOptions core_options, CoreOptions::FromMap({{Options::BUCKET, "4"}})); ConflictDetection detection(table_schema, core_options, nullptr, nullptr, nullptr); @@ -443,11 +431,11 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { CreateIntRow(1), CreateIntRow(3), /*level=*/1)); std::vector changes; - changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), - CreateIntRow(3), CreateIntRow(5), /*level=*/1)); - ASSERT_NOK_WITH_MSG(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append()), - "LSM conflicts detected"); + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), CreateIntRow(3), + CreateIntRow(5), /*level=*/1)); + ASSERT_NOK_WITH_MSG( + CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append()), + "LSM conflicts detected"); } { std::vector base_entries; @@ -455,10 +443,9 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { CreateIntRow(1), CreateIntRow(3), /*level=*/1)); std::vector changes; - changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), - CreateIntRow(4), CreateIntRow(5), /*level=*/1)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), CreateIntRow(4), + CreateIntRow(5), /*level=*/1)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -466,10 +453,9 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { CreateIntRow(1), CreateIntRow(3), /*level=*/0)); std::vector changes; - changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), - CreateIntRow(2), CreateIntRow(5), /*level=*/0)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), CreateIntRow(2), + CreateIntRow(5), /*level=*/0)); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -477,11 +463,10 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { CreateIntRow(1), CreateIntRow(3), /*level=*/1, /*bucket=*/0)); std::vector changes; - changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), - CreateIntRow(2), CreateIntRow(5), /*level=*/1, + changes.push_back(CreateManifestEntry("delta", partition, FileKind::Add(), CreateIntRow(2), + CreateIntRow(5), /*level=*/1, /*bucket=*/1)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -491,8 +476,7 @@ TEST_F(ConflictDetectionTest, TestCheckLsmKeyRangeConflict) { std::vector changes; changes.push_back(CreateManifestEntry("delta", CreateIntRow(20), FileKind::Add(), CreateIntRow(2), CreateIntRow(5), /*level=*/1)); - ASSERT_OK(CheckConflicts(detection, base_entries, changes, - Snapshot::CommitKind::Append())); + ASSERT_OK(CheckConflicts(detection, base_entries, changes, Snapshot::CommitKind::Append())); } } diff --git a/src/paimon/core/operation/commit/manifest_entry_changes.cpp b/src/paimon/core/operation/commit/manifest_entry_changes.cpp index 9e7381275..2670f1c52 100644 --- a/src/paimon/core/operation/commit/manifest_entry_changes.cpp +++ b/src/paimon/core/operation/commit/manifest_entry_changes.cpp @@ -130,8 +130,8 @@ ManifestEntry ManifestEntryChanges::MakeEntry( int32_t total_buckets = commit_message->TotalBuckets() == std::nullopt ? default_num_bucket_ : commit_message->TotalBuckets().value(); - return ManifestEntry(kind, commit_message->Partition(), commit_message->Bucket(), - total_buckets, file); + return ManifestEntry(kind, commit_message->Partition(), commit_message->Bucket(), total_buckets, + file); } } // namespace paimon diff --git a/src/paimon/core/operation/commit/manifest_entry_changes.h b/src/paimon/core/operation/commit/manifest_entry_changes.h index 84e9b2b91..ecaaa62b7 100644 --- a/src/paimon/core/operation/commit/manifest_entry_changes.h +++ b/src/paimon/core/operation/commit/manifest_entry_changes.h @@ -58,9 +58,9 @@ class ManifestEntryChanges { std::vector compact_index_files; private: - ManifestEntry MakeEntry(const FileKind& kind, - const std::shared_ptr& commit_message, - const std::shared_ptr& file) const; + ManifestEntry MakeEntry(const FileKind& kind, + const std::shared_ptr& commit_message, + const std::shared_ptr& file) const; private: int32_t default_num_bucket_; diff --git a/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp b/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp index c1f224e8f..7b4b40bcb 100644 --- a/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp +++ b/src/paimon/core/operation/commit/manifest_entry_changes_test.cpp @@ -16,6 +16,7 @@ #include "paimon/core/operation/commit/manifest_entry_changes.h" +#include #include #include #include @@ -31,15 +32,13 @@ #include "paimon/core/manifest/file_kind.h" #include "paimon/core/manifest/index_manifest_entry.h" #include "paimon/core/manifest/manifest_entry.h" -#include "paimon/core/table/sink/commit_message_impl.h" #include "paimon/core/stats/simple_stats.h" +#include "paimon/core/table/sink/commit_message_impl.h" #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/memory/bytes.h" #include "paimon/testing/utils/testharness.h" -#include - namespace paimon::test { class ManifestEntryChangesTest : public testing::Test { @@ -67,8 +66,8 @@ class ManifestEntryChangesTest : public testing::Test { /*write_cols=*/std::nullopt); } - std::shared_ptr CreateIndexFileMeta(const std::string& file_name, - const std::string& index_type = "bitmap") const { + std::shared_ptr CreateIndexFileMeta( + const std::string& file_name, const std::string& index_type = "bitmap") const { return std::make_shared( index_type, file_name, /*file_size=*/100, /*row_count=*/5, /*dv_ranges=*/std::nullopt, /*external_path=*/std::nullopt, @@ -89,16 +88,12 @@ TEST_F(ManifestEntryChangesTest, TestCollectAndSummary) { const BinaryRow partition = CreateIntRow(10); DataIncrement data_increment( - {CreateDataFileMeta("append-add")}, - {CreateDataFileMeta("append-del")}, - {CreateDataFileMeta("append-changelog")}, - {CreateIndexFileMeta("append-index-add")}, + {CreateDataFileMeta("append-add")}, {CreateDataFileMeta("append-del")}, + {CreateDataFileMeta("append-changelog")}, {CreateIndexFileMeta("append-index-add")}, {CreateIndexFileMeta("append-index-del")}); CompactIncrement compact_increment( - {CreateDataFileMeta("compact-before")}, - {CreateDataFileMeta("compact-after")}, - {CreateDataFileMeta("compact-changelog")}, - {CreateIndexFileMeta("compact-index-add")}, + {CreateDataFileMeta("compact-before")}, {CreateDataFileMeta("compact-after")}, + {CreateDataFileMeta("compact-changelog")}, {CreateIndexFileMeta("compact-index-add")}, {CreateIndexFileMeta("compact-index-del")}); std::shared_ptr message = std::make_shared( diff --git a/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp b/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp index 1acb0ea0f..61a8cfc8f 100644 --- a/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp +++ b/src/paimon/core/operation/commit/row_id_column_conflict_checker.cpp @@ -37,8 +37,8 @@ struct DataFileRangeItem { Result> RowIdColumnConflictChecker::FromManifestEntries( const std::shared_ptr& schema_manager, const std::vector& delta_entries) { - auto checker = std::shared_ptr( - new RowIdColumnConflictChecker(schema_manager)); + auto checker = + std::shared_ptr(new RowIdColumnConflictChecker(schema_manager)); std::vector row_id_files; row_id_files.reserve(delta_entries.size()); @@ -105,8 +105,8 @@ Status RowIdColumnConflictChecker::BuildWriteRanges( return Status::OK(); } -Status RowIdColumnConflictChecker::AddWriteFieldIds( - std::unordered_set* field_ids, const DataFileMeta& file) { +Status RowIdColumnConflictChecker::AddWriteFieldIds(std::unordered_set* field_ids, + const DataFileMeta& file) { if (!file.write_cols.has_value()) { const std::map* field_id_by_name = nullptr; PAIMON_RETURN_NOT_OK(FieldIdByName(file.schema_id, &field_id_by_name)); @@ -185,9 +185,8 @@ Result> RowIdColumnConflictChecker::ResolveFieldId( return std::optional(); } - return Status::Invalid( - "Cannot find write column '" + write_col + "' in schema " + - std::to_string(file.schema_id) + "."); + return Status::Invalid("Cannot find write column '" + write_col + "' in schema " + + std::to_string(file.schema_id) + "."); } Status RowIdColumnConflictChecker::FieldIdByName( diff --git a/src/paimon/core/operation/file_store_commit_impl.cpp b/src/paimon/core/operation/file_store_commit_impl.cpp index 5bb914e8d..39bd674cc 100644 --- a/src/paimon/core/operation/file_store_commit_impl.cpp +++ b/src/paimon/core/operation/file_store_commit_impl.cpp @@ -126,8 +126,7 @@ FileStoreCommitImpl::FileStoreCommitImpl( ignore_empty_commit_(ignore_empty_commit), num_bucket_(options.GetBucket()), table_schema_(table_schema), - conflict_detection_(table_schema, options, snapshot_manager_, manifest_list, - manifest_file), + conflict_detection_(table_schema, options, snapshot_manager_, manifest_list, manifest_file), manifest_file_(manifest_file), manifest_list_(manifest_list), index_manifest_file_(index_manifest_file), @@ -295,11 +294,10 @@ Status FileStoreCommitImpl::Overwrite( PAIMON_RETURN_NOT_OK(TryOverwrite(partitions, changes.append_table_files, changes.append_index_files, identifier, watermark)); if (changes.HasCompactChanges()) { - PAIMON_RETURN_NOT_OK(TryCommit(changes.compact_table_files, - changes.compact_index_files, identifier, watermark, - /*log_offsets=*/{}, /*properties=*/{}, - Snapshot::CommitKind::Compact(), - /*check_append_files=*/true)); + PAIMON_RETURN_NOT_OK(TryCommit( + changes.compact_table_files, changes.compact_index_files, identifier, watermark, + /*log_offsets=*/{}, /*properties=*/{}, Snapshot::CommitKind::Compact(), + /*check_append_files=*/true)); } return Status::OK(); } @@ -321,9 +319,8 @@ Result FileStoreCommitImpl::FilterAndOverwrite( PAIMON_RETURN_NOT_OK(TryOverwrite(partitions, changes.append_table_files, changes.append_index_files, identifier, watermark)); if (changes.HasCompactChanges()) { - PAIMON_RETURN_NOT_OK(TryCommit(changes.compact_table_files, - changes.compact_index_files, identifier, - watermark, /*log_offsets=*/{}, + PAIMON_RETURN_NOT_OK(TryCommit(changes.compact_table_files, changes.compact_index_files, + identifier, watermark, /*log_offsets=*/{}, /*properties=*/{}, Snapshot::CommitKind::Compact(), /*check_append_files=*/true)); } @@ -388,9 +385,8 @@ Result> FileStoreCommitImpl::GetAllIndexFiles( Status FileStoreCommitImpl::TryOverwrite( const std::vector>& partitions, - const std::vector& changes, - const std::vector& index_entries, int64_t commit_identifier, - std::optional watermark) { + const std::vector& changes, const std::vector& index_entries, + int64_t commit_identifier, std::optional watermark) { int32_t retry_count = 0; std::optional retry_start_snapshot_id; while (true) { @@ -415,13 +411,13 @@ Status FileStoreCommitImpl::TryOverwrite( } } changes_with_overwrite.insert(changes_with_overwrite.end(), changes.begin(), changes.end()); - PAIMON_ASSIGN_OR_RAISE(bool commit_success, - TryCommitOnce(changes_with_overwrite, index_entries_with_overwrite, - commit_identifier, watermark, - /*log_offsets=*/{}, /*properties=*/{}, - Snapshot::CommitKind::Overwrite(), latest_snapshot, - /*need_conflict_check=*/true, - retry_start_snapshot_id)); + PAIMON_ASSIGN_OR_RAISE( + bool commit_success, + TryCommitOnce(changes_with_overwrite, index_entries_with_overwrite, commit_identifier, + watermark, + /*log_offsets=*/{}, /*properties=*/{}, Snapshot::CommitKind::Overwrite(), + latest_snapshot, + /*need_conflict_check=*/true, retry_start_snapshot_id)); if (commit_success) { break; } @@ -451,7 +447,7 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c if (!ignore_empty_commit_ || changes.HasAppendChanges()) { Snapshot::CommitKind append_commit_kind = Snapshot::CommitKind::Append(); if (conflict_detection_.ShouldBeOverwriteCommit(changes.append_table_files, - changes.append_index_files)) { + changes.append_index_files)) { append_commit_kind = Snapshot::CommitKind::Overwrite(); check_append_files = true; } @@ -460,8 +456,7 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c } PAIMON_ASSIGN_OR_RAISE(int32_t cnt, - TryCommit(changes.append_table_files, - changes.append_index_files, + TryCommit(changes.append_table_files, changes.append_index_files, committable->Identifier(), committable->Watermark(), committable->LogOffsets(), committable->Properties(), append_commit_kind, check_append_files)); @@ -471,8 +466,7 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c if (changes.HasCompactChanges()) { PAIMON_ASSIGN_OR_RAISE( - int32_t cnt, TryCommit(changes.compact_table_files, - changes.compact_index_files, + int32_t cnt, TryCommit(changes.compact_table_files, changes.compact_index_files, committable->Identifier(), committable->Watermark(), committable->LogOffsets(), committable->Properties(), Snapshot::CommitKind::Compact(), /*check_append_files=*/true)); @@ -514,12 +508,12 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c RowCounts(changes.compact_table_files)); metrics_->SetCounter(CommitMetrics::LAST_CHANGELOG_RECORDS_COMMIT_COMPACTED, RowCounts(changes.compact_changelog)); - metrics_->SetCounter(CommitMetrics::LAST_PARTITIONS_WRITTEN, - NumChangedPartitions( - {changes.append_table_files, changes.compact_table_files})); - metrics_->SetCounter(CommitMetrics::LAST_BUCKETS_WRITTEN, - NumChangedBuckets({changes.append_table_files, - changes.compact_table_files})); + metrics_->SetCounter( + CommitMetrics::LAST_PARTITIONS_WRITTEN, + NumChangedPartitions({changes.append_table_files, changes.compact_table_files})); + metrics_->SetCounter( + CommitMetrics::LAST_BUCKETS_WRITTEN, + NumChangedBuckets({changes.append_table_files, changes.compact_table_files})); metrics_->SetCounter(CommitMetrics::LAST_COMPACTION_INPUT_FILE_SIZE, compaction_input_file_size); metrics_->SetCounter(CommitMetrics::LAST_COMPACTION_OUTPUT_FILE_SIZE, @@ -548,11 +542,10 @@ Result FileStoreCommitImpl::TryCommit(const std::vector& while (true) { PAIMON_ASSIGN_OR_RAISE(std::optional latest_snapshot, snapshot_manager_->LatestSnapshot()); - PAIMON_ASSIGN_OR_RAISE( - bool commit_success, - TryCommitOnce(delta_files, index_entries, identifier, watermark, log_offsets, - properties, commit_kind, latest_snapshot, check_append_files, - retry_start_snapshot_id)); + PAIMON_ASSIGN_OR_RAISE(bool commit_success, + TryCommitOnce(delta_files, index_entries, identifier, watermark, + log_offsets, properties, commit_kind, latest_snapshot, + check_append_files, retry_start_snapshot_id)); if (commit_success) { break; } @@ -617,10 +610,10 @@ Result> FileStoreCommitImpl::ReadAllEntriesFromChange return plan->Files(); } -Result FileStoreCommitImpl::CheckCommitted( - const std::optional& latest_snapshot, - std::optional retry_start_snapshot_id, int64_t identifier, - const Snapshot::CommitKind& commit_kind) const { +Result FileStoreCommitImpl::CheckCommitted(const std::optional& latest_snapshot, + std::optional retry_start_snapshot_id, + int64_t identifier, + const Snapshot::CommitKind& commit_kind) const { if (!latest_snapshot || !retry_start_snapshot_id || retry_start_snapshot_id.value() > latest_snapshot.value().Id()) { return false; @@ -629,8 +622,7 @@ Result FileStoreCommitImpl::CheckCommitted( for (int64_t snapshot_id = retry_start_snapshot_id.value(); snapshot_id <= latest_snapshot.value().Id(); ++snapshot_id) { PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, snapshot_manager_->LoadSnapshot(snapshot_id)); - if (snapshot.CommitUser() == commit_user_ && - snapshot.CommitIdentifier() == identifier && + if (snapshot.CommitUser() == commit_user_ && snapshot.CommitIdentifier() == identifier && snapshot.GetCommitKind() == commit_kind) { return true; } @@ -638,12 +630,11 @@ Result FileStoreCommitImpl::CheckCommitted( return false; } -Status FileStoreCommitImpl::NoConflictsOrFail(const std::string& base_commit_user, - const std::vector& base_entries, - const std::vector& delta_entries, - const std::vector& delta_index_entries, - const Snapshot& latest_snapshot, - const Snapshot::CommitKind& commit_kind) const { +Status FileStoreCommitImpl::NoConflictsOrFail( + const std::string& base_commit_user, const std::vector& base_entries, + const std::vector& delta_entries, + const std::vector& delta_index_entries, const Snapshot& latest_snapshot, + const Snapshot::CommitKind& commit_kind) const { ScopeGuard guard([&]() { PAIMON_LOG_WARN(logger_, "File deletion conflicts detected! Give up committing. %s", base_commit_user.c_str()); @@ -651,9 +642,8 @@ Status FileStoreCommitImpl::NoConflictsOrFail(const std::string& base_commit_use std::optional> row_id_column_conflict_checker = std::nullopt; - PAIMON_ASSIGN_OR_RAISE(auto checker, - RowIdColumnConflictChecker::FromManifestEntries(schema_manager_, - delta_entries)); + PAIMON_ASSIGN_OR_RAISE(auto checker, RowIdColumnConflictChecker::FromManifestEntries( + schema_manager_, delta_entries)); row_id_column_conflict_checker = checker; PAIMON_RETURN_NOT_OK(conflict_detection_.CheckConflicts( @@ -669,8 +659,7 @@ Status FileStoreCommitImpl::NoConflictsOrFail(const std::string& base_commit_use const Snapshot& latest_snapshot, const Snapshot::CommitKind& commit_kind) const { return NoConflictsOrFail(base_commit_user, base_entries, delta_entries, - /*delta_index_entries=*/{}, latest_snapshot, - commit_kind); + /*delta_index_entries=*/{}, latest_snapshot, commit_kind); } Result FileStoreCommitImpl::TryCommitOnce( @@ -680,9 +669,8 @@ Result FileStoreCommitImpl::TryCommitOnce( const std::map& properties, Snapshot::CommitKind commit_kind, const std::optional& latest_snapshot, bool need_conflict_check, std::optional retry_start_snapshot_id) { - PAIMON_ASSIGN_OR_RAISE(bool committed, - CheckCommitted(latest_snapshot, retry_start_snapshot_id, identifier, - commit_kind)); + PAIMON_ASSIGN_OR_RAISE(bool committed, CheckCommitted(latest_snapshot, retry_start_snapshot_id, + identifier, commit_kind)); if (committed) { return true; } @@ -710,9 +698,9 @@ Result FileStoreCommitImpl::TryCommitOnce( PAIMON_ASSIGN_OR_RAISE( std::vector base_data_files, ReadAllEntriesFromChangedPartitions(latest_snapshot.value(), changed_partitions)); - PAIMON_RETURN_NOT_OK( - NoConflictsOrFail(latest_snapshot.value().CommitUser(), base_data_files, delta_files, - index_entries, latest_snapshot.value(), commit_kind)); + PAIMON_RETURN_NOT_OK(NoConflictsOrFail(latest_snapshot.value().CommitUser(), + base_data_files, delta_files, index_entries, + latest_snapshot.value(), commit_kind)); } std::vector merge_before_manifests; diff --git a/src/paimon/core/operation/file_store_commit_impl.h b/src/paimon/core/operation/file_store_commit_impl.h index 9ad82ef37..1f7cfd34f 100644 --- a/src/paimon/core/operation/file_store_commit_impl.h +++ b/src/paimon/core/operation/file_store_commit_impl.h @@ -28,8 +28,8 @@ #include "paimon/core/catalog/snapshot_commit.h" #include "paimon/core/core_options.h" #include "paimon/core/manifest/partition_entry.h" -#include "paimon/core/operation/commit/manifest_entry_changes.h" #include "paimon/core/operation/commit/conflict_detection.h" +#include "paimon/core/operation/commit/manifest_entry_changes.h" #include "paimon/core/operation/commit/row_id_column_conflict_checker.h" #include "paimon/core/snapshot.h" #include "paimon/file_store_commit.h" @@ -189,8 +189,7 @@ class FileStoreCommitImpl : public FileStoreCommit { const std::set>& partitions) const; Result CheckCommitted(const std::optional& latest_snapshot, - std::optional retry_start_snapshot_id, - int64_t identifier, + std::optional retry_start_snapshot_id, int64_t identifier, const Snapshot::CommitKind& commit_kind) const; Status NoConflictsOrFail(const std::string& base_commit_user, diff --git a/src/paimon/core/operation/file_store_commit_impl_test.cpp b/src/paimon/core/operation/file_store_commit_impl_test.cpp index 3b0699d7a..6e9b4dbd0 100644 --- a/src/paimon/core/operation/file_store_commit_impl_test.cpp +++ b/src/paimon/core/operation/file_store_commit_impl_test.cpp @@ -52,18 +52,18 @@ #include "paimon/core/manifest/manifest_list.h" #include "paimon/core/operation/metrics/commit_metrics.h" #include "paimon/core/partition/partition_statistics.h" -#include "paimon/core/stats/simple_stats.h" #include "paimon/core/schema/table_schema.h" +#include "paimon/core/stats/simple_stats.h" #include "paimon/core/table/sink/commit_message_impl.h" #include "paimon/core/utils/file_utils.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/factories/factory_creator.h" -#include "paimon/memory/bytes.h" #include "paimon/fs/file_system.h" #include "paimon/fs/local/local_file_system.h" #include "paimon/fs/local/local_file_system_factory.h" +#include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" #include "paimon/metrics.h" #include "paimon/testing/utils/binary_row_generator.h" @@ -76,8 +76,7 @@ namespace paimon::test { namespace { Snapshot MakeSnapshot(const std::optional& next_row_id, - const Snapshot::CommitKind& commit_kind = - Snapshot::CommitKind::Append()) { + const Snapshot::CommitKind& commit_kind = Snapshot::CommitKind::Append()) { return Snapshot( /*id=*/1, /*schema_id=*/1, @@ -89,8 +88,7 @@ Snapshot MakeSnapshot(const std::optional& next_row_id, /*changelog_manifest_list_size=*/std::nullopt, /*index_manifest=*/std::nullopt, /*commit_user=*/"test-user", - /*commit_identifier=*/1, - commit_kind, + /*commit_identifier=*/1, commit_kind, /*time_millis=*/0, /*log_offsets=*/std::nullopt, /*total_record_count=*/std::nullopt, @@ -98,8 +96,7 @@ Snapshot MakeSnapshot(const std::optional& next_row_id, /*changelog_record_count=*/std::nullopt, /*watermark=*/std::nullopt, /*statistics=*/std::nullopt, - /*properties=*/std::nullopt, - next_row_id); + /*properties=*/std::nullopt, next_row_id); } } // namespace @@ -201,8 +198,8 @@ class FileStoreCommitImplTest : public testing::Test { ManifestEntry CreateManifestEntry(const std::string& file_name, const BinaryRow& partition, const FileKind& kind, const BinaryRow& min_key, - const BinaryRow& max_key, int32_t level, - int32_t bucket = 0, int32_t total_buckets = 2) const { + const BinaryRow& max_key, int32_t level, int32_t bucket = 0, + int32_t total_buckets = 2) const { auto data_file_meta = std::make_shared( file_name, 1024, 8, min_key, max_key, SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), /*min_seq_no=*/16, /*max_seq_no=*/32, @@ -1011,9 +1008,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Delete())); changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); - ASSERT_OK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1030,9 +1027,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Delete())); changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); - ASSERT_OK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1044,9 +1041,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1062,9 +1059,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1080,9 +1077,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file3", FileKind::Add())); changes.push_back(CreateManifestEntry("file4", FileKind::Add())); changes.push_back(CreateManifestEntry("file5", FileKind::Add())); - ASSERT_NOK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1090,9 +1087,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { base_entries.push_back(CreateManifestEntry("file1", FileKind::Delete())); std::vector changes; - ASSERT_OK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_OK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1103,9 +1100,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { base_entries.push_back(CreateManifestEntry("file5", FileKind::Delete())); std::vector changes; - ASSERT_NOK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } { std::vector base_entries; @@ -1122,9 +1119,9 @@ TEST_F(FileStoreCommitImplTest, TestCheckConflict) { changes.push_back(CreateManifestEntry("file4", FileKind::Delete())); changes.push_back(CreateManifestEntry("file5", FileKind::Delete())); changes.push_back(CreateManifestEntry("file6", FileKind::Delete())); - ASSERT_NOK(commit_impl->NoConflictsOrFail( - "commit_user_1", base_entries, changes, MakeSnapshot(std::nullopt), - Snapshot::CommitKind::Append())); + ASSERT_NOK(commit_impl->NoConflictsOrFail("commit_user_1", base_entries, changes, + MakeSnapshot(std::nullopt), + Snapshot::CommitKind::Append())); } } @@ -1603,8 +1600,7 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithIndexFiles) { new_index_files.push_back(CreateIndexFileMeta("bitmap-index-commit-1")); DataIncrement data_increment({}, {}, {}, std::move(new_index_files), {}); std::shared_ptr msg = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, - CompactIncrement({}, {}, {})); + partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Commit({msg}, 1)); ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); @@ -1634,9 +1630,9 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithGlobalIndexFilesChecksConflicts) { const BinaryRow partition = CreateIntRow(10); DataIncrement data_increment_1({CreateAppendDataFileMeta("data-with-row-id", 10)}, {}, {}); - std::shared_ptr msg_1 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_1 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_1, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Commit({msg_1}, 1)); std::vector> global_index_files; @@ -1644,12 +1640,11 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithGlobalIndexFilesChecksConflicts) { /*row_range_start=*/0, /*row_range_end=*/10)); DataIncrement data_increment_2({}, {}, {}, std::move(global_index_files), {}); - std::shared_ptr msg_2 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_2 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_2, CompactIncrement({}, {}, {})); - ASSERT_NOK_WITH_MSG(commit_impl->Commit({msg_2}, 2), - "Global index row ID existence conflict"); + ASSERT_NOK_WITH_MSG(commit_impl->Commit({msg_2}, 2), "Global index row ID existence conflict"); } TEST_F(FileStoreCommitImplTest, TestCommitWithCompactIndexFiles) { @@ -1701,17 +1696,17 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithDeletedIndexFiles) { std::vector> new_index_files; new_index_files.push_back(CreateIndexFileMeta("bitmap-index-delete-1")); DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files), {}); - std::shared_ptr msg_1 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_1 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_1, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Commit({msg_1}, 1)); std::vector> deleted_index_files; deleted_index_files.push_back(CreateIndexFileMeta("bitmap-index-delete-1")); DataIncrement data_increment_2({}, {}, {}, {}, std::move(deleted_index_files)); - std::shared_ptr msg_2 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_2 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_2, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Commit({msg_2}, 2)); ASSERT_OK_AND_ASSIGN(Snapshot snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); @@ -1740,9 +1735,9 @@ TEST_F(FileStoreCommitImplTest, TestCommitWithCompactDeletedIndexFiles) { std::vector> new_index_files; new_index_files.push_back(CreateIndexFileMeta("bitmap-index-compact-delete-1")); DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files), {}); - std::shared_ptr msg_1 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_1 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_1, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Commit({msg_1}, 1)); std::vector> deleted_index_files; @@ -1779,34 +1774,32 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteWithIndexFiles) { std::vector> new_index_files_1; new_index_files_1.push_back(CreateIndexFileMeta("bitmap-index-1")); DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files_1), {}); - std::shared_ptr msg_1 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_1 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_1, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Overwrite({}, {msg_1}, 1)); ASSERT_OK_AND_ASSIGN(auto snapshot1, commit_impl->snapshot_manager_->LatestSnapshot()); ASSERT_TRUE(snapshot1.value().IndexManifest()); std::vector index_entries_1; ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot1.value().IndexManifest().value(), - /*filter=*/nullptr, - &index_entries_1)); + /*filter=*/nullptr, &index_entries_1)); ASSERT_EQ(1u, index_entries_1.size()); ASSERT_EQ("bitmap-index-1", index_entries_1[0].index_file->FileName()); std::vector> new_index_files_2; new_index_files_2.push_back(CreateIndexFileMeta("bitmap-index-2")); DataIncrement data_increment_2({}, {}, {}, std::move(new_index_files_2), {}); - std::shared_ptr msg_2 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_2 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_2, CompactIncrement({}, {}, {})); ASSERT_OK(commit_impl->Overwrite({}, {msg_2}, 2)); ASSERT_OK_AND_ASSIGN(auto snapshot2, commit_impl->snapshot_manager_->LatestSnapshot()); ASSERT_TRUE(snapshot2.value().IndexManifest()); std::vector index_entries_2; ASSERT_OK(commit_impl->index_manifest_file_->Read(snapshot2.value().IndexManifest().value(), - /*filter=*/nullptr, - &index_entries_2)); + /*filter=*/nullptr, &index_entries_2)); ASSERT_EQ(1u, index_entries_2.size()); ASSERT_EQ("bitmap-index-2", index_entries_2[0].index_file->FileName()); } @@ -1833,11 +1826,13 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteWithCompactIndexFiles) { ASSERT_OK(commit_impl->Overwrite({}, {msg}, 1)); - ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, + commit_impl->snapshot_manager_->LoadSnapshot(1)); ASSERT_EQ(Snapshot::CommitKind::Overwrite(), overwrite_snapshot.GetCommitKind()); ASSERT_FALSE(overwrite_snapshot.IndexManifest()); - ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, + commit_impl->snapshot_manager_->LoadSnapshot(2)); ASSERT_EQ(Snapshot::CommitKind::Compact(), compact_snapshot.GetCommitKind()); ASSERT_TRUE(compact_snapshot.IndexManifest()); @@ -1920,9 +1915,9 @@ TEST_F(FileStoreCommitImplTest, TestFilterAndOverwriteWithIndexFiles) { std::vector> new_index_files_1; new_index_files_1.push_back(CreateIndexFileMeta("bitmap-index-filter-1")); DataIncrement data_increment_1({}, {}, {}, std::move(new_index_files_1), {}); - std::shared_ptr msg_1 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_1, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_1 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_1, CompactIncrement({}, {}, {})); ASSERT_OK_AND_ASSIGN(int32_t actual_commit, commit_impl->FilterAndOverwrite({}, {msg_1}, 1, 10)); @@ -1933,9 +1928,9 @@ TEST_F(FileStoreCommitImplTest, TestFilterAndOverwriteWithIndexFiles) { std::vector> new_index_files_2; new_index_files_2.push_back(CreateIndexFileMeta("bitmap-index-filter-2")); DataIncrement data_increment_2({}, {}, {}, std::move(new_index_files_2), {}); - std::shared_ptr msg_2 = std::make_shared( - partition, /*bucket=*/0, /*total_buckets=*/2, data_increment_2, - CompactIncrement({}, {}, {})); + std::shared_ptr msg_2 = + std::make_shared(partition, /*bucket=*/0, /*total_buckets=*/2, + data_increment_2, CompactIncrement({}, {}, {})); ASSERT_OK_AND_ASSIGN(actual_commit, commit_impl->FilterAndOverwrite({}, {msg_2}, 2, 20)); ASSERT_EQ(1, actual_commit); @@ -1968,19 +1963,20 @@ TEST_F(FileStoreCommitImplTest, TestFilterAndOverwriteWithCompactIndexFiles) { std::shared_ptr msg = std::make_shared( partition, /*bucket=*/0, /*total_buckets=*/2, data_increment, compact_increment); - ASSERT_OK_AND_ASSIGN(int32_t actual_commit, - commit_impl->FilterAndOverwrite({}, {msg}, 1, 10)); + ASSERT_OK_AND_ASSIGN(int32_t actual_commit, commit_impl->FilterAndOverwrite({}, {msg}, 1, 10)); ASSERT_EQ(1, actual_commit); ASSERT_OK_AND_ASSIGN(actual_commit, commit_impl->FilterAndOverwrite({}, {msg}, 1, 5)); ASSERT_EQ(0, actual_commit); - ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(1)); + ASSERT_OK_AND_ASSIGN(Snapshot overwrite_snapshot, + commit_impl->snapshot_manager_->LoadSnapshot(1)); ASSERT_EQ(Snapshot::CommitKind::Overwrite(), overwrite_snapshot.GetCommitKind()); ASSERT_TRUE(overwrite_snapshot.Watermark()); ASSERT_EQ(10, overwrite_snapshot.Watermark().value()); ASSERT_FALSE(overwrite_snapshot.IndexManifest()); - ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, commit_impl->snapshot_manager_->LoadSnapshot(2)); + ASSERT_OK_AND_ASSIGN(Snapshot compact_snapshot, + commit_impl->snapshot_manager_->LoadSnapshot(2)); ASSERT_EQ(Snapshot::CommitKind::Compact(), compact_snapshot.GetCommitKind()); ASSERT_TRUE(compact_snapshot.Watermark()); ASSERT_EQ(10, compact_snapshot.Watermark().value()); @@ -2046,16 +2042,16 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteWithSpecifyPartitionIndexFiles) { std::vector> index_files_partition_10_v1; index_files_partition_10_v1.push_back(CreateIndexFileMeta("bitmap-index-partition-10-v1")); - DataIncrement data_increment_partition_10_v1( - {}, {}, {}, std::move(index_files_partition_10_v1), {}); + DataIncrement data_increment_partition_10_v1({}, {}, {}, std::move(index_files_partition_10_v1), + {}); std::shared_ptr msg_partition_10_v1 = std::make_shared( partition_10, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_10_v1, CompactIncrement({}, {}, {})); std::vector> index_files_partition_20_v1; index_files_partition_20_v1.push_back(CreateIndexFileMeta("bitmap-index-partition-20-v1")); - DataIncrement data_increment_partition_20_v1( - {}, {}, {}, std::move(index_files_partition_20_v1), {}); + DataIncrement data_increment_partition_20_v1({}, {}, {}, std::move(index_files_partition_20_v1), + {}); std::shared_ptr msg_partition_20_v1 = std::make_shared( partition_20, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_20_v1, CompactIncrement({}, {}, {})); @@ -2064,8 +2060,8 @@ TEST_F(FileStoreCommitImplTest, TestOverwriteWithSpecifyPartitionIndexFiles) { std::vector> index_files_partition_10_v2; index_files_partition_10_v2.push_back(CreateIndexFileMeta("bitmap-index-partition-10-v2")); - DataIncrement data_increment_partition_10_v2( - {}, {}, {}, std::move(index_files_partition_10_v2), {}); + DataIncrement data_increment_partition_10_v2({}, {}, {}, std::move(index_files_partition_10_v2), + {}); std::shared_ptr msg_partition_10_v2 = std::make_shared( partition_10, /*bucket=*/0, /*total_buckets=*/2, data_increment_partition_10_v2, CompactIncrement({}, {}, {})); diff --git a/src/paimon/core/operation/file_store_commit_test.cpp b/src/paimon/core/operation/file_store_commit_test.cpp index 6a39670ad..4a20cbe7f 100644 --- a/src/paimon/core/operation/file_store_commit_test.cpp +++ b/src/paimon/core/operation/file_store_commit_test.cpp @@ -28,17 +28,17 @@ #include "paimon/commit_context.h" #include "paimon/common/utils/linked_hash_map.h" #include "paimon/common/utils/path_util.h" -#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" #include "paimon/core/deletionvectors/deletion_vector.h" +#include "paimon/core/deletionvectors/deletion_vectors_index_file.h" #include "paimon/core/index/deletion_vector_meta.h" #include "paimon/core/index/index_file_meta.h" +#include "paimon/core/io/compact_increment.h" +#include "paimon/core/io/data_increment.h" #include "paimon/core/operation/file_store_commit_impl.h" #include "paimon/core/table/sink/commit_message_impl.h" #include "paimon/core/utils/snapshot_manager.h" -#include "paimon/core/io/data_increment.h" -#include "paimon/core/io/compact_increment.h" -#include "paimon/fs/local/local_file_system.h" #include "paimon/defs.h" +#include "paimon/fs/local/local_file_system.h" #include "paimon/result.h" #include "paimon/testing/utils/binary_row_generator.h" #include "paimon/testing/utils/testharness.h" @@ -119,11 +119,10 @@ TEST(FileStoreCommitTest, TestAppendDvIndexShouldUseOverwriteCommitKind) { ASSERT_OK_AND_ASSIGN(auto commit, FileStoreCommit::Create(std::move(commit_context))); LinkedHashMap dv_ranges; - dv_ranges.insert_or_assign( - "data-file-1", - DeletionVectorMeta( - /*data_file_name=*/"data-file-1", /*offset=*/0, /*length=*/10, - /*cardinality=*/1)); + dv_ranges.insert_or_assign("data-file-1", + DeletionVectorMeta( + /*data_file_name=*/"data-file-1", /*offset=*/0, /*length=*/10, + /*cardinality=*/1)); std::vector> new_index_files; new_index_files.push_back(std::make_shared( DeletionVectorsIndexFile::DELETION_VECTORS_INDEX, "dv-index-1", 100, 1,