From dcb53143505ec8cecc2cd57ed666e2f55c09450b Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 7 Apr 2025 23:18:17 +0800 Subject: [PATCH 1/6] feat: snapshot Signed-off-by: Junwang Zhao --- src/iceberg/CMakeLists.txt | 3 +- src/iceberg/snapshot.cc | 123 +++++++++++++++++++++ src/iceberg/snapshot.h | 217 +++++++++++++++++++++++++++++++++++++ src/iceberg/snapshot_ref.h | 57 ++++++++++ test/CMakeLists.txt | 3 +- test/snapshot_test.cc | 130 ++++++++++++++++++++++ 6 files changed, 531 insertions(+), 2 deletions(-) create mode 100644 src/iceberg/snapshot.cc create mode 100644 src/iceberg/snapshot.h create mode 100644 src/iceberg/snapshot_ref.h create mode 100644 test/snapshot_test.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 9b634014b..1a8c33491 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -32,7 +32,8 @@ set(ICEBERG_SOURCES table_metadata.cc transform.cc transform_function.cc - type.cc) + type.cc + snapshot.cc) set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS) set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc new file mode 100644 index 000000000..fdd2a2c21 --- /dev/null +++ b/src/iceberg/snapshot.cc @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/snapshot.h" + +#include "iceberg/util/formatter.h" + +namespace iceberg { + +namespace { +/// \brief Get the relative Operation name +constexpr std::string_view ToString(Operation operation) { + switch (operation) { + case Operation::kAppend: + return "append"; + case Operation::kOverwrite: + return "overwrite"; + case Operation::kReplace: + return "replace"; + case Operation::kDelete: + return "delete"; + default: + return "invalid"; + } +} +} // namespace + +Summary::Summary(Operation op, std::unordered_map props) + : operation_(op), additional_properties_(std::move(props)) {} + +Operation Summary::operation() const { return operation_; } + +const std::unordered_map& Summary::properties() const { + return additional_properties_; +} + +std::string Summary::ToString() const { + std::string repr = + "summary: { operation: " + std::string(iceberg::ToString(operation_)); + for (const auto& [key, value] : additional_properties_) { + repr += ", " + key + ": " + value; + } + repr += "}"; + return repr; +} + +bool Summary::Equals(const Summary& other) const { + return operation_ == other.operation_ && + additional_properties_ == other.additional_properties_; +} + +Snapshot::Snapshot(int64_t snapshot_id, std::optional parent_snapshot_id, + int64_t sequence_number, int64_t timestamp_ms, + std::string manifest_list, std::shared_ptr summary, + std::optional schema_id) + : snapshot_id_(snapshot_id), + parent_snapshot_id_(parent_snapshot_id), + sequence_number_(sequence_number), + timestamp_ms_(timestamp_ms), + manifest_list_(std::move(manifest_list)), + summary_(std::move(summary)), + schema_id_(schema_id) {} + +int64_t Snapshot::snapshot_id() const { return snapshot_id_; } + +std::optional Snapshot::parent_snapshot_id() const { + return parent_snapshot_id_; +} + +int64_t Snapshot::sequence_number() const { return sequence_number_; } + +int64_t Snapshot::timestamp_ms() const { return timestamp_ms_; } + +const std::string& Snapshot::manifest_list() const { return manifest_list_; } + +const std::shared_ptr& Snapshot::summary() const { return summary_; } + +std::optional Snapshot::schema_id() const { return schema_id_; } + +std::string Snapshot::ToString() const { + std::string repr = "snapshot: { id: " + std::to_string(snapshot_id_); + if (parent_snapshot_id_.has_value()) { + repr += ", parent_id: " + std::to_string(parent_snapshot_id_.value()); + } + repr += ", sequence_number: " + std::to_string(sequence_number_); + repr += ", timestamp_ms: " + std::to_string(timestamp_ms_); + repr += ", manifest_list: " + manifest_list_; + repr += ", summary: " + summary_->ToString(); + + if (schema_id_.has_value()) { + repr += ", schema_id: " + std::to_string(schema_id_.value()); + } + + repr += " }"; + + return repr; +} + +bool Snapshot::Equals(const Snapshot& other) const { + return snapshot_id_ == other.snapshot_id_ && + parent_snapshot_id_ == other.parent_snapshot_id_ && + sequence_number_ == other.sequence_number_ && + timestamp_ms_ == other.timestamp_ms_ && manifest_list_ == other.manifest_list_ && + *summary_ == *other.summary_ && schema_id_ == other.schema_id_; +} + +} // namespace iceberg diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h new file mode 100644 index 000000000..c0e21383b --- /dev/null +++ b/src/iceberg/snapshot.h @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/util/formattable.h" + +namespace iceberg { + +/// Optional Snapshot Summary Fields +/// Metrics +/// See https://iceberg.apache.org/spec/#metrics + +/// \brief Number of data files added in the snapshot +constexpr std::string_view kAddedDataFilesKey = "added-data-files"; +/// \brief Number of data files deleted in the snapshot +constexpr std::string_view kDeletedDataFilesKey = "deleted-data-files"; +/// \brief Total number of live data files in the snapshot +constexpr std::string_view kTotalDataFilesKey = "total-data-files"; +/// \brief Number of positional/equality delete files and deletion vectors added in the +/// snapshot +constexpr std::string_view kAddedDeleteFilesKey = "added-delete-files"; +/// \brief Number of equality delete files added in the snapshot +constexpr std::string_view kAddedEqDeleteFilesKey = "added-equality-delete-files"; +/// \brief Number of equality delete files removed in the snapshot +constexpr std::string_view kRemovedEqDeleteFilesKey = "removed-equality-delete-files"; +/// \brief Number of position delete files added in the snapshot +constexpr std::string_view kAddedPosDeleteFilesKey = "added-position-delete-files"; +/// \brief Number of position delete files removed in the snapshot +constexpr std::string_view kRemovedPosDeleteFilesKey = "removed-position-delete-files"; +/// \brief Number of deletion vectors added in the snapshot +constexpr std::string_view kAddedDVSKey = "added-dvs"; +/// \brief Number of deletion vectors removed in the snapshot +constexpr std::string_view kRemovedDVSKey = "removed-dvs"; +/// \brief Number of positional/equality delete files and deletion vectors removed in the +/// snapshot +constexpr std::string_view kRemovedDeleteFilesKey = "removed-delete-files"; +/// \brief Total number of live positional/equality delete files and deletion vectors in +/// the snapshot +constexpr std::string_view kTotalDeleteFilesKey = "total-delete-files"; +/// \brief Number of records added in the snapshot +constexpr std::string_view kAddedRecordsKey = "added-records"; +/// \brief Number of records deleted in the snapshot +constexpr std::string_view kDeletedRecordsKey = "deleted-records"; +/// \brief Total number of records in the snapshot +constexpr std::string_view kTotalRecordsKey = "total-records"; +/// \brief The size of files added in the snapshot +constexpr std::string_view kAddedFileSizeKey = "added-files-size"; +/// \brief The size of files removed in the snapshot +constexpr std::string_view kRemovedFileSizeKey = "removed-files-size"; +/// \brief Total size of live files in the snapshot +constexpr std::string_view kTotalFileSizeKey = "total-files-size"; +/// \brief Number of position delete records added in the snapshot +constexpr std::string_view kAddedPosDeletesKey = "added-position-deletes"; +/// \brief Number of position delete records removed in the snapshot +constexpr std::string_view kRemovedPosDeletesKey = "removed-position-deletes"; +/// \brief Total number of position delete records in the snapshot +constexpr std::string_view kTotalPosDeletesKey = "total-position-deletes"; +/// \brief Number of equality delete records added in the snapshot +constexpr std::string_view kAddedEqDeletesKey = "added-equality-deletes"; +/// \brief Number of equality delete records removed in the snapshot +constexpr std::string_view kRemovedEqDeletesKey = "removed-equality-deletes"; +/// \brief Total number of equality delete records in the snapshot +constexpr std::string_view kTotalEqDeletesKey = "total-equality-deletes"; +/// \brief Number of duplicate files deleted (duplicates are files recorded more than once +/// in the manifest) +constexpr std::string_view kDeletedDuplicatedFilesKey = "deleted-duplicate-files"; +/// \brief Number of partitions with files added or removed in the snapshot +constexpr std::string_view kChangedPartitionCountProp = "changed-partition-count"; + +/// Other Fields +/// See https://iceberg.apache.org/spec/#other-fields + +/// \brief The Write-Audit-Publish id of a staged snapshot +constexpr std::string_view kWAPIDKey = "wap.id"; +/// \brief The Write-Audit-Publish id of a snapshot already been published +constexpr std::string_view kPublishedWAPIDKey = "published-wap-id"; +/// \brief The original id of a cherry-picked snapshot +constexpr std::string_view kSourceSnapshotIDKey = "source-snapshot-id"; +/// \brief Name of the engine that created the snapshot +constexpr std::string_view kEngineNameKey = "engine-name"; +/// \brief Version of the engine that created the snapshot +constexpr std::string_view kEngineVersionKey = "engine-version"; + +/// \brief The operation field is used by some operations, like snapshot expiration, to +/// skip processing certain snapshots. +enum class Operation { + /// Only data files were added and no files were removed. + kAppend, + /// Data and delete files were added and removed without changing table data; i.e. + /// compaction, change the data file format, or relocating data files. + kReplace, + /// Data and delete files were added and removed in a logical overwrite operation. + kOverwrite, + /// Data files were removed and their contents logically deleted and/or delete files + /// were added to delete rows. + kDelete, +}; + +/// \brief Summarises the changes in the snapshot. +class ICEBERG_EXPORT Summary : public iceberg::util::Formattable { + public: + Summary() = default; + /// \brief Construct a summary with the given operation and properties. + Summary(Operation op, std::unordered_map props); + + /// \brief Get the operation type of the snapshot. + [[nodiscard]] Operation operation() const; + + /// \brief Get the additional properties of the snapshot. + [[nodiscard]] const std::unordered_map& properties() const; + + std::string ToString() const override; + + friend bool operator==(const Summary& lhs, const Summary& rhs) { + return lhs.Equals(rhs); + } + + friend bool operator!=(const Summary& lhs, const Summary& rhs) { return !(lhs == rhs); } + + private: + /// \brief Compare two Summaries for equality. + [[nodiscard]] bool Equals(const Summary& other) const; + + /// The type of operation in the snapshot + Operation operation_{Operation::kAppend}; + /// Other summary data. + std::unordered_map additional_properties_; +}; + +/// \brief A snapshot of the data in a table at a point in time. +/// +/// A snapshot consist of one or more file manifests, and the complete table contents is +/// the union of all the data files in those manifests. +/// +/// Snapshots are created by table operations. +class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { + public: + Snapshot(int64_t snapshot_id, std::optional parent_snapshot_id, + int64_t sequence_number, int64_t timestamp_ms, std::string manifest_list, + std::shared_ptr summary, std::optional schema_id); + + /// \brief Get the id of the snapshot. + [[nodiscard]] int64_t snapshot_id() const; + + /// \brief Get parent snapshot id. + [[nodiscard]] std::optional parent_snapshot_id() const; + + /// \brief Get the sequence number of the snapshot. + [[nodiscard]] int64_t sequence_number() const; + + /// \brief Get the timestamp of the snapshot. + [[nodiscard]] int64_t timestamp_ms() const; + + /// \brief Get the manifest list of the snapshot. + [[nodiscard]] const std::string& manifest_list() const; + + /// \brief Get the summary of the snapshot. + [[nodiscard]] const std::shared_ptr& summary() const; + + /// \brief Get the schema ID of the snapshot. + [[nodiscard]] std::optional schema_id() const; + + std::string ToString() const override; + + friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) { + return lhs.Equals(rhs); + } + + friend bool operator!=(const Snapshot& lhs, const Snapshot& rhs) { + return !(lhs == rhs); + } + + private: + /// \brief Compare two snapshots for equality. + [[nodiscard]] bool Equals(const Snapshot& other) const; + + /// A unqiue long ID. + int64_t snapshot_id_; + /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. + std::optional parent_snapshot_id_; + /// A monotonically increasing long that tracks the order of changes to a table. + int64_t sequence_number_; + /// A timestamp when the snapshot was created, used for garbage collection and table + /// inspection. + int64_t timestamp_ms_; + /// The location of a manifest list for this snapshot that tracks manifest files with + /// additional metadata. + std::string manifest_list_; + /// A string map that summaries the snapshot changes, including operation. + std::shared_ptr summary_; + /// ID of the table's current schema when the snapshot was created. + std::optional schema_id_; +}; + +} // namespace iceberg diff --git a/src/iceberg/snapshot_ref.h b/src/iceberg/snapshot_ref.h new file mode 100644 index 000000000..cecf9242a --- /dev/null +++ b/src/iceberg/snapshot_ref.h @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "iceberg/iceberg_export.h" + +namespace iceberg { + +enum class SnapshotRefType { + /// Branches are mutable named references that can be updated by committing a new + /// snapshot as the branch’s referenced snapshot using the Commit Conflict Resolution + /// and Retry procedures. + BRANCH, + /// Tags are labels for individual snapshots + TAG, +}; + +struct ICEBERG_EXPORT SnapshotRef { + /// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. + int64_t snapshot_id_; + /// Type of the reference, tag or branch + SnapshotRefType type_; + /// For branch type only, a positive number for the minimum number of snapshots to keep + /// in a branch while expiring snapshots. Defaults to table property + /// history.expire.min-snapshots-to-keep. + std::optional min_snapshots_to_keep_; + /// For branch type only, a positive number for the max age of snapshots to keep when + /// expiring, including the latest snapshot. Defaults to table property + /// history.expire.max-snapshot-age-ms. + std::optional max_snapshot_age_ms_; + /// For snapshot references except the main branch, a positive number for the max age of + /// the snapshot reference to keep while expiring snapshots. Defaults to table property + /// history.expire.max-ref-age-ms. The main branch never expires. + std::optional max_ref_age_ms_; +}; + +} // namespace iceberg diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 81028804b..4c41364d0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -34,7 +34,8 @@ target_sources(schema_test partition_field_test.cc partition_spec_test.cc sort_field_test.cc - sort_order_test.cc) + sort_order_test.cc + snapshot_test.cc) target_link_libraries(schema_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME schema_test COMMAND schema_test) diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc new file mode 100644 index 000000000..2fec1c680 --- /dev/null +++ b/test/snapshot_test.cc @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/snapshot.h" + +#include + +namespace iceberg { + +/// Test Summary +TEST(SummaryTest, DefaultConstruction) { + Summary default_summary; + EXPECT_EQ(default_summary.operation(), Operation::kAppend); + EXPECT_TRUE(default_summary.properties().empty()); +} + +TEST(SummaryTest, CustomConstruction) { + std::unordered_map properties = {{"key1", "value1"}, + {"key2", "value2"}}; + Summary custom_summary(Operation::kOverwrite, properties); + + EXPECT_EQ(custom_summary.operation(), Operation::kOverwrite); + EXPECT_EQ(custom_summary.properties().size(), 2); + EXPECT_EQ(custom_summary.properties().at("key1"), "value1"); + EXPECT_EQ(custom_summary.properties().at("key2"), "value2"); +} + +TEST(SummaryTest, ToStringRepresentation) { + std::unordered_map properties = {{"keyA", "valueA"}, + {"keyB", "valueB"}}; + Summary summary(Operation::kReplace, properties); + + std::string to_string_result = summary.ToString(); + EXPECT_NE(to_string_result.find("operation"), std::string::npos); + EXPECT_NE(to_string_result.find("replace"), std::string::npos); + EXPECT_NE(to_string_result.find("keyA"), std::string::npos); + EXPECT_NE(to_string_result.find("valueA"), std::string::npos); + EXPECT_NE(to_string_result.find("keyB"), std::string::npos); + EXPECT_NE(to_string_result.find("valueB"), std::string::npos); +} + +TEST(SummaryTest, EqualityComparison) { + std::unordered_map properties1 = {{"key1", "value1"}, + {"key2", "value2"}}; + std::unordered_map properties2 = {{"key1", "value1"}, + {"key2", "value2"}}; + std::unordered_map properties3 = {{"key3", "value3"}}; + + Summary summary1(Operation::kAppend, properties1); + Summary summary2(Operation::kAppend, properties2); + Summary summary3(Operation::kDelete, properties3); + + EXPECT_EQ(summary1, summary2); + EXPECT_NE(summary1, summary3); +} + +/// Test Snapshot +TEST(SnapshotTest, ConstructionAndFieldAccess) { + auto summary = std::make_shared( + Operation::kAppend, std::unordered_map{}); + + Snapshot snapshot(12345, 54321, 1, 1615569200000, "s3://example/manifest_list.avro", + summary, 10); + + EXPECT_EQ(snapshot.snapshot_id(), 12345); + EXPECT_EQ(snapshot.parent_snapshot_id().value(), 54321); + EXPECT_EQ(snapshot.sequence_number(), 1); + EXPECT_EQ(snapshot.timestamp_ms(), 1615569200000); + EXPECT_EQ(snapshot.manifest_list(), "s3://example/manifest_list.avro"); + EXPECT_EQ(snapshot.summary()->operation(), Operation::kAppend); + EXPECT_EQ(snapshot.schema_id().value(), 10); +} + +TEST(SnapshotTest, ToStringRepresentation) { + auto summary = std::make_shared( + Operation::kDelete, std::unordered_map{ + {std::string(kDeletedDataFilesKey), "100"}}); + + Snapshot snapshot(67890, {}, 42, 1625569200000, + "s3://example/another_manifest_list.avro", summary, {}); + + std::string to_string_result = snapshot.ToString(); + + EXPECT_NE(to_string_result.find("67890"), std::string::npos); + EXPECT_NE(to_string_result.find("sequence_number: 42"), std::string::npos); + EXPECT_NE( + to_string_result.find("manifest_list: s3://example/another_manifest_list.avro"), + std::string::npos); + EXPECT_NE(to_string_result.find(kDeletedDataFilesKey), std::string::npos); +} + +TEST(SnapshotTest, EqualityComparison) { + auto summary1 = std::make_shared( + Operation::kAppend, std::unordered_map{ + {std::string(kAddedDataFilesKey), "101"}}); + auto summary2 = std::make_shared( + Operation::kAppend, std::unordered_map{ + {std::string(kAddedDataFilesKey), "101"}}); + auto summary3 = std::make_shared( + Operation::kDelete, std::unordered_map{ + {std::string(kDeletedDataFilesKey), "20"}}); + + Snapshot snapshot1(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary1, {}); + Snapshot snapshot2(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary2, {}); + Snapshot snapshot3(67890, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary3, {}); + + EXPECT_EQ(snapshot1, snapshot2); + EXPECT_NE(snapshot1, snapshot3); +} + +} // namespace iceberg From e6026f4e8d523edba2fdf3e4270eccda25f319b2 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 8 Apr 2025 22:31:18 +0800 Subject: [PATCH 2/6] changes to the review comments --- src/iceberg/snapshot.cc | 55 +++++----- src/iceberg/snapshot.h | 214 ++++++++++++++++++------------------- src/iceberg/snapshot_ref.h | 16 +-- test/snapshot_test.cc | 73 ++++++------- 4 files changed, 171 insertions(+), 187 deletions(-) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index fdd2a2c21..033a66768 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -19,21 +19,23 @@ #include "iceberg/snapshot.h" +#include + #include "iceberg/util/formatter.h" namespace iceberg { namespace { /// \brief Get the relative Operation name -constexpr std::string_view ToString(Operation operation) { +constexpr std::string_view ToString(Summary::Operation operation) { switch (operation) { - case Operation::kAppend: + case Summary::Operation::kAppend: return "append"; - case Operation::kOverwrite: + case Summary::Operation::kOverwrite: return "overwrite"; - case Operation::kReplace: + case Summary::Operation::kReplace: return "replace"; - case Operation::kDelete: + case Summary::Operation::kDelete: return "delete"; default: return "invalid"; @@ -44,30 +46,24 @@ constexpr std::string_view ToString(Operation operation) { Summary::Summary(Operation op, std::unordered_map props) : operation_(op), additional_properties_(std::move(props)) {} -Operation Summary::operation() const { return operation_; } +Summary::Operation Summary::operation() const { return operation_; } const std::unordered_map& Summary::properties() const { return additional_properties_; } std::string Summary::ToString() const { - std::string repr = - "summary: { operation: " + std::string(iceberg::ToString(operation_)); + std::string repr = std::format("summary parent_snapshot_id, int64_t sequence_number, int64_t timestamp_ms, - std::string manifest_list, std::shared_ptr summary, + std::string manifest_list, Summary summary, std::optional schema_id) : snapshot_id_(snapshot_id), parent_snapshot_id_(parent_snapshot_id), @@ -89,35 +85,38 @@ int64_t Snapshot::timestamp_ms() const { return timestamp_ms_; } const std::string& Snapshot::manifest_list() const { return manifest_list_; } -const std::shared_ptr& Snapshot::summary() const { return summary_; } +const Summary& Snapshot::summary() const { return summary_; } std::optional Snapshot::schema_id() const { return schema_id_; } std::string Snapshot::ToString() const { - std::string repr = "snapshot: { id: " + std::to_string(snapshot_id_); + std::string repr; + std::format_to(std::back_inserter(repr), "snapshot<\n id: {}\n", snapshot_id_); if (parent_snapshot_id_.has_value()) { - repr += ", parent_id: " + std::to_string(parent_snapshot_id_.value()); + std::format_to(std::back_inserter(repr), " parent_id: {}\n", + parent_snapshot_id_.value()); } - repr += ", sequence_number: " + std::to_string(sequence_number_); - repr += ", timestamp_ms: " + std::to_string(timestamp_ms_); - repr += ", manifest_list: " + manifest_list_; - repr += ", summary: " + summary_->ToString(); + std::format_to(std::back_inserter(repr), " sequence_number: {}\n", sequence_number_); + std::format_to(std::back_inserter(repr), " timestamp_ms: {}\n", timestamp_ms_); + std::format_to(std::back_inserter(repr), " manifest_list: {}\n", manifest_list_); + std::format_to(std::back_inserter(repr), " summary: {}\n", summary_); if (schema_id_.has_value()) { - repr += ", schema_id: " + std::to_string(schema_id_.value()); + std::format_to(std::back_inserter(repr), " schema_id: {}\n", schema_id_.value()); } - repr += " }"; - + repr += ">"; return repr; } bool Snapshot::Equals(const Snapshot& other) const { + if (this == &other) { + return true; + } return snapshot_id_ == other.snapshot_id_ && parent_snapshot_id_ == other.parent_snapshot_id_ && sequence_number_ == other.sequence_number_ && - timestamp_ms_ == other.timestamp_ms_ && manifest_list_ == other.manifest_list_ && - *summary_ == *other.summary_ && schema_id_ == other.schema_id_; + timestamp_ms_ == other.timestamp_ms_ && schema_id_ == other.schema_id_; } } // namespace iceberg diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index c0e21383b..87f24fc15 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -28,121 +28,117 @@ namespace iceberg { -/// Optional Snapshot Summary Fields -/// Metrics -/// See https://iceberg.apache.org/spec/#metrics - -/// \brief Number of data files added in the snapshot -constexpr std::string_view kAddedDataFilesKey = "added-data-files"; -/// \brief Number of data files deleted in the snapshot -constexpr std::string_view kDeletedDataFilesKey = "deleted-data-files"; -/// \brief Total number of live data files in the snapshot -constexpr std::string_view kTotalDataFilesKey = "total-data-files"; -/// \brief Number of positional/equality delete files and deletion vectors added in the -/// snapshot -constexpr std::string_view kAddedDeleteFilesKey = "added-delete-files"; -/// \brief Number of equality delete files added in the snapshot -constexpr std::string_view kAddedEqDeleteFilesKey = "added-equality-delete-files"; -/// \brief Number of equality delete files removed in the snapshot -constexpr std::string_view kRemovedEqDeleteFilesKey = "removed-equality-delete-files"; -/// \brief Number of position delete files added in the snapshot -constexpr std::string_view kAddedPosDeleteFilesKey = "added-position-delete-files"; -/// \brief Number of position delete files removed in the snapshot -constexpr std::string_view kRemovedPosDeleteFilesKey = "removed-position-delete-files"; -/// \brief Number of deletion vectors added in the snapshot -constexpr std::string_view kAddedDVSKey = "added-dvs"; -/// \brief Number of deletion vectors removed in the snapshot -constexpr std::string_view kRemovedDVSKey = "removed-dvs"; -/// \brief Number of positional/equality delete files and deletion vectors removed in the -/// snapshot -constexpr std::string_view kRemovedDeleteFilesKey = "removed-delete-files"; -/// \brief Total number of live positional/equality delete files and deletion vectors in -/// the snapshot -constexpr std::string_view kTotalDeleteFilesKey = "total-delete-files"; -/// \brief Number of records added in the snapshot -constexpr std::string_view kAddedRecordsKey = "added-records"; -/// \brief Number of records deleted in the snapshot -constexpr std::string_view kDeletedRecordsKey = "deleted-records"; -/// \brief Total number of records in the snapshot -constexpr std::string_view kTotalRecordsKey = "total-records"; -/// \brief The size of files added in the snapshot -constexpr std::string_view kAddedFileSizeKey = "added-files-size"; -/// \brief The size of files removed in the snapshot -constexpr std::string_view kRemovedFileSizeKey = "removed-files-size"; -/// \brief Total size of live files in the snapshot -constexpr std::string_view kTotalFileSizeKey = "total-files-size"; -/// \brief Number of position delete records added in the snapshot -constexpr std::string_view kAddedPosDeletesKey = "added-position-deletes"; -/// \brief Number of position delete records removed in the snapshot -constexpr std::string_view kRemovedPosDeletesKey = "removed-position-deletes"; -/// \brief Total number of position delete records in the snapshot -constexpr std::string_view kTotalPosDeletesKey = "total-position-deletes"; -/// \brief Number of equality delete records added in the snapshot -constexpr std::string_view kAddedEqDeletesKey = "added-equality-deletes"; -/// \brief Number of equality delete records removed in the snapshot -constexpr std::string_view kRemovedEqDeletesKey = "removed-equality-deletes"; -/// \brief Total number of equality delete records in the snapshot -constexpr std::string_view kTotalEqDeletesKey = "total-equality-deletes"; -/// \brief Number of duplicate files deleted (duplicates are files recorded more than once -/// in the manifest) -constexpr std::string_view kDeletedDuplicatedFilesKey = "deleted-duplicate-files"; -/// \brief Number of partitions with files added or removed in the snapshot -constexpr std::string_view kChangedPartitionCountProp = "changed-partition-count"; - -/// Other Fields -/// See https://iceberg.apache.org/spec/#other-fields - -/// \brief The Write-Audit-Publish id of a staged snapshot -constexpr std::string_view kWAPIDKey = "wap.id"; -/// \brief The Write-Audit-Publish id of a snapshot already been published -constexpr std::string_view kPublishedWAPIDKey = "published-wap-id"; -/// \brief The original id of a cherry-picked snapshot -constexpr std::string_view kSourceSnapshotIDKey = "source-snapshot-id"; -/// \brief Name of the engine that created the snapshot -constexpr std::string_view kEngineNameKey = "engine-name"; -/// \brief Version of the engine that created the snapshot -constexpr std::string_view kEngineVersionKey = "engine-version"; - -/// \brief The operation field is used by some operations, like snapshot expiration, to -/// skip processing certain snapshots. -enum class Operation { - /// Only data files were added and no files were removed. - kAppend, - /// Data and delete files were added and removed without changing table data; i.e. - /// compaction, change the data file format, or relocating data files. - kReplace, - /// Data and delete files were added and removed in a logical overwrite operation. - kOverwrite, - /// Data files were removed and their contents logically deleted and/or delete files - /// were added to delete rows. - kDelete, +/// \brief Optional Snapshot Summary Fields +struct SnapshotSummaryFields { + /// \brief The operation field key + constexpr static std::string_view kOperation = "operation"; + + /// Metrics, see https://iceberg.apache.org/spec/#metrics + + /// \brief Number of data files added in the snapshot + constexpr static std::string_view kAddedDataFiles = "added-data-files"; + /// \brief Number of data files deleted in the snapshot + constexpr static std::string_view kDeletedDataFiles = "deleted-data-files"; + /// \brief Total number of live data files in the snapshot + constexpr static std::string_view kTotalDataFiles = "total-data-files"; + /// \brief Number of positional/equality delete files and deletion vectors added in the + /// snapshot + constexpr static std::string_view kAddedDeleteFiles = "added-delete-files"; + /// \brief Number of equality delete files added in the snapshot + constexpr static std::string_view kAddedEqDeleteFiles = "added-equality-delete-files"; + /// \brief Number of equality delete files removed in the snapshot + constexpr static std::string_view kRemovedEqDeleteFiles = + "removed-equality-delete-files"; + /// \brief Number of position delete files added in the snapshot + constexpr static std::string_view kAddedPosDeleteFiles = "added-position-delete-files"; + /// \brief Number of position delete files removed in the snapshot + constexpr static std::string_view kRemovedPosDeleteFiles = + "removed-position-delete-files"; + /// \brief Number of deletion vectors added in the snapshot + constexpr static std::string_view kAddedDVS = "added-dvs"; + /// \brief Number of deletion vectors removed in the snapshot + constexpr static std::string_view kRemovedDVS = "removed-dvs"; + /// \brief Number of positional/equality delete files and deletion vectors removed in + /// the snapshot + constexpr static std::string_view kRemovedDeleteFiles = "removed-delete-files"; + /// \brief Total number of live positional/equality delete files and deletion vectors in + /// the snapshot + constexpr static std::string_view kTotalDeleteFiles = "total-delete-files"; + /// \brief Number of records added in the snapshot + constexpr static std::string_view kAddedRecords = "added-records"; + /// \brief Number of records deleted in the snapshot + constexpr static std::string_view kDeletedRecords = "deleted-records"; + /// \brief Total number of records in the snapshot + constexpr static std::string_view kTotalRecords = "total-records"; + /// \brief The size of files added in the snapshot + constexpr static std::string_view kAddedFileSize = "added-files-size"; + /// \brief The size of files removed in the snapshot + constexpr static std::string_view kRemovedFileSize = "removed-files-size"; + /// \brief Total size of live files in the snapshot + constexpr static std::string_view kTotalFileSize = "total-files-size"; + /// \brief Number of position delete records added in the snapshot + constexpr static std::string_view kAddedPosDeletes = "added-position-deletes"; + /// \brief Number of position delete records removed in the snapshot + constexpr static std::string_view kRemovedPosDeletes = "removed-position-deletes"; + /// \brief Total number of position delete records in the snapshot + constexpr static std::string_view kTotalPosDeletes = "total-position-deletes"; + /// \brief Number of equality delete records added in the snapshot + constexpr static std::string_view kAddedEqDeletes = "added-equality-deletes"; + /// \brief Number of equality delete records removed in the snapshot + constexpr static std::string_view kRemovedEqDeletes = "removed-equality-deletes"; + /// \brief Total number of equality delete records in the snapshot + constexpr static std::string_view kTotalEqDeletes = "total-equality-deletes"; + /// \brief Number of duplicate files deleted (duplicates are files recorded more than + /// once in the manifest) + constexpr static std::string_view kDeletedDuplicatedFiles = "deleted-duplicate-files"; + /// \brief Number of partitions with files added or removed in the snapshot + constexpr static std::string_view kChangedPartitionCountProp = + "changed-partition-count"; + + /// Other Fields, see https://iceberg.apache.org/spec/#other-fields + + /// \brief The Write-Audit-Publish id of a staged snapshot + constexpr static std::string_view kWAPID = "wap.id"; + /// \brief The Write-Audit-Publish id of a snapshot already been published + constexpr static std::string_view kPublishedWAPID = "published-wap-id"; + /// \brief The original id of a cherry-picked snapshot + constexpr static std::string_view kSourceSnapshotID = "source-snapshot-id"; + /// \brief Name of the engine that created the snapshot + constexpr static std::string_view kEngineName = "engine-name"; + /// \brief Version of the engine that created the snapshot + constexpr static std::string_view kEngineVersion = "engine-version"; }; /// \brief Summarises the changes in the snapshot. class ICEBERG_EXPORT Summary : public iceberg::util::Formattable { public: + /// \brief The operation field is used by some operations, like snapshot expiration, to + /// skip processing certain snapshots. + enum class Operation { + /// Only data files were added and no files were removed. + kAppend, + /// Data and delete files were added and removed without changing table data; i.e. + /// compaction, change the data file format, or relocating data files. + kReplace, + /// Data and delete files were added and removed in a logical overwrite operation. + kOverwrite, + /// Data files were removed and their contents logically deleted and/or delete files + /// were added to delete rows. + kDelete, + }; Summary() = default; /// \brief Construct a summary with the given operation and properties. Summary(Operation op, std::unordered_map props); /// \brief Get the operation type of the snapshot. - [[nodiscard]] Operation operation() const; + Operation operation() const; /// \brief Get the additional properties of the snapshot. - [[nodiscard]] const std::unordered_map& properties() const; + const std::unordered_map& properties() const; std::string ToString() const override; - friend bool operator==(const Summary& lhs, const Summary& rhs) { - return lhs.Equals(rhs); - } - - friend bool operator!=(const Summary& lhs, const Summary& rhs) { return !(lhs == rhs); } - private: - /// \brief Compare two Summaries for equality. - [[nodiscard]] bool Equals(const Summary& other) const; - /// The type of operation in the snapshot Operation operation_{Operation::kAppend}; /// Other summary data. @@ -159,28 +155,28 @@ class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { public: Snapshot(int64_t snapshot_id, std::optional parent_snapshot_id, int64_t sequence_number, int64_t timestamp_ms, std::string manifest_list, - std::shared_ptr summary, std::optional schema_id); + Summary summary, std::optional schema_id); /// \brief Get the id of the snapshot. - [[nodiscard]] int64_t snapshot_id() const; + int64_t snapshot_id() const; /// \brief Get parent snapshot id. - [[nodiscard]] std::optional parent_snapshot_id() const; + std::optional parent_snapshot_id() const; /// \brief Get the sequence number of the snapshot. - [[nodiscard]] int64_t sequence_number() const; + int64_t sequence_number() const; /// \brief Get the timestamp of the snapshot. - [[nodiscard]] int64_t timestamp_ms() const; + int64_t timestamp_ms() const; /// \brief Get the manifest list of the snapshot. - [[nodiscard]] const std::string& manifest_list() const; + const std::string& manifest_list() const; /// \brief Get the summary of the snapshot. - [[nodiscard]] const std::shared_ptr& summary() const; + const Summary& summary() const; /// \brief Get the schema ID of the snapshot. - [[nodiscard]] std::optional schema_id() const; + std::optional schema_id() const; std::string ToString() const override; @@ -194,7 +190,7 @@ class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { private: /// \brief Compare two snapshots for equality. - [[nodiscard]] bool Equals(const Snapshot& other) const; + bool Equals(const Snapshot& other) const; /// A unqiue long ID. int64_t snapshot_id_; @@ -209,7 +205,7 @@ class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { /// additional metadata. std::string manifest_list_; /// A string map that summaries the snapshot changes, including operation. - std::shared_ptr summary_; + Summary summary_; /// ID of the table's current schema when the snapshot was created. std::optional schema_id_; }; diff --git a/src/iceberg/snapshot_ref.h b/src/iceberg/snapshot_ref.h index cecf9242a..c6b0add57 100644 --- a/src/iceberg/snapshot_ref.h +++ b/src/iceberg/snapshot_ref.h @@ -26,32 +26,34 @@ namespace iceberg { +/// \brief The type of snapshot reference enum class SnapshotRefType { /// Branches are mutable named references that can be updated by committing a new /// snapshot as the branch’s referenced snapshot using the Commit Conflict Resolution /// and Retry procedures. - BRANCH, + kBranch, /// Tags are labels for individual snapshots - TAG, + kTag, }; +/// \brief A reference to a snapshot, either a branch or a tag. struct ICEBERG_EXPORT SnapshotRef { /// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. - int64_t snapshot_id_; + int64_t snapshot_id; /// Type of the reference, tag or branch - SnapshotRefType type_; + SnapshotRefType type; /// For branch type only, a positive number for the minimum number of snapshots to keep /// in a branch while expiring snapshots. Defaults to table property /// history.expire.min-snapshots-to-keep. - std::optional min_snapshots_to_keep_; + std::optional min_snapshots_to_keep; /// For branch type only, a positive number for the max age of snapshots to keep when /// expiring, including the latest snapshot. Defaults to table property /// history.expire.max-snapshot-age-ms. - std::optional max_snapshot_age_ms_; + std::optional max_snapshot_age_ms; /// For snapshot references except the main branch, a positive number for the max age of /// the snapshot reference to keep while expiring snapshots. Defaults to table property /// history.expire.max-ref-age-ms. The main branch never expires. - std::optional max_ref_age_ms_; + std::optional max_ref_age_ms; }; } // namespace iceberg diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc index 2fec1c680..6d12984c5 100644 --- a/test/snapshot_test.cc +++ b/test/snapshot_test.cc @@ -26,54 +26,39 @@ namespace iceberg { /// Test Summary TEST(SummaryTest, DefaultConstruction) { Summary default_summary; - EXPECT_EQ(default_summary.operation(), Operation::kAppend); + EXPECT_EQ(default_summary.operation(), Summary::Operation::kAppend); EXPECT_TRUE(default_summary.properties().empty()); } TEST(SummaryTest, CustomConstruction) { - std::unordered_map properties = {{"key1", "value1"}, - {"key2", "value2"}}; - Summary custom_summary(Operation::kOverwrite, properties); + std::unordered_map properties = {{"1", "value1"}, + {"2", "value2"}}; + Summary custom_summary(Summary::Operation::kOverwrite, properties); - EXPECT_EQ(custom_summary.operation(), Operation::kOverwrite); + EXPECT_EQ(custom_summary.operation(), Summary::Operation::kOverwrite); EXPECT_EQ(custom_summary.properties().size(), 2); - EXPECT_EQ(custom_summary.properties().at("key1"), "value1"); - EXPECT_EQ(custom_summary.properties().at("key2"), "value2"); + EXPECT_EQ(custom_summary.properties().at("1"), "value1"); + EXPECT_EQ(custom_summary.properties().at("2"), "value2"); } TEST(SummaryTest, ToStringRepresentation) { - std::unordered_map properties = {{"keyA", "valueA"}, - {"keyB", "valueB"}}; - Summary summary(Operation::kReplace, properties); + std::unordered_map properties = {{"A", "valueA"}, + {"B", "valueB"}}; + Summary summary(Summary::Operation::kReplace, properties); std::string to_string_result = summary.ToString(); EXPECT_NE(to_string_result.find("operation"), std::string::npos); EXPECT_NE(to_string_result.find("replace"), std::string::npos); - EXPECT_NE(to_string_result.find("keyA"), std::string::npos); + EXPECT_NE(to_string_result.find("A"), std::string::npos); EXPECT_NE(to_string_result.find("valueA"), std::string::npos); - EXPECT_NE(to_string_result.find("keyB"), std::string::npos); + EXPECT_NE(to_string_result.find("B"), std::string::npos); EXPECT_NE(to_string_result.find("valueB"), std::string::npos); } -TEST(SummaryTest, EqualityComparison) { - std::unordered_map properties1 = {{"key1", "value1"}, - {"key2", "value2"}}; - std::unordered_map properties2 = {{"key1", "value1"}, - {"key2", "value2"}}; - std::unordered_map properties3 = {{"key3", "value3"}}; - - Summary summary1(Operation::kAppend, properties1); - Summary summary2(Operation::kAppend, properties2); - Summary summary3(Operation::kDelete, properties3); - - EXPECT_EQ(summary1, summary2); - EXPECT_NE(summary1, summary3); -} - /// Test Snapshot TEST(SnapshotTest, ConstructionAndFieldAccess) { - auto summary = std::make_shared( - Operation::kAppend, std::unordered_map{}); + Summary summary(Summary::Operation::kAppend, + std::unordered_map{}); Snapshot snapshot(12345, 54321, 1, 1615569200000, "s3://example/manifest_list.avro", summary, 10); @@ -83,14 +68,15 @@ TEST(SnapshotTest, ConstructionAndFieldAccess) { EXPECT_EQ(snapshot.sequence_number(), 1); EXPECT_EQ(snapshot.timestamp_ms(), 1615569200000); EXPECT_EQ(snapshot.manifest_list(), "s3://example/manifest_list.avro"); - EXPECT_EQ(snapshot.summary()->operation(), Operation::kAppend); + EXPECT_EQ(snapshot.summary().operation(), Summary::Operation::kAppend); EXPECT_EQ(snapshot.schema_id().value(), 10); } TEST(SnapshotTest, ToStringRepresentation) { - auto summary = std::make_shared( - Operation::kDelete, std::unordered_map{ - {std::string(kDeletedDataFilesKey), "100"}}); + auto summary = + Summary(Summary::Operation::kDelete, + std::unordered_map{ + {std::string(SnapshotSummaryFields::kDeletedDataFiles), "100"}}); Snapshot snapshot(67890, {}, 42, 1625569200000, "s3://example/another_manifest_list.avro", summary, {}); @@ -102,19 +88,20 @@ TEST(SnapshotTest, ToStringRepresentation) { EXPECT_NE( to_string_result.find("manifest_list: s3://example/another_manifest_list.avro"), std::string::npos); - EXPECT_NE(to_string_result.find(kDeletedDataFilesKey), std::string::npos); + EXPECT_NE(to_string_result.find(SnapshotSummaryFields::kDeletedDataFiles), + std::string::npos); } TEST(SnapshotTest, EqualityComparison) { - auto summary1 = std::make_shared( - Operation::kAppend, std::unordered_map{ - {std::string(kAddedDataFilesKey), "101"}}); - auto summary2 = std::make_shared( - Operation::kAppend, std::unordered_map{ - {std::string(kAddedDataFilesKey), "101"}}); - auto summary3 = std::make_shared( - Operation::kDelete, std::unordered_map{ - {std::string(kDeletedDataFilesKey), "20"}}); + Summary summary1(Summary::Operation::kAppend, + std::unordered_map{ + {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}); + Summary summary2(Summary::Operation::kAppend, + std::unordered_map{ + {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}); + Summary summary3(Summary::Operation::kDelete, + std::unordered_map{ + {std::string(SnapshotSummaryFields::kDeletedDataFiles), "20"}}); Snapshot snapshot1(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", summary1, {}); From c14a45a34e6402e4302bff73aff8c04f591e31f4 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Wed, 9 Apr 2025 22:40:33 +0800 Subject: [PATCH 3/6] resolve more review comments --- src/iceberg/snapshot.cc | 98 +++++----------------- src/iceberg/snapshot.h | 167 ++++++++++++++++++++----------------- src/iceberg/snapshot_ref.h | 59 ------------- test/snapshot_test.cc | 127 +++++++++++----------------- 4 files changed, 157 insertions(+), 294 deletions(-) delete mode 100644 src/iceberg/snapshot_ref.h diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index 033a66768..916271bb2 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -25,98 +25,38 @@ namespace iceberg { -namespace { -/// \brief Get the relative Operation name -constexpr std::string_view ToString(Summary::Operation operation) { - switch (operation) { - case Summary::Operation::kAppend: - return "append"; - case Summary::Operation::kOverwrite: - return "overwrite"; - case Summary::Operation::kReplace: - return "replace"; - case Summary::Operation::kDelete: - return "delete"; - default: - return "invalid"; +std::optional Snapshot::operation() const { + auto it = summary.find(std::string(SnapshotSummaryFields::kOperation)); + if (it != summary.end()) { + return it->second; } + return std::nullopt; } -} // namespace -Summary::Summary(Operation op, std::unordered_map props) - : operation_(op), additional_properties_(std::move(props)) {} - -Summary::Operation Summary::operation() const { return operation_; } - -const std::unordered_map& Summary::properties() const { - return additional_properties_; -} - -std::string Summary::ToString() const { - std::string repr = std::format("summary> +Snapshot::ManifestList() const { + if (std::holds_alternative(manifest_list)) { + return std::cref(std::get(manifest_list)); } - repr += ">"; - return repr; -} - -Snapshot::Snapshot(int64_t snapshot_id, std::optional parent_snapshot_id, - int64_t sequence_number, int64_t timestamp_ms, - std::string manifest_list, Summary summary, - std::optional schema_id) - : snapshot_id_(snapshot_id), - parent_snapshot_id_(parent_snapshot_id), - sequence_number_(sequence_number), - timestamp_ms_(timestamp_ms), - manifest_list_(std::move(manifest_list)), - summary_(std::move(summary)), - schema_id_(schema_id) {} - -int64_t Snapshot::snapshot_id() const { return snapshot_id_; } - -std::optional Snapshot::parent_snapshot_id() const { - return parent_snapshot_id_; + return std::nullopt; } -int64_t Snapshot::sequence_number() const { return sequence_number_; } - -int64_t Snapshot::timestamp_ms() const { return timestamp_ms_; } - -const std::string& Snapshot::manifest_list() const { return manifest_list_; } - -const Summary& Snapshot::summary() const { return summary_; } - -std::optional Snapshot::schema_id() const { return schema_id_; } - -std::string Snapshot::ToString() const { - std::string repr; - std::format_to(std::back_inserter(repr), "snapshot<\n id: {}\n", snapshot_id_); - if (parent_snapshot_id_.has_value()) { - std::format_to(std::back_inserter(repr), " parent_id: {}\n", - parent_snapshot_id_.value()); - } - std::format_to(std::back_inserter(repr), " sequence_number: {}\n", sequence_number_); - std::format_to(std::back_inserter(repr), " timestamp_ms: {}\n", timestamp_ms_); - std::format_to(std::back_inserter(repr), " manifest_list: {}\n", manifest_list_); - std::format_to(std::back_inserter(repr), " summary: {}\n", summary_); - - if (schema_id_.has_value()) { - std::format_to(std::back_inserter(repr), " schema_id: {}\n", schema_id_.value()); +std::optional> Snapshot::Manifests() + const { + if (std::holds_alternative(manifest_list)) { + return std::cref(std::get(manifest_list)); } - - repr += ">"; - return repr; + return std::nullopt; } bool Snapshot::Equals(const Snapshot& other) const { if (this == &other) { return true; } - return snapshot_id_ == other.snapshot_id_ && - parent_snapshot_id_ == other.parent_snapshot_id_ && - sequence_number_ == other.sequence_number_ && - timestamp_ms_ == other.timestamp_ms_ && schema_id_ == other.schema_id_; + return snapshot_id == other.snapshot_id && + parent_snapshot_id == other.parent_snapshot_id && + sequence_number == other.sequence_number && timestamp_ms == other.timestamp_ms && + schema_id == other.schema_id; } } // namespace iceberg diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 87f24fc15..1041644ff 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -22,12 +22,44 @@ #include #include #include +#include +#include #include "iceberg/iceberg_export.h" #include "iceberg/util/formattable.h" namespace iceberg { +/// \brief The type of snapshot reference +enum class SnapshotRefType { + /// Branches are mutable named references that can be updated by committing a new + /// snapshot as the branch’s referenced snapshot using the Commit Conflict Resolution + /// and Retry procedures. + kBranch, + /// Tags are labels for individual snapshots + kTag, +}; + +/// \brief A reference to a snapshot, either a branch or a tag. +struct ICEBERG_EXPORT SnapshotRef { + /// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. + int64_t snapshot_id; + /// Type of the reference, tag or branch + SnapshotRefType type; + /// For branch type only, a positive number for the minimum number of snapshots to keep + /// in a branch while expiring snapshots. Defaults to table property + /// history.expire.min-snapshots-to-keep. + std::optional min_snapshots_to_keep; + /// For branch type only, a positive number for the max age of snapshots to keep when + /// expiring, including the latest snapshot. Defaults to table property + /// history.expire.max-snapshot-age-ms. + std::optional max_snapshot_age_ms; + /// For snapshot references except the main branch, a positive number for the max age of + /// the snapshot reference to keep while expiring snapshots. Defaults to table property + /// history.expire.max-ref-age-ms. The main branch never expires. + std::optional max_ref_age_ms; +}; + /// \brief Optional Snapshot Summary Fields struct SnapshotSummaryFields { /// \brief The operation field key @@ -109,40 +141,23 @@ struct SnapshotSummaryFields { constexpr static std::string_view kEngineVersion = "engine-version"; }; -/// \brief Summarises the changes in the snapshot. -class ICEBERG_EXPORT Summary : public iceberg::util::Formattable { - public: - /// \brief The operation field is used by some operations, like snapshot expiration, to - /// skip processing certain snapshots. - enum class Operation { - /// Only data files were added and no files were removed. - kAppend, - /// Data and delete files were added and removed without changing table data; i.e. - /// compaction, change the data file format, or relocating data files. - kReplace, - /// Data and delete files were added and removed in a logical overwrite operation. - kOverwrite, - /// Data files were removed and their contents logically deleted and/or delete files - /// were added to delete rows. - kDelete, - }; - Summary() = default; - /// \brief Construct a summary with the given operation and properties. - Summary(Operation op, std::unordered_map props); - - /// \brief Get the operation type of the snapshot. - Operation operation() const; - - /// \brief Get the additional properties of the snapshot. - const std::unordered_map& properties() const; - - std::string ToString() const override; - - private: - /// The type of operation in the snapshot - Operation operation_{Operation::kAppend}; - /// Other summary data. - std::unordered_map additional_properties_; +/// \brief Data operation that produce snapshots. +/// +/// A snapshot can return the operation that created the snapshot to help other components +/// ignore snapshots that are not needed for some tasks. For example, snapshot expiration +/// does not need to clean up deleted files for appends, which have no deleted files. +struct ICEBERG_EXPORT DataOperation { + /// \brief Only data files were added and no files were removed. + static constexpr std::string_view kAppend = "append"; + /// \brief Data and delete files were added and removed without changing table data; + /// i.e. compaction, change the data file format, or relocating data files. + static constexpr std::string_view kReplace = "replace"; + /// \brief Data and delete files were added and removed in a logical overwrite + /// operation. + static constexpr std::string_view kOverwrite = "overwrite"; + /// \brief Data files were removed and their contents logically deleted and/or delete + /// files were added to delete rows. + static constexpr std::string_view kDelete = "delete"; }; /// \brief A snapshot of the data in a table at a point in time. @@ -151,39 +166,52 @@ class ICEBERG_EXPORT Summary : public iceberg::util::Formattable { /// the union of all the data files in those manifests. /// /// Snapshots are created by table operations. -class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { - public: - Snapshot(int64_t snapshot_id, std::optional parent_snapshot_id, - int64_t sequence_number, int64_t timestamp_ms, std::string manifest_list, - Summary summary, std::optional schema_id); - - /// \brief Get the id of the snapshot. - int64_t snapshot_id() const; - - /// \brief Get parent snapshot id. - std::optional parent_snapshot_id() const; +struct ICEBERG_EXPORT Snapshot { + using manifest_list_t = std::string; + using manifests_t = std::vector; - /// \brief Get the sequence number of the snapshot. - int64_t sequence_number() const; - - /// \brief Get the timestamp of the snapshot. - int64_t timestamp_ms() const; - - /// \brief Get the manifest list of the snapshot. - const std::string& manifest_list() const; - - /// \brief Get the summary of the snapshot. - const Summary& summary() const; - - /// \brief Get the schema ID of the snapshot. - std::optional schema_id() const; - - std::string ToString() const override; + /// A unqiue long ID. + int64_t snapshot_id; + /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. + std::optional parent_snapshot_id; + /// A monotonically increasing long that tracks the order of changes to a table. + int64_t sequence_number; + /// A timestamp when the snapshot was created, used for garbage collection and table + /// inspection. + int64_t timestamp_ms; + /// The location of a manifest list for this snapshot that tracks manifest files with + /// additional metadata. + std::variant manifest_list; + /// A string map that summaries the snapshot changes, including operation. + std::unordered_map summary; + /// ID of the table's current schema when the snapshot was created. + std::optional schema_id; + + /// \brief Return the name of the DataOperations data operation that produced this + /// snapshot. + /// + /// \return the operation that produced this snapshot, or nullopt if the operation is + /// unknown. + std::optional operation() const; + + /// \brief Get the manifest list for this snapshot. + /// + /// \return the manifest list for this snapshot, or nullopt if the snapshot has no + /// manifest list. + std::optional> ManifestList() const; + + /// \brief Get the manifests for this snapshot. + /// + /// \return the manifests for this snapshot, or nullopt if the snapshot has no + /// manifests. + std::optional> Manifests() const; + /// \brief Compare two snapshots for equality. friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) { return lhs.Equals(rhs); } + /// \brief Compare two snapshots for inequality. friend bool operator!=(const Snapshot& lhs, const Snapshot& rhs) { return !(lhs == rhs); } @@ -191,23 +219,6 @@ class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable { private: /// \brief Compare two snapshots for equality. bool Equals(const Snapshot& other) const; - - /// A unqiue long ID. - int64_t snapshot_id_; - /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. - std::optional parent_snapshot_id_; - /// A monotonically increasing long that tracks the order of changes to a table. - int64_t sequence_number_; - /// A timestamp when the snapshot was created, used for garbage collection and table - /// inspection. - int64_t timestamp_ms_; - /// The location of a manifest list for this snapshot that tracks manifest files with - /// additional metadata. - std::string manifest_list_; - /// A string map that summaries the snapshot changes, including operation. - Summary summary_; - /// ID of the table's current schema when the snapshot was created. - std::optional schema_id_; }; } // namespace iceberg diff --git a/src/iceberg/snapshot_ref.h b/src/iceberg/snapshot_ref.h deleted file mode 100644 index c6b0add57..000000000 --- a/src/iceberg/snapshot_ref.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include -#include - -#include "iceberg/iceberg_export.h" - -namespace iceberg { - -/// \brief The type of snapshot reference -enum class SnapshotRefType { - /// Branches are mutable named references that can be updated by committing a new - /// snapshot as the branch’s referenced snapshot using the Commit Conflict Resolution - /// and Retry procedures. - kBranch, - /// Tags are labels for individual snapshots - kTag, -}; - -/// \brief A reference to a snapshot, either a branch or a tag. -struct ICEBERG_EXPORT SnapshotRef { - /// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. - int64_t snapshot_id; - /// Type of the reference, tag or branch - SnapshotRefType type; - /// For branch type only, a positive number for the minimum number of snapshots to keep - /// in a branch while expiring snapshots. Defaults to table property - /// history.expire.min-snapshots-to-keep. - std::optional min_snapshots_to_keep; - /// For branch type only, a positive number for the max age of snapshots to keep when - /// expiring, including the latest snapshot. Defaults to table property - /// history.expire.max-snapshot-age-ms. - std::optional max_snapshot_age_ms; - /// For snapshot references except the main branch, a positive number for the max age of - /// the snapshot reference to keep while expiring snapshots. Defaults to table property - /// history.expire.max-ref-age-ms. The main branch never expires. - std::optional max_ref_age_ms; -}; - -} // namespace iceberg diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc index 6d12984c5..ab19a7b9b 100644 --- a/test/snapshot_test.cc +++ b/test/snapshot_test.cc @@ -23,90 +23,61 @@ namespace iceberg { -/// Test Summary -TEST(SummaryTest, DefaultConstruction) { - Summary default_summary; - EXPECT_EQ(default_summary.operation(), Summary::Operation::kAppend); - EXPECT_TRUE(default_summary.properties().empty()); +class SnapshotTest : public ::testing::Test { + protected: + void SetUp() override { + // Initialize some common test data + summary1 = {{std::string(SnapshotSummaryFields::kOperation), + std::string(DataOperation::kAppend)}, + {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}; + + summary2 = {{std::string(SnapshotSummaryFields::kOperation), + std::string(DataOperation::kAppend)}, + {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}; + + summary3 = {{std::string(SnapshotSummaryFields::kOperation), + std::string(DataOperation::kDelete)}, + {std::string(SnapshotSummaryFields::kDeletedDataFiles), "20"}}; + } + + std::unordered_map summary1; + std::unordered_map summary2; + std::unordered_map summary3; +}; + +TEST_F(SnapshotTest, ConstructionAndFieldAccess) { + // Test the constructor and field access + Snapshot snapshot{.snapshot_id = 12345, + .parent_snapshot_id = 54321, + .sequence_number = 1, + .timestamp_ms = 1615569200000, + .manifest_list = "s3://example/manifest_list.avro", + .summary = summary1, + .schema_id = 10}; + + EXPECT_EQ(snapshot.snapshot_id, 12345); + EXPECT_TRUE(snapshot.parent_snapshot_id.has_value()); + EXPECT_EQ(*snapshot.parent_snapshot_id, 54321); + EXPECT_EQ(snapshot.sequence_number, 1); + EXPECT_EQ(snapshot.timestamp_ms, 1615569200000); + EXPECT_EQ(snapshot.ManifestList()->get(), "s3://example/manifest_list.avro"); + EXPECT_EQ(snapshot.operation().value(), DataOperation::kAppend); + EXPECT_EQ(snapshot.summary.at(std::string(SnapshotSummaryFields::kAddedDataFiles)), + "101"); + EXPECT_EQ(snapshot.summary.at(std::string(SnapshotSummaryFields::kOperation)), + DataOperation::kAppend); + EXPECT_TRUE(snapshot.schema_id.has_value()); + EXPECT_EQ(snapshot.schema_id.value(), 10); } -TEST(SummaryTest, CustomConstruction) { - std::unordered_map properties = {{"1", "value1"}, - {"2", "value2"}}; - Summary custom_summary(Summary::Operation::kOverwrite, properties); - - EXPECT_EQ(custom_summary.operation(), Summary::Operation::kOverwrite); - EXPECT_EQ(custom_summary.properties().size(), 2); - EXPECT_EQ(custom_summary.properties().at("1"), "value1"); - EXPECT_EQ(custom_summary.properties().at("2"), "value2"); -} - -TEST(SummaryTest, ToStringRepresentation) { - std::unordered_map properties = {{"A", "valueA"}, - {"B", "valueB"}}; - Summary summary(Summary::Operation::kReplace, properties); - - std::string to_string_result = summary.ToString(); - EXPECT_NE(to_string_result.find("operation"), std::string::npos); - EXPECT_NE(to_string_result.find("replace"), std::string::npos); - EXPECT_NE(to_string_result.find("A"), std::string::npos); - EXPECT_NE(to_string_result.find("valueA"), std::string::npos); - EXPECT_NE(to_string_result.find("B"), std::string::npos); - EXPECT_NE(to_string_result.find("valueB"), std::string::npos); -} - -/// Test Snapshot -TEST(SnapshotTest, ConstructionAndFieldAccess) { - Summary summary(Summary::Operation::kAppend, - std::unordered_map{}); - - Snapshot snapshot(12345, 54321, 1, 1615569200000, "s3://example/manifest_list.avro", - summary, 10); - - EXPECT_EQ(snapshot.snapshot_id(), 12345); - EXPECT_EQ(snapshot.parent_snapshot_id().value(), 54321); - EXPECT_EQ(snapshot.sequence_number(), 1); - EXPECT_EQ(snapshot.timestamp_ms(), 1615569200000); - EXPECT_EQ(snapshot.manifest_list(), "s3://example/manifest_list.avro"); - EXPECT_EQ(snapshot.summary().operation(), Summary::Operation::kAppend); - EXPECT_EQ(snapshot.schema_id().value(), 10); -} - -TEST(SnapshotTest, ToStringRepresentation) { - auto summary = - Summary(Summary::Operation::kDelete, - std::unordered_map{ - {std::string(SnapshotSummaryFields::kDeletedDataFiles), "100"}}); - - Snapshot snapshot(67890, {}, 42, 1625569200000, - "s3://example/another_manifest_list.avro", summary, {}); - - std::string to_string_result = snapshot.ToString(); - - EXPECT_NE(to_string_result.find("67890"), std::string::npos); - EXPECT_NE(to_string_result.find("sequence_number: 42"), std::string::npos); - EXPECT_NE( - to_string_result.find("manifest_list: s3://example/another_manifest_list.avro"), - std::string::npos); - EXPECT_NE(to_string_result.find(SnapshotSummaryFields::kDeletedDataFiles), - std::string::npos); -} - -TEST(SnapshotTest, EqualityComparison) { - Summary summary1(Summary::Operation::kAppend, - std::unordered_map{ - {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}); - Summary summary2(Summary::Operation::kAppend, - std::unordered_map{ - {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}); - Summary summary3(Summary::Operation::kDelete, - std::unordered_map{ - {std::string(SnapshotSummaryFields::kDeletedDataFiles), "20"}}); - +TEST_F(SnapshotTest, EqualityComparison) { + // Test the == and != operators Snapshot snapshot1(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", summary1, {}); + Snapshot snapshot2(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", summary2, {}); + Snapshot snapshot3(67890, {}, 1, 1615569200000, "s3://example/manifest_list.avro", summary3, {}); From 80893f86d3324059145bc6e9bf8ac915f7ac72e6 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 10 Apr 2025 22:50:40 +0800 Subject: [PATCH 4/6] resolve some review comments --- src/iceberg/snapshot.cc | 82 +++++++++++++++++++++++++------- src/iceberg/snapshot.h | 101 +++++++++++++++++++++------------------- test/snapshot_test.cc | 32 ++++++------- 3 files changed, 133 insertions(+), 82 deletions(-) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index 916271bb2..97babd03e 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -19,34 +19,82 @@ #include "iceberg/snapshot.h" -#include +namespace iceberg { -#include "iceberg/util/formatter.h" +const std::string SnapshotSummaryFields::kOperation = "operation"; +const std::string SnapshotSummaryFields::kAddedDataFiles = "added-data-files"; +const std::string SnapshotSummaryFields::kDeletedDataFiles = "deleted-data-files"; +const std::string SnapshotSummaryFields::kTotalDataFiles = "total-data-files"; +const std::string SnapshotSummaryFields::kAddedDeleteFiles = "added-delete-files"; +const std::string SnapshotSummaryFields::kAddedEqDeleteFiles = + "added-equality-delete-files"; +const std::string SnapshotSummaryFields::kRemovedEqDeleteFiles = + "removed-equality-delete-files"; +const std::string SnapshotSummaryFields::kAddedPosDeleteFiles = + "added-position-delete-files"; +const std::string SnapshotSummaryFields::kRemovedPosDeleteFiles = + "removed-position-delete-files"; +const std::string SnapshotSummaryFields::kAddedDVS = "added-dvs"; +const std::string SnapshotSummaryFields::kRemovedDVS = "removed-dvs"; +const std::string SnapshotSummaryFields::kRemovedDeleteFiles = "removed-delete-files"; +const std::string SnapshotSummaryFields::kTotalDeleteFiles = "total-delete-files"; +const std::string SnapshotSummaryFields::kAddedRecords = "added-records"; +const std::string SnapshotSummaryFields::kDeletedRecords = "deleted-records"; +const std::string SnapshotSummaryFields::kTotalRecords = "total-records"; +const std::string SnapshotSummaryFields::kAddedFileSize = "added-files-size"; +const std::string SnapshotSummaryFields::kRemovedFileSize = "removed-files-size"; +const std::string SnapshotSummaryFields::kTotalFileSize = "total-files-size"; +const std::string SnapshotSummaryFields::kAddedPosDeletes = "added-position-deletes"; +const std::string SnapshotSummaryFields::kRemovedPosDeletes = "removed-position-deletes"; +const std::string SnapshotSummaryFields::kTotalPosDeletes = "total-position-deletes"; +const std::string SnapshotSummaryFields::kAddedEqDeletes = "added-equality-deletes"; +const std::string SnapshotSummaryFields::kRemovedEqDeletes = "removed-equality-deletes"; +const std::string SnapshotSummaryFields::kTotalEqDeletes = "total-equality-deletes"; +const std::string SnapshotSummaryFields::kDeletedDuplicatedFiles = + "deleted-duplicate-files"; +const std::string SnapshotSummaryFields::kChangedPartitionCountProp = + "changed-partition-count"; -namespace iceberg { +const std::string SnapshotSummaryFields::kWAPID = "wap.id"; +const std::string SnapshotSummaryFields::kPublishedWAPID = "published-wap-id"; +const std::string SnapshotSummaryFields::kSourceSnapshotID = "source-snapshot-id"; +const std::string SnapshotSummaryFields::kEngineName = "engine-name"; +const std::string SnapshotSummaryFields::kEngineVersion = "engine-version"; -std::optional Snapshot::operation() const { - auto it = summary.find(std::string(SnapshotSummaryFields::kOperation)); +std::optional Snapshot::operation() const { + auto it = summary.find(SnapshotSummaryFields::kOperation); if (it != summary.end()) { return it->second; } return std::nullopt; } -std::optional> -Snapshot::ManifestList() const { - if (std::holds_alternative(manifest_list)) { - return std::cref(std::get(manifest_list)); - } - return std::nullopt; +std::optional> Snapshot::ManifestList() const { + return std::visit( + [&](const auto& manifest_list) + -> std::optional> { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return std::cref(manifest_list); + } else { + return std::nullopt; + } + }, + manifest_list); } -std::optional> Snapshot::Manifests() - const { - if (std::holds_alternative(manifest_list)) { - return std::cref(std::get(manifest_list)); - } - return std::nullopt; +std::optional> Snapshot::Manifests() const { + return std::visit( + [&](const auto& manifest_list) + -> std::optional> { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return std::cref(manifest_list); + } else { + return std::nullopt; + } + }, + manifest_list); } bool Snapshot::Equals(const Snapshot& other) const { diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 1041644ff..1d913d0b8 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -21,12 +21,12 @@ #include #include +#include #include #include #include #include "iceberg/iceberg_export.h" -#include "iceberg/util/formattable.h" namespace iceberg { @@ -63,82 +63,79 @@ struct ICEBERG_EXPORT SnapshotRef { /// \brief Optional Snapshot Summary Fields struct SnapshotSummaryFields { /// \brief The operation field key - constexpr static std::string_view kOperation = "operation"; + static const std::string kOperation; /// Metrics, see https://iceberg.apache.org/spec/#metrics /// \brief Number of data files added in the snapshot - constexpr static std::string_view kAddedDataFiles = "added-data-files"; + static const std::string kAddedDataFiles; /// \brief Number of data files deleted in the snapshot - constexpr static std::string_view kDeletedDataFiles = "deleted-data-files"; + static const std::string kDeletedDataFiles; /// \brief Total number of live data files in the snapshot - constexpr static std::string_view kTotalDataFiles = "total-data-files"; + static const std::string kTotalDataFiles; /// \brief Number of positional/equality delete files and deletion vectors added in the /// snapshot - constexpr static std::string_view kAddedDeleteFiles = "added-delete-files"; + static const std::string kAddedDeleteFiles; /// \brief Number of equality delete files added in the snapshot - constexpr static std::string_view kAddedEqDeleteFiles = "added-equality-delete-files"; + static const std::string kAddedEqDeleteFiles; /// \brief Number of equality delete files removed in the snapshot - constexpr static std::string_view kRemovedEqDeleteFiles = - "removed-equality-delete-files"; + static const std::string kRemovedEqDeleteFiles; /// \brief Number of position delete files added in the snapshot - constexpr static std::string_view kAddedPosDeleteFiles = "added-position-delete-files"; + static const std::string kAddedPosDeleteFiles; /// \brief Number of position delete files removed in the snapshot - constexpr static std::string_view kRemovedPosDeleteFiles = - "removed-position-delete-files"; + static const std::string kRemovedPosDeleteFiles; /// \brief Number of deletion vectors added in the snapshot - constexpr static std::string_view kAddedDVS = "added-dvs"; + static const std::string kAddedDVS; /// \brief Number of deletion vectors removed in the snapshot - constexpr static std::string_view kRemovedDVS = "removed-dvs"; + static const std::string kRemovedDVS; /// \brief Number of positional/equality delete files and deletion vectors removed in /// the snapshot - constexpr static std::string_view kRemovedDeleteFiles = "removed-delete-files"; + static const std::string kRemovedDeleteFiles; /// \brief Total number of live positional/equality delete files and deletion vectors in /// the snapshot - constexpr static std::string_view kTotalDeleteFiles = "total-delete-files"; + static const std::string kTotalDeleteFiles; /// \brief Number of records added in the snapshot - constexpr static std::string_view kAddedRecords = "added-records"; + static const std::string kAddedRecords; /// \brief Number of records deleted in the snapshot - constexpr static std::string_view kDeletedRecords = "deleted-records"; + static const std::string kDeletedRecords; /// \brief Total number of records in the snapshot - constexpr static std::string_view kTotalRecords = "total-records"; + static const std::string kTotalRecords; /// \brief The size of files added in the snapshot - constexpr static std::string_view kAddedFileSize = "added-files-size"; + static const std::string kAddedFileSize; /// \brief The size of files removed in the snapshot - constexpr static std::string_view kRemovedFileSize = "removed-files-size"; + static const std::string kRemovedFileSize; /// \brief Total size of live files in the snapshot - constexpr static std::string_view kTotalFileSize = "total-files-size"; + static const std::string kTotalFileSize; /// \brief Number of position delete records added in the snapshot - constexpr static std::string_view kAddedPosDeletes = "added-position-deletes"; + static const std::string kAddedPosDeletes; /// \brief Number of position delete records removed in the snapshot - constexpr static std::string_view kRemovedPosDeletes = "removed-position-deletes"; + static const std::string kRemovedPosDeletes; /// \brief Total number of position delete records in the snapshot - constexpr static std::string_view kTotalPosDeletes = "total-position-deletes"; + static const std::string kTotalPosDeletes; /// \brief Number of equality delete records added in the snapshot - constexpr static std::string_view kAddedEqDeletes = "added-equality-deletes"; + static const std::string kAddedEqDeletes; /// \brief Number of equality delete records removed in the snapshot - constexpr static std::string_view kRemovedEqDeletes = "removed-equality-deletes"; + static const std::string kRemovedEqDeletes; /// \brief Total number of equality delete records in the snapshot - constexpr static std::string_view kTotalEqDeletes = "total-equality-deletes"; + static const std::string kTotalEqDeletes; /// \brief Number of duplicate files deleted (duplicates are files recorded more than /// once in the manifest) - constexpr static std::string_view kDeletedDuplicatedFiles = "deleted-duplicate-files"; + static const std::string kDeletedDuplicatedFiles; /// \brief Number of partitions with files added or removed in the snapshot - constexpr static std::string_view kChangedPartitionCountProp = - "changed-partition-count"; + static const std::string kChangedPartitionCountProp; /// Other Fields, see https://iceberg.apache.org/spec/#other-fields /// \brief The Write-Audit-Publish id of a staged snapshot - constexpr static std::string_view kWAPID = "wap.id"; + static const std::string kWAPID; /// \brief The Write-Audit-Publish id of a snapshot already been published - constexpr static std::string_view kPublishedWAPID = "published-wap-id"; + static const std::string kPublishedWAPID; /// \brief The original id of a cherry-picked snapshot - constexpr static std::string_view kSourceSnapshotID = "source-snapshot-id"; + static const std::string kSourceSnapshotID; /// \brief Name of the engine that created the snapshot - constexpr static std::string_view kEngineName = "engine-name"; + static const std::string kEngineName; /// \brief Version of the engine that created the snapshot - constexpr static std::string_view kEngineVersion = "engine-version"; + static const std::string kEngineVersion; }; /// \brief Data operation that produce snapshots. @@ -148,16 +145,27 @@ struct SnapshotSummaryFields { /// does not need to clean up deleted files for appends, which have no deleted files. struct ICEBERG_EXPORT DataOperation { /// \brief Only data files were added and no files were removed. - static constexpr std::string_view kAppend = "append"; + static constexpr std::string kAppend = "append"; /// \brief Data and delete files were added and removed without changing table data; /// i.e. compaction, change the data file format, or relocating data files. - static constexpr std::string_view kReplace = "replace"; + static constexpr std::string kReplace = "replace"; /// \brief Data and delete files were added and removed in a logical overwrite /// operation. - static constexpr std::string_view kOverwrite = "overwrite"; + static constexpr std::string kOverwrite = "overwrite"; /// \brief Data files were removed and their contents logically deleted and/or delete /// files were added to delete rows. - static constexpr std::string_view kDelete = "delete"; + static constexpr std::string kDelete = "delete"; +}; + +/// \brief The location of a manifest list for this snapshot that tracks manifest files +/// with additional metadata +struct ICEBERG_EXPORT ManifestList { + std::string manifest_list_path; +}; + +/// \brief A list of manifest file locations. +struct ICEBERG_EXPORT Manifests { + std::vector manifest_paths; }; /// \brief A snapshot of the data in a table at a point in time. @@ -167,9 +175,6 @@ struct ICEBERG_EXPORT DataOperation { /// /// Snapshots are created by table operations. struct ICEBERG_EXPORT Snapshot { - using manifest_list_t = std::string; - using manifests_t = std::vector; - /// A unqiue long ID. int64_t snapshot_id; /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. @@ -180,8 +185,8 @@ struct ICEBERG_EXPORT Snapshot { /// inspection. int64_t timestamp_ms; /// The location of a manifest list for this snapshot that tracks manifest files with - /// additional metadata. - std::variant manifest_list; + /// additional metadata(v2) or a list of manifest file locations(v1). + std::variant manifest_list; /// A string map that summaries the snapshot changes, including operation. std::unordered_map summary; /// ID of the table's current schema when the snapshot was created. @@ -192,19 +197,19 @@ struct ICEBERG_EXPORT Snapshot { /// /// \return the operation that produced this snapshot, or nullopt if the operation is /// unknown. - std::optional operation() const; + std::optional operation() const; /// \brief Get the manifest list for this snapshot. /// /// \return the manifest list for this snapshot, or nullopt if the snapshot has no /// manifest list. - std::optional> ManifestList() const; + std::optional> ManifestList() const; /// \brief Get the manifests for this snapshot. /// /// \return the manifests for this snapshot, or nullopt if the snapshot has no /// manifests. - std::optional> Manifests() const; + std::optional> Manifests() const; /// \brief Compare two snapshots for equality. friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) { diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc index ab19a7b9b..8786181af 100644 --- a/test/snapshot_test.cc +++ b/test/snapshot_test.cc @@ -27,17 +27,14 @@ class SnapshotTest : public ::testing::Test { protected: void SetUp() override { // Initialize some common test data - summary1 = {{std::string(SnapshotSummaryFields::kOperation), - std::string(DataOperation::kAppend)}, - {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}; + summary1 = {{SnapshotSummaryFields::kOperation, DataOperation::kAppend}, + {SnapshotSummaryFields::kAddedDataFiles, "101"}}; - summary2 = {{std::string(SnapshotSummaryFields::kOperation), - std::string(DataOperation::kAppend)}, - {std::string(SnapshotSummaryFields::kAddedDataFiles), "101"}}; + summary2 = {{SnapshotSummaryFields::kOperation, DataOperation::kAppend}, + {SnapshotSummaryFields::kAddedDataFiles, "101"}}; - summary3 = {{std::string(SnapshotSummaryFields::kOperation), - std::string(DataOperation::kDelete)}, - {std::string(SnapshotSummaryFields::kDeletedDataFiles), "20"}}; + summary3 = {{SnapshotSummaryFields::kOperation, DataOperation::kDelete}, + {SnapshotSummaryFields::kDeletedDataFiles, "20"}}; } std::unordered_map summary1; @@ -51,7 +48,7 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { .parent_snapshot_id = 54321, .sequence_number = 1, .timestamp_ms = 1615569200000, - .manifest_list = "s3://example/manifest_list.avro", + .manifest_list = ManifestList("s3://example/manifest_list.avro"), .summary = summary1, .schema_id = 10}; @@ -60,7 +57,8 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { EXPECT_EQ(*snapshot.parent_snapshot_id, 54321); EXPECT_EQ(snapshot.sequence_number, 1); EXPECT_EQ(snapshot.timestamp_ms, 1615569200000); - EXPECT_EQ(snapshot.ManifestList()->get(), "s3://example/manifest_list.avro"); + EXPECT_EQ(snapshot.ManifestList()->get().manifest_list_path, + "s3://example/manifest_list.avro"); EXPECT_EQ(snapshot.operation().value(), DataOperation::kAppend); EXPECT_EQ(snapshot.summary.at(std::string(SnapshotSummaryFields::kAddedDataFiles)), "101"); @@ -72,14 +70,14 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { TEST_F(SnapshotTest, EqualityComparison) { // Test the == and != operators - Snapshot snapshot1(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", - summary1, {}); + Snapshot snapshot1(12345, {}, 1, 1615569200000, + ManifestList("s3://example/manifest_list.avro"), summary1, {}); - Snapshot snapshot2(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", - summary2, {}); + Snapshot snapshot2(12345, {}, 1, 1615569200000, + ManifestList("s3://example/manifest_list.avro"), summary2, {}); - Snapshot snapshot3(67890, {}, 1, 1615569200000, "s3://example/manifest_list.avro", - summary3, {}); + Snapshot snapshot3(67890, {}, 1, 1615569200000, + ManifestList("s3://example/manifest_list.avro"), summary3, {}); EXPECT_EQ(snapshot1, snapshot2); EXPECT_NE(snapshot1, snapshot3); From f2afa5be070cd19ead13cd3e3c7cfe99a2b356a1 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 11 Apr 2025 08:26:36 +0800 Subject: [PATCH 5/6] get rid of manifests per Fokko's suggestion --- src/iceberg/snapshot.cc | 32 ++------------------------------ src/iceberg/snapshot.h | 31 ++++--------------------------- test/snapshot_test.cc | 17 ++++++++--------- 3 files changed, 14 insertions(+), 66 deletions(-) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index 97babd03e..5619a949f 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -34,8 +34,8 @@ const std::string SnapshotSummaryFields::kAddedPosDeleteFiles = "added-position-delete-files"; const std::string SnapshotSummaryFields::kRemovedPosDeleteFiles = "removed-position-delete-files"; -const std::string SnapshotSummaryFields::kAddedDVS = "added-dvs"; -const std::string SnapshotSummaryFields::kRemovedDVS = "removed-dvs"; +const std::string SnapshotSummaryFields::kAddedDVs = "added-dvs"; +const std::string SnapshotSummaryFields::kRemovedDVs = "removed-dvs"; const std::string SnapshotSummaryFields::kRemovedDeleteFiles = "removed-delete-files"; const std::string SnapshotSummaryFields::kTotalDeleteFiles = "total-delete-files"; const std::string SnapshotSummaryFields::kAddedRecords = "added-records"; @@ -69,34 +69,6 @@ std::optional Snapshot::operation() const { return std::nullopt; } -std::optional> Snapshot::ManifestList() const { - return std::visit( - [&](const auto& manifest_list) - -> std::optional> { - using T = std::decay_t; - if constexpr (std::is_same_v) { - return std::cref(manifest_list); - } else { - return std::nullopt; - } - }, - manifest_list); -} - -std::optional> Snapshot::Manifests() const { - return std::visit( - [&](const auto& manifest_list) - -> std::optional> { - using T = std::decay_t; - if constexpr (std::is_same_v) { - return std::cref(manifest_list); - } else { - return std::nullopt; - } - }, - manifest_list); -} - bool Snapshot::Equals(const Snapshot& other) const { if (this == &other) { return true; diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 1d913d0b8..925ebecc7 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -85,9 +85,9 @@ struct SnapshotSummaryFields { /// \brief Number of position delete files removed in the snapshot static const std::string kRemovedPosDeleteFiles; /// \brief Number of deletion vectors added in the snapshot - static const std::string kAddedDVS; + static const std::string kAddedDVs; /// \brief Number of deletion vectors removed in the snapshot - static const std::string kRemovedDVS; + static const std::string kRemovedDVs; /// \brief Number of positional/equality delete files and deletion vectors removed in /// the snapshot static const std::string kRemovedDeleteFiles; @@ -157,17 +157,6 @@ struct ICEBERG_EXPORT DataOperation { static constexpr std::string kDelete = "delete"; }; -/// \brief The location of a manifest list for this snapshot that tracks manifest files -/// with additional metadata -struct ICEBERG_EXPORT ManifestList { - std::string manifest_list_path; -}; - -/// \brief A list of manifest file locations. -struct ICEBERG_EXPORT Manifests { - std::vector manifest_paths; -}; - /// \brief A snapshot of the data in a table at a point in time. /// /// A snapshot consist of one or more file manifests, and the complete table contents is @@ -185,8 +174,8 @@ struct ICEBERG_EXPORT Snapshot { /// inspection. int64_t timestamp_ms; /// The location of a manifest list for this snapshot that tracks manifest files with - /// additional metadata(v2) or a list of manifest file locations(v1). - std::variant manifest_list; + /// additional metadata. + std::string manifest_list; /// A string map that summaries the snapshot changes, including operation. std::unordered_map summary; /// ID of the table's current schema when the snapshot was created. @@ -199,18 +188,6 @@ struct ICEBERG_EXPORT Snapshot { /// unknown. std::optional operation() const; - /// \brief Get the manifest list for this snapshot. - /// - /// \return the manifest list for this snapshot, or nullopt if the snapshot has no - /// manifest list. - std::optional> ManifestList() const; - - /// \brief Get the manifests for this snapshot. - /// - /// \return the manifests for this snapshot, or nullopt if the snapshot has no - /// manifests. - std::optional> Manifests() const; - /// \brief Compare two snapshots for equality. friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) { return lhs.Equals(rhs); diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc index 8786181af..8725a692e 100644 --- a/test/snapshot_test.cc +++ b/test/snapshot_test.cc @@ -48,7 +48,7 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { .parent_snapshot_id = 54321, .sequence_number = 1, .timestamp_ms = 1615569200000, - .manifest_list = ManifestList("s3://example/manifest_list.avro"), + .manifest_list = "s3://example/manifest_list.avro", .summary = summary1, .schema_id = 10}; @@ -57,8 +57,7 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { EXPECT_EQ(*snapshot.parent_snapshot_id, 54321); EXPECT_EQ(snapshot.sequence_number, 1); EXPECT_EQ(snapshot.timestamp_ms, 1615569200000); - EXPECT_EQ(snapshot.ManifestList()->get().manifest_list_path, - "s3://example/manifest_list.avro"); + EXPECT_EQ(snapshot.manifest_list, "s3://example/manifest_list.avro"); EXPECT_EQ(snapshot.operation().value(), DataOperation::kAppend); EXPECT_EQ(snapshot.summary.at(std::string(SnapshotSummaryFields::kAddedDataFiles)), "101"); @@ -70,14 +69,14 @@ TEST_F(SnapshotTest, ConstructionAndFieldAccess) { TEST_F(SnapshotTest, EqualityComparison) { // Test the == and != operators - Snapshot snapshot1(12345, {}, 1, 1615569200000, - ManifestList("s3://example/manifest_list.avro"), summary1, {}); + Snapshot snapshot1(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary1, {}); - Snapshot snapshot2(12345, {}, 1, 1615569200000, - ManifestList("s3://example/manifest_list.avro"), summary2, {}); + Snapshot snapshot2(12345, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary2, {}); - Snapshot snapshot3(67890, {}, 1, 1615569200000, - ManifestList("s3://example/manifest_list.avro"), summary3, {}); + Snapshot snapshot3(67890, {}, 1, 1615569200000, "s3://example/manifest_list.avro", + summary3, {}); EXPECT_EQ(snapshot1, snapshot2); EXPECT_NE(snapshot1, snapshot3); From 68a7a389287007d02e69a085fa86d241a4968096 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Fri, 11 Apr 2025 18:28:55 +0800 Subject: [PATCH 6/6] use inline static const string --- src/iceberg/snapshot.cc | 51 +++++------------- src/iceberg/snapshot.h | 114 ++++++++++++++++++++++------------------ test/snapshot_test.cc | 38 +++++++++++++- 3 files changed, 112 insertions(+), 91 deletions(-) diff --git a/src/iceberg/snapshot.cc b/src/iceberg/snapshot.cc index 5619a949f..a17d9e8b2 100644 --- a/src/iceberg/snapshot.cc +++ b/src/iceberg/snapshot.cc @@ -21,45 +21,18 @@ namespace iceberg { -const std::string SnapshotSummaryFields::kOperation = "operation"; -const std::string SnapshotSummaryFields::kAddedDataFiles = "added-data-files"; -const std::string SnapshotSummaryFields::kDeletedDataFiles = "deleted-data-files"; -const std::string SnapshotSummaryFields::kTotalDataFiles = "total-data-files"; -const std::string SnapshotSummaryFields::kAddedDeleteFiles = "added-delete-files"; -const std::string SnapshotSummaryFields::kAddedEqDeleteFiles = - "added-equality-delete-files"; -const std::string SnapshotSummaryFields::kRemovedEqDeleteFiles = - "removed-equality-delete-files"; -const std::string SnapshotSummaryFields::kAddedPosDeleteFiles = - "added-position-delete-files"; -const std::string SnapshotSummaryFields::kRemovedPosDeleteFiles = - "removed-position-delete-files"; -const std::string SnapshotSummaryFields::kAddedDVs = "added-dvs"; -const std::string SnapshotSummaryFields::kRemovedDVs = "removed-dvs"; -const std::string SnapshotSummaryFields::kRemovedDeleteFiles = "removed-delete-files"; -const std::string SnapshotSummaryFields::kTotalDeleteFiles = "total-delete-files"; -const std::string SnapshotSummaryFields::kAddedRecords = "added-records"; -const std::string SnapshotSummaryFields::kDeletedRecords = "deleted-records"; -const std::string SnapshotSummaryFields::kTotalRecords = "total-records"; -const std::string SnapshotSummaryFields::kAddedFileSize = "added-files-size"; -const std::string SnapshotSummaryFields::kRemovedFileSize = "removed-files-size"; -const std::string SnapshotSummaryFields::kTotalFileSize = "total-files-size"; -const std::string SnapshotSummaryFields::kAddedPosDeletes = "added-position-deletes"; -const std::string SnapshotSummaryFields::kRemovedPosDeletes = "removed-position-deletes"; -const std::string SnapshotSummaryFields::kTotalPosDeletes = "total-position-deletes"; -const std::string SnapshotSummaryFields::kAddedEqDeletes = "added-equality-deletes"; -const std::string SnapshotSummaryFields::kRemovedEqDeletes = "removed-equality-deletes"; -const std::string SnapshotSummaryFields::kTotalEqDeletes = "total-equality-deletes"; -const std::string SnapshotSummaryFields::kDeletedDuplicatedFiles = - "deleted-duplicate-files"; -const std::string SnapshotSummaryFields::kChangedPartitionCountProp = - "changed-partition-count"; - -const std::string SnapshotSummaryFields::kWAPID = "wap.id"; -const std::string SnapshotSummaryFields::kPublishedWAPID = "published-wap-id"; -const std::string SnapshotSummaryFields::kSourceSnapshotID = "source-snapshot-id"; -const std::string SnapshotSummaryFields::kEngineName = "engine-name"; -const std::string SnapshotSummaryFields::kEngineVersion = "engine-version"; +SnapshotRefType SnapshotRef::type() const noexcept { + return std::visit( + [&](const auto& retention) -> SnapshotRefType { + using T = std::decay_t; + if constexpr (std::is_same_v) { + return SnapshotRefType::kBranch; + } else { + return SnapshotRefType::kTag; + } + }, + retention); +} std::optional Snapshot::operation() const { auto it = summary.find(SnapshotSummaryFields::kOperation); diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index 925ebecc7..29577e9b1 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -24,7 +24,6 @@ #include #include #include -#include #include "iceberg/iceberg_export.h" @@ -42,100 +41,113 @@ enum class SnapshotRefType { /// \brief A reference to a snapshot, either a branch or a tag. struct ICEBERG_EXPORT SnapshotRef { + struct ICEBERG_EXPORT Branch { + /// A positive number for the minimum number of snapshots to keep in a branch while + /// expiring snapshots. Defaults to table property + /// history.expire.min-snapshots-to-keep. + std::optional min_snapshots_to_keep; + /// A positive number for the max age of snapshots to keep when + /// expiring, including the latest snapshot. Defaults to table property + /// history.expire.max-snapshot-age-ms. + std::optional max_snapshot_age_ms; + /// For snapshot references except the main branch, a positive number for the max age + /// of the snapshot reference to keep while expiring snapshots. Defaults to table + /// property history.expire.max-ref-age-ms. The main branch never expires. + std::optional max_ref_age_ms; + }; + + struct ICEBERG_EXPORT Tag { + /// For snapshot references except the main branch, a positive number for the max age + /// of the snapshot reference to keep while expiring snapshots. Defaults to table + /// property history.expire.max-ref-age-ms. The main branch never expires. + std::optional max_ref_age_ms; + }; + /// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. int64_t snapshot_id; - /// Type of the reference, tag or branch - SnapshotRefType type; - /// For branch type only, a positive number for the minimum number of snapshots to keep - /// in a branch while expiring snapshots. Defaults to table property - /// history.expire.min-snapshots-to-keep. - std::optional min_snapshots_to_keep; - /// For branch type only, a positive number for the max age of snapshots to keep when - /// expiring, including the latest snapshot. Defaults to table property - /// history.expire.max-snapshot-age-ms. - std::optional max_snapshot_age_ms; - /// For snapshot references except the main branch, a positive number for the max age of - /// the snapshot reference to keep while expiring snapshots. Defaults to table property - /// history.expire.max-ref-age-ms. The main branch never expires. - std::optional max_ref_age_ms; + /// Snapshot retention policy + std::variant retention; + + SnapshotRefType type() const noexcept; }; /// \brief Optional Snapshot Summary Fields struct SnapshotSummaryFields { /// \brief The operation field key - static const std::string kOperation; + inline static const std::string kOperation = "operation"; /// Metrics, see https://iceberg.apache.org/spec/#metrics /// \brief Number of data files added in the snapshot - static const std::string kAddedDataFiles; + inline static const std::string kAddedDataFiles = "added-data-files"; /// \brief Number of data files deleted in the snapshot - static const std::string kDeletedDataFiles; + inline static const std::string kDeletedDataFiles = "deleted-data-files"; /// \brief Total number of live data files in the snapshot - static const std::string kTotalDataFiles; + inline static const std::string kTotalDataFiles = "total-data-files"; /// \brief Number of positional/equality delete files and deletion vectors added in the /// snapshot - static const std::string kAddedDeleteFiles; + inline static const std::string kAddedDeleteFiles = "added-delete-files"; /// \brief Number of equality delete files added in the snapshot - static const std::string kAddedEqDeleteFiles; + inline static const std::string kAddedEqDeleteFiles = "added-equality-delete-files"; /// \brief Number of equality delete files removed in the snapshot - static const std::string kRemovedEqDeleteFiles; + inline static const std::string kRemovedEqDeleteFiles = "removed-equality-delete-files"; /// \brief Number of position delete files added in the snapshot - static const std::string kAddedPosDeleteFiles; + inline static const std::string kAddedPosDeleteFiles = "added-position-delete-files"; /// \brief Number of position delete files removed in the snapshot - static const std::string kRemovedPosDeleteFiles; + inline static const std::string kRemovedPosDeleteFiles = + "removed-position-delete-files"; /// \brief Number of deletion vectors added in the snapshot - static const std::string kAddedDVs; + inline static const std::string kAddedDVs = "added-dvs"; /// \brief Number of deletion vectors removed in the snapshot - static const std::string kRemovedDVs; + inline static const std::string kRemovedDVs = "removed-dvs"; /// \brief Number of positional/equality delete files and deletion vectors removed in /// the snapshot - static const std::string kRemovedDeleteFiles; + inline static const std::string kRemovedDeleteFiles = "removed-delete-files"; /// \brief Total number of live positional/equality delete files and deletion vectors in /// the snapshot - static const std::string kTotalDeleteFiles; + inline static const std::string kTotalDeleteFiles = "total-delete-files"; /// \brief Number of records added in the snapshot - static const std::string kAddedRecords; + inline static const std::string kAddedRecords = "added-records"; /// \brief Number of records deleted in the snapshot - static const std::string kDeletedRecords; + inline static const std::string kDeletedRecords = "deleted-records"; /// \brief Total number of records in the snapshot - static const std::string kTotalRecords; + inline static const std::string kTotalRecords = "total-records"; /// \brief The size of files added in the snapshot - static const std::string kAddedFileSize; + inline static const std::string kAddedFileSize = "added-files-size"; /// \brief The size of files removed in the snapshot - static const std::string kRemovedFileSize; + inline static const std::string kRemovedFileSize = "removed-files-size"; /// \brief Total size of live files in the snapshot - static const std::string kTotalFileSize; + inline static const std::string kTotalFileSize = "total-files-size"; /// \brief Number of position delete records added in the snapshot - static const std::string kAddedPosDeletes; + inline static const std::string kAddedPosDeletes = "added-position-deletes"; /// \brief Number of position delete records removed in the snapshot - static const std::string kRemovedPosDeletes; + inline static const std::string kRemovedPosDeletes = "removed-position-deletes"; /// \brief Total number of position delete records in the snapshot - static const std::string kTotalPosDeletes; + inline static const std::string kTotalPosDeletes = "total-position-deletes"; /// \brief Number of equality delete records added in the snapshot - static const std::string kAddedEqDeletes; + inline static const std::string kAddedEqDeletes = "added-equality-deletes"; /// \brief Number of equality delete records removed in the snapshot - static const std::string kRemovedEqDeletes; + inline static const std::string kRemovedEqDeletes = "removed-equality-deletes"; /// \brief Total number of equality delete records in the snapshot - static const std::string kTotalEqDeletes; + inline static const std::string kTotalEqDeletes = "total-equality-deletes"; /// \brief Number of duplicate files deleted (duplicates are files recorded more than /// once in the manifest) - static const std::string kDeletedDuplicatedFiles; + inline static const std::string kDeletedDuplicatedFiles = "deleted-duplicate-files"; /// \brief Number of partitions with files added or removed in the snapshot - static const std::string kChangedPartitionCountProp; + inline static const std::string kChangedPartitionCountProp = "changed-partition-count"; /// Other Fields, see https://iceberg.apache.org/spec/#other-fields /// \brief The Write-Audit-Publish id of a staged snapshot - static const std::string kWAPID; + inline static const std::string kWAPID = "wap.id"; /// \brief The Write-Audit-Publish id of a snapshot already been published - static const std::string kPublishedWAPID; + inline static const std::string kPublishedWAPID = "published-wap-id"; /// \brief The original id of a cherry-picked snapshot - static const std::string kSourceSnapshotID; + inline static const std::string kSourceSnapshotID = "source-snapshot-id"; /// \brief Name of the engine that created the snapshot - static const std::string kEngineName; + inline static const std::string kEngineName = "engine-name"; /// \brief Version of the engine that created the snapshot - static const std::string kEngineVersion; + inline static const std::string kEngineVersion = "engine-version"; }; /// \brief Data operation that produce snapshots. @@ -145,16 +157,16 @@ struct SnapshotSummaryFields { /// does not need to clean up deleted files for appends, which have no deleted files. struct ICEBERG_EXPORT DataOperation { /// \brief Only data files were added and no files were removed. - static constexpr std::string kAppend = "append"; + inline static const std::string kAppend = "append"; /// \brief Data and delete files were added and removed without changing table data; /// i.e. compaction, change the data file format, or relocating data files. - static constexpr std::string kReplace = "replace"; + inline static const std::string kReplace = "replace"; /// \brief Data and delete files were added and removed in a logical overwrite /// operation. - static constexpr std::string kOverwrite = "overwrite"; + inline static const std::string kOverwrite = "overwrite"; /// \brief Data files were removed and their contents logically deleted and/or delete /// files were added to delete rows. - static constexpr std::string kDelete = "delete"; + inline static const std::string kDelete = "delete"; }; /// \brief A snapshot of the data in a table at a point in time. diff --git a/test/snapshot_test.cc b/test/snapshot_test.cc index 8725a692e..995c64a86 100644 --- a/test/snapshot_test.cc +++ b/test/snapshot_test.cc @@ -16,13 +16,49 @@ * specific language governing permissions and limitations * under the License. */ - #include "iceberg/snapshot.h" #include namespace iceberg { +TEST(SnapshotRefTest, SnapshotRefBranchInitialization) { + SnapshotRef snapshot_ref; + + snapshot_ref.snapshot_id = 12345; + snapshot_ref.retention = SnapshotRef::Branch{ + .min_snapshots_to_keep = 10, .max_snapshot_age_ms = 5000, .max_ref_age_ms = 10000}; + + EXPECT_EQ(snapshot_ref.snapshot_id, 12345); + EXPECT_TRUE(std::holds_alternative(snapshot_ref.retention)); + EXPECT_EQ(snapshot_ref.type(), SnapshotRefType::kBranch); + + auto* branch = std::get_if(&snapshot_ref.retention); + ASSERT_NE(branch, nullptr); + EXPECT_TRUE(branch->min_snapshots_to_keep.has_value()); + EXPECT_EQ(branch->min_snapshots_to_keep.value(), 10); + EXPECT_TRUE(branch->max_snapshot_age_ms.has_value()); + EXPECT_EQ(branch->max_snapshot_age_ms.value(), 5000); + EXPECT_TRUE(branch->max_ref_age_ms.has_value()); + EXPECT_EQ(branch->max_ref_age_ms.value(), 10000); +} + +TEST(SnapshotRefTest, SnapshotRefTagInitialization) { + SnapshotRef snapshot_ref; + + snapshot_ref.snapshot_id = 67890; + snapshot_ref.retention = SnapshotRef::Tag{.max_ref_age_ms = 20000}; + + EXPECT_EQ(snapshot_ref.snapshot_id, 67890); + EXPECT_TRUE(std::holds_alternative(snapshot_ref.retention)); + EXPECT_EQ(snapshot_ref.type(), SnapshotRefType::kTag); + + auto* tag = std::get_if(&snapshot_ref.retention); + ASSERT_NE(tag, nullptr); + EXPECT_TRUE(tag->max_ref_age_ms.has_value()); + EXPECT_EQ(tag->max_ref_age_ms.value(), 20000); +} + class SnapshotTest : public ::testing::Test { protected: void SetUp() override {