Skip to content

Commit dd5fef7

Browse files
authored
feat: snapshot (#60)
Signed-off-by: Junwang Zhao <[email protected]>
1 parent 185515a commit dd5fef7

File tree

5 files changed

+398
-2
lines changed

5 files changed

+398
-2
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ set(ICEBERG_SOURCES
3232
table_metadata.cc
3333
transform.cc
3434
transform_function.cc
35-
type.cc)
35+
type.cc
36+
snapshot.cc)
3637

3738
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
3839
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)

src/iceberg/snapshot.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/snapshot.h"
21+
22+
namespace iceberg {
23+
24+
SnapshotRefType SnapshotRef::type() const noexcept {
25+
return std::visit(
26+
[&](const auto& retention) -> SnapshotRefType {
27+
using T = std::decay_t<decltype(retention)>;
28+
if constexpr (std::is_same_v<T, Branch>) {
29+
return SnapshotRefType::kBranch;
30+
} else {
31+
return SnapshotRefType::kTag;
32+
}
33+
},
34+
retention);
35+
}
36+
37+
std::optional<std::string_view> Snapshot::operation() const {
38+
auto it = summary.find(SnapshotSummaryFields::kOperation);
39+
if (it != summary.end()) {
40+
return it->second;
41+
}
42+
return std::nullopt;
43+
}
44+
45+
bool Snapshot::Equals(const Snapshot& other) const {
46+
if (this == &other) {
47+
return true;
48+
}
49+
return snapshot_id == other.snapshot_id &&
50+
parent_snapshot_id == other.parent_snapshot_id &&
51+
sequence_number == other.sequence_number && timestamp_ms == other.timestamp_ms &&
52+
schema_id == other.schema_id;
53+
}
54+
55+
} // namespace iceberg

src/iceberg/snapshot.h

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include <optional>
23+
#include <string>
24+
#include <string_view>
25+
#include <unordered_map>
26+
#include <variant>
27+
28+
#include "iceberg/iceberg_export.h"
29+
30+
namespace iceberg {
31+
32+
/// \brief The type of snapshot reference
33+
enum class SnapshotRefType {
34+
/// Branches are mutable named references that can be updated by committing a new
35+
/// snapshot as the branch’s referenced snapshot using the Commit Conflict Resolution
36+
/// and Retry procedures.
37+
kBranch,
38+
/// Tags are labels for individual snapshots
39+
kTag,
40+
};
41+
42+
/// \brief A reference to a snapshot, either a branch or a tag.
43+
struct ICEBERG_EXPORT SnapshotRef {
44+
struct ICEBERG_EXPORT Branch {
45+
/// A positive number for the minimum number of snapshots to keep in a branch while
46+
/// expiring snapshots. Defaults to table property
47+
/// history.expire.min-snapshots-to-keep.
48+
std::optional<int32_t> min_snapshots_to_keep;
49+
/// A positive number for the max age of snapshots to keep when
50+
/// expiring, including the latest snapshot. Defaults to table property
51+
/// history.expire.max-snapshot-age-ms.
52+
std::optional<int64_t> max_snapshot_age_ms;
53+
/// For snapshot references except the main branch, a positive number for the max age
54+
/// of the snapshot reference to keep while expiring snapshots. Defaults to table
55+
/// property history.expire.max-ref-age-ms. The main branch never expires.
56+
std::optional<int64_t> max_ref_age_ms;
57+
};
58+
59+
struct ICEBERG_EXPORT Tag {
60+
/// For snapshot references except the main branch, a positive number for the max age
61+
/// of the snapshot reference to keep while expiring snapshots. Defaults to table
62+
/// property history.expire.max-ref-age-ms. The main branch never expires.
63+
std::optional<int64_t> max_ref_age_ms;
64+
};
65+
66+
/// A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch.
67+
int64_t snapshot_id;
68+
/// Snapshot retention policy
69+
std::variant<Branch, Tag> retention;
70+
71+
SnapshotRefType type() const noexcept;
72+
};
73+
74+
/// \brief Optional Snapshot Summary Fields
75+
struct SnapshotSummaryFields {
76+
/// \brief The operation field key
77+
inline static const std::string kOperation = "operation";
78+
79+
/// Metrics, see https://iceberg.apache.org/spec/#metrics
80+
81+
/// \brief Number of data files added in the snapshot
82+
inline static const std::string kAddedDataFiles = "added-data-files";
83+
/// \brief Number of data files deleted in the snapshot
84+
inline static const std::string kDeletedDataFiles = "deleted-data-files";
85+
/// \brief Total number of live data files in the snapshot
86+
inline static const std::string kTotalDataFiles = "total-data-files";
87+
/// \brief Number of positional/equality delete files and deletion vectors added in the
88+
/// snapshot
89+
inline static const std::string kAddedDeleteFiles = "added-delete-files";
90+
/// \brief Number of equality delete files added in the snapshot
91+
inline static const std::string kAddedEqDeleteFiles = "added-equality-delete-files";
92+
/// \brief Number of equality delete files removed in the snapshot
93+
inline static const std::string kRemovedEqDeleteFiles = "removed-equality-delete-files";
94+
/// \brief Number of position delete files added in the snapshot
95+
inline static const std::string kAddedPosDeleteFiles = "added-position-delete-files";
96+
/// \brief Number of position delete files removed in the snapshot
97+
inline static const std::string kRemovedPosDeleteFiles =
98+
"removed-position-delete-files";
99+
/// \brief Number of deletion vectors added in the snapshot
100+
inline static const std::string kAddedDVs = "added-dvs";
101+
/// \brief Number of deletion vectors removed in the snapshot
102+
inline static const std::string kRemovedDVs = "removed-dvs";
103+
/// \brief Number of positional/equality delete files and deletion vectors removed in
104+
/// the snapshot
105+
inline static const std::string kRemovedDeleteFiles = "removed-delete-files";
106+
/// \brief Total number of live positional/equality delete files and deletion vectors in
107+
/// the snapshot
108+
inline static const std::string kTotalDeleteFiles = "total-delete-files";
109+
/// \brief Number of records added in the snapshot
110+
inline static const std::string kAddedRecords = "added-records";
111+
/// \brief Number of records deleted in the snapshot
112+
inline static const std::string kDeletedRecords = "deleted-records";
113+
/// \brief Total number of records in the snapshot
114+
inline static const std::string kTotalRecords = "total-records";
115+
/// \brief The size of files added in the snapshot
116+
inline static const std::string kAddedFileSize = "added-files-size";
117+
/// \brief The size of files removed in the snapshot
118+
inline static const std::string kRemovedFileSize = "removed-files-size";
119+
/// \brief Total size of live files in the snapshot
120+
inline static const std::string kTotalFileSize = "total-files-size";
121+
/// \brief Number of position delete records added in the snapshot
122+
inline static const std::string kAddedPosDeletes = "added-position-deletes";
123+
/// \brief Number of position delete records removed in the snapshot
124+
inline static const std::string kRemovedPosDeletes = "removed-position-deletes";
125+
/// \brief Total number of position delete records in the snapshot
126+
inline static const std::string kTotalPosDeletes = "total-position-deletes";
127+
/// \brief Number of equality delete records added in the snapshot
128+
inline static const std::string kAddedEqDeletes = "added-equality-deletes";
129+
/// \brief Number of equality delete records removed in the snapshot
130+
inline static const std::string kRemovedEqDeletes = "removed-equality-deletes";
131+
/// \brief Total number of equality delete records in the snapshot
132+
inline static const std::string kTotalEqDeletes = "total-equality-deletes";
133+
/// \brief Number of duplicate files deleted (duplicates are files recorded more than
134+
/// once in the manifest)
135+
inline static const std::string kDeletedDuplicatedFiles = "deleted-duplicate-files";
136+
/// \brief Number of partitions with files added or removed in the snapshot
137+
inline static const std::string kChangedPartitionCountProp = "changed-partition-count";
138+
139+
/// Other Fields, see https://iceberg.apache.org/spec/#other-fields
140+
141+
/// \brief The Write-Audit-Publish id of a staged snapshot
142+
inline static const std::string kWAPID = "wap.id";
143+
/// \brief The Write-Audit-Publish id of a snapshot already been published
144+
inline static const std::string kPublishedWAPID = "published-wap-id";
145+
/// \brief The original id of a cherry-picked snapshot
146+
inline static const std::string kSourceSnapshotID = "source-snapshot-id";
147+
/// \brief Name of the engine that created the snapshot
148+
inline static const std::string kEngineName = "engine-name";
149+
/// \brief Version of the engine that created the snapshot
150+
inline static const std::string kEngineVersion = "engine-version";
151+
};
152+
153+
/// \brief Data operation that produce snapshots.
154+
///
155+
/// A snapshot can return the operation that created the snapshot to help other components
156+
/// ignore snapshots that are not needed for some tasks. For example, snapshot expiration
157+
/// does not need to clean up deleted files for appends, which have no deleted files.
158+
struct ICEBERG_EXPORT DataOperation {
159+
/// \brief Only data files were added and no files were removed.
160+
inline static const std::string kAppend = "append";
161+
/// \brief Data and delete files were added and removed without changing table data;
162+
/// i.e. compaction, change the data file format, or relocating data files.
163+
inline static const std::string kReplace = "replace";
164+
/// \brief Data and delete files were added and removed in a logical overwrite
165+
/// operation.
166+
inline static const std::string kOverwrite = "overwrite";
167+
/// \brief Data files were removed and their contents logically deleted and/or delete
168+
/// files were added to delete rows.
169+
inline static const std::string kDelete = "delete";
170+
};
171+
172+
/// \brief A snapshot of the data in a table at a point in time.
173+
///
174+
/// A snapshot consist of one or more file manifests, and the complete table contents is
175+
/// the union of all the data files in those manifests.
176+
///
177+
/// Snapshots are created by table operations.
178+
struct ICEBERG_EXPORT Snapshot {
179+
/// A unqiue long ID.
180+
int64_t snapshot_id;
181+
/// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent.
182+
std::optional<int64_t> parent_snapshot_id;
183+
/// A monotonically increasing long that tracks the order of changes to a table.
184+
int64_t sequence_number;
185+
/// A timestamp when the snapshot was created, used for garbage collection and table
186+
/// inspection.
187+
int64_t timestamp_ms;
188+
/// The location of a manifest list for this snapshot that tracks manifest files with
189+
/// additional metadata.
190+
std::string manifest_list;
191+
/// A string map that summaries the snapshot changes, including operation.
192+
std::unordered_map<std::string, std::string> summary;
193+
/// ID of the table's current schema when the snapshot was created.
194+
std::optional<int32_t> schema_id;
195+
196+
/// \brief Return the name of the DataOperations data operation that produced this
197+
/// snapshot.
198+
///
199+
/// \return the operation that produced this snapshot, or nullopt if the operation is
200+
/// unknown.
201+
std::optional<std::string_view> operation() const;
202+
203+
/// \brief Compare two snapshots for equality.
204+
friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) {
205+
return lhs.Equals(rhs);
206+
}
207+
208+
/// \brief Compare two snapshots for inequality.
209+
friend bool operator!=(const Snapshot& lhs, const Snapshot& rhs) {
210+
return !(lhs == rhs);
211+
}
212+
213+
private:
214+
/// \brief Compare two snapshots for equality.
215+
bool Equals(const Snapshot& other) const;
216+
};
217+
218+
} // namespace iceberg

test/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ target_sources(schema_test
3434
partition_field_test.cc
3535
partition_spec_test.cc
3636
sort_field_test.cc
37-
sort_order_test.cc)
37+
sort_order_test.cc
38+
snapshot_test.cc)
3839
target_link_libraries(schema_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock)
3940
add_test(NAME schema_test COMMAND schema_test)
4041

0 commit comments

Comments
 (0)