Skip to content

Commit 6094892

Browse files
committed
feat: snapshot
Signed-off-by: Junwang Zhao <[email protected]>
1 parent d05a9b2 commit 6094892

File tree

6 files changed

+531
-2
lines changed

6 files changed

+531
-2
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ set(ICEBERG_SOURCES
2626
partition_field.cc
2727
partition_spec.cc
2828
transform.cc
29-
type.cc)
29+
type.cc
30+
snapshot.cc)
3031

3132
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)
3233
set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS)

src/iceberg/snapshot.cc

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/snapshot.h"
21+
22+
#include "iceberg/util/formatter.h"
23+
24+
namespace iceberg {
25+
26+
namespace {
27+
/// \brief Get the relative Operation name
28+
constexpr std::string_view ToString(Operation operation) {
29+
switch (operation) {
30+
case Operation::kAppend:
31+
return "append";
32+
case Operation::kOverwrite:
33+
return "overwrite";
34+
case Operation::kReplace:
35+
return "replace";
36+
case Operation::kDelete:
37+
return "delete";
38+
default:
39+
return "invalid";
40+
}
41+
}
42+
} // namespace
43+
44+
Summary::Summary(Operation op, std::unordered_map<std::string, std::string> props)
45+
: operation_(op), additional_properties_(std::move(props)) {}
46+
47+
/// \brief Return the operation stored in this summary.
Operation Summary::operation() const {
  return operation_;
}
48+
49+
const std::unordered_map<std::string, std::string>& Summary::properties() const {
50+
return additional_properties_;
51+
}
52+
53+
std::string Summary::ToString() const {
54+
std::string repr =
55+
"summary: { operation: " + std::string(iceberg::ToString(operation_));
56+
for (const auto& [key, value] : additional_properties_) {
57+
repr += ", " + key + ": " + value;
58+
}
59+
repr += "}";
60+
return repr;
61+
}
62+
63+
bool Summary::Equals(const Summary& other) const {
64+
return operation_ == other.operation_ &&
65+
additional_properties_ == other.additional_properties_;
66+
}
67+
68+
Snapshot::Snapshot(int64_t snapshot_id, std::optional<int64_t> parent_snapshot_id,
69+
int64_t sequence_number, int64_t timestamp_ms,
70+
std::string manifest_list, std::shared_ptr<Summary> summary,
71+
std::optional<int64_t> schema_id)
72+
: snapshot_id_(snapshot_id),
73+
parent_snapshot_id_(parent_snapshot_id),
74+
sequence_number_(sequence_number),
75+
timestamp_ms_(timestamp_ms),
76+
manifest_list_(std::move(manifest_list)),
77+
summary_(std::move(summary)),
78+
schema_id_(schema_id) {}
79+
80+
/// \brief Return the id of this snapshot.
int64_t Snapshot::snapshot_id() const {
  return snapshot_id_;
}
81+
82+
std::optional<int64_t> Snapshot::parent_snapshot_id() const {
83+
return parent_snapshot_id_;
84+
}
85+
86+
/// \brief Return the sequence number of this snapshot.
int64_t Snapshot::sequence_number() const {
  return sequence_number_;
}
87+
88+
/// \brief Return the stored creation timestamp of this snapshot.
int64_t Snapshot::timestamp_ms() const {
  return timestamp_ms_;
}
89+
90+
/// \brief Return a read-only reference to the manifest list location.
const std::string& Snapshot::manifest_list() const {
  return manifest_list_;
}
91+
92+
/// \brief Return the shared pointer to this snapshot's summary.
///
/// The pointer is returned exactly as stored, so it is null when the snapshot
/// was constructed with a null summary.
const std::shared_ptr<Summary>& Snapshot::summary() const {
  return summary_;
}
93+
94+
/// \brief Return the schema id, or an empty optional when none was set.
std::optional<int32_t> Snapshot::schema_id() const {
  return schema_id_;
}
95+
96+
std::string Snapshot::ToString() const {
97+
std::string repr = "snapshot: { id: " + std::to_string(snapshot_id_);
98+
if (parent_snapshot_id_.has_value()) {
99+
repr += ", parent_id: " + std::to_string(parent_snapshot_id_.value());
100+
}
101+
repr += ", sequence_number: " + std::to_string(sequence_number_);
102+
repr += ", timestamp_ms: " + std::to_string(timestamp_ms_);
103+
repr += ", manifest_list: " + manifest_list_;
104+
repr += ", summary: " + summary_->ToString();
105+
106+
if (schema_id_.has_value()) {
107+
repr += ", schema_id: " + std::to_string(schema_id_.value());
108+
}
109+
110+
repr += " }";
111+
112+
return repr;
113+
}
114+
115+
bool Snapshot::Equals(const Snapshot& other) const {
116+
return snapshot_id_ == other.snapshot_id_ &&
117+
parent_snapshot_id_ == other.parent_snapshot_id_ &&
118+
sequence_number_ == other.sequence_number_ &&
119+
timestamp_ms_ == other.timestamp_ms_ && manifest_list_ == other.manifest_list_ &&
120+
*summary_ == *other.summary_ && schema_id_ == other.schema_id_;
121+
}
122+
123+
} // namespace iceberg

src/iceberg/snapshot.h

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <unordered_map>

#include "iceberg/iceberg_export.h"
#include "iceberg/util/formattable.h"
28+
29+
namespace iceberg {
30+
31+
/// Optional Snapshot Summary Fields
/// Metrics
/// See https://iceberg.apache.org/spec/#metrics
///
/// The keys are declared `inline constexpr` so that every translation unit
/// including this header shares a single definition of each constant.

/// \brief Number of data files added in the snapshot
inline constexpr std::string_view kAddedDataFilesKey = "added-data-files";
/// \brief Number of data files deleted in the snapshot
inline constexpr std::string_view kDeletedDataFilesKey = "deleted-data-files";
/// \brief Total number of live data files in the snapshot
inline constexpr std::string_view kTotalDataFilesKey = "total-data-files";
/// \brief Number of positional/equality delete files and deletion vectors added in the
/// snapshot
inline constexpr std::string_view kAddedDeleteFilesKey = "added-delete-files";
/// \brief Number of equality delete files added in the snapshot
inline constexpr std::string_view kAddedEqDeleteFilesKey = "added-equality-delete-files";
/// \brief Number of equality delete files removed in the snapshot
inline constexpr std::string_view kRemovedEqDeleteFilesKey =
    "removed-equality-delete-files";
/// \brief Number of position delete files added in the snapshot
inline constexpr std::string_view kAddedPosDeleteFilesKey = "added-position-delete-files";
/// \brief Number of position delete files removed in the snapshot
inline constexpr std::string_view kRemovedPosDeleteFilesKey =
    "removed-position-delete-files";
/// \brief Number of deletion vectors added in the snapshot
inline constexpr std::string_view kAddedDVSKey = "added-dvs";
/// \brief Number of deletion vectors removed in the snapshot
inline constexpr std::string_view kRemovedDVSKey = "removed-dvs";
/// \brief Number of positional/equality delete files and deletion vectors removed in the
/// snapshot
inline constexpr std::string_view kRemovedDeleteFilesKey = "removed-delete-files";
/// \brief Total number of live positional/equality delete files and deletion vectors in
/// the snapshot
inline constexpr std::string_view kTotalDeleteFilesKey = "total-delete-files";
/// \brief Number of records added in the snapshot
inline constexpr std::string_view kAddedRecordsKey = "added-records";
/// \brief Number of records deleted in the snapshot
inline constexpr std::string_view kDeletedRecordsKey = "deleted-records";
/// \brief Total number of records in the snapshot
inline constexpr std::string_view kTotalRecordsKey = "total-records";
/// \brief The size of files added in the snapshot
inline constexpr std::string_view kAddedFileSizeKey = "added-files-size";
/// \brief The size of files removed in the snapshot
inline constexpr std::string_view kRemovedFileSizeKey = "removed-files-size";
/// \brief Total size of live files in the snapshot
inline constexpr std::string_view kTotalFileSizeKey = "total-files-size";
/// \brief Number of position delete records added in the snapshot
inline constexpr std::string_view kAddedPosDeletesKey = "added-position-deletes";
/// \brief Number of position delete records removed in the snapshot
inline constexpr std::string_view kRemovedPosDeletesKey = "removed-position-deletes";
/// \brief Total number of position delete records in the snapshot
inline constexpr std::string_view kTotalPosDeletesKey = "total-position-deletes";
/// \brief Number of equality delete records added in the snapshot
inline constexpr std::string_view kAddedEqDeletesKey = "added-equality-deletes";
/// \brief Number of equality delete records removed in the snapshot
inline constexpr std::string_view kRemovedEqDeletesKey = "removed-equality-deletes";
/// \brief Total number of equality delete records in the snapshot
inline constexpr std::string_view kTotalEqDeletesKey = "total-equality-deletes";
/// \brief Number of duplicate files deleted (duplicates are files recorded more than once
/// in the manifest)
inline constexpr std::string_view kDeletedDuplicatedFilesKey = "deleted-duplicate-files";
/// \brief Number of partitions with files added or removed in the snapshot
inline constexpr std::string_view kChangedPartitionCountProp = "changed-partition-count";
91+
92+
/// Other Fields
/// See https://iceberg.apache.org/spec/#other-fields
///
/// The keys are declared `inline constexpr` so that every translation unit
/// including this header shares a single definition of each constant.

/// \brief The Write-Audit-Publish id of a staged snapshot
inline constexpr std::string_view kWAPIDKey = "wap.id";
/// \brief The Write-Audit-Publish id of a snapshot that has already been published
inline constexpr std::string_view kPublishedWAPIDKey = "published-wap-id";
/// \brief The original id of a cherry-picked snapshot
inline constexpr std::string_view kSourceSnapshotIDKey = "source-snapshot-id";
/// \brief Name of the engine that created the snapshot
inline constexpr std::string_view kEngineNameKey = "engine-name";
/// \brief Version of the engine that created the snapshot
inline constexpr std::string_view kEngineVersionKey = "engine-version";
105+
106+
/// \brief The kind of change a snapshot made to the table.
///
/// Some operations, such as snapshot expiration, use this field to skip
/// processing certain snapshots.
enum class Operation {
  /// Only data files were added and no files were removed.
  kAppend,
  /// Data and delete files were added and removed without changing table data, e.g.
  /// compaction, changing the data file format, or relocating data files.
  kReplace,
  /// Data and delete files were added and removed in a logical overwrite operation.
  kOverwrite,
  /// Data files were removed and their contents logically deleted, and/or delete files
  /// were added to delete rows.
  kDelete,
};
120+
121+
/// \brief Summarises the changes in the snapshot.
122+
class ICEBERG_EXPORT Summary : public iceberg::util::Formattable {
123+
public:
124+
Summary() = default;
125+
/// \brief Construct a summary with the given operation and properties.
126+
Summary(Operation op, std::unordered_map<std::string, std::string> props);
127+
128+
/// \brief Get the operation type of the snapshot.
129+
[[nodiscard]] Operation operation() const;
130+
131+
/// \brief Get the additional properties of the snapshot.
132+
[[nodiscard]] const std::unordered_map<std::string, std::string>& properties() const;
133+
134+
std::string ToString() const override;
135+
136+
friend bool operator==(const Summary& lhs, const Summary& rhs) {
137+
return lhs.Equals(rhs);
138+
}
139+
140+
friend bool operator!=(const Summary& lhs, const Summary& rhs) { return !(lhs == rhs); }
141+
142+
private:
143+
/// \brief Compare two Summaries for equality.
144+
[[nodiscard]] bool Equals(const Summary& other) const;
145+
146+
/// The type of operation in the snapshot
147+
Operation operation_{Operation::kAppend};
148+
/// Other summary data.
149+
std::unordered_map<std::string, std::string> additional_properties_;
150+
};
151+
152+
/// \brief A snapshot of the data in a table at a point in time.
153+
///
154+
/// A snapshot consist of one or more file manifests, and the complete table contents is
155+
/// the union of all the data files in those manifests.
156+
///
157+
/// Snapshots are created by table operations.
158+
class ICEBERG_EXPORT Snapshot : public iceberg::util::Formattable {
159+
public:
160+
Snapshot(int64_t snapshot_id, std::optional<int64_t> parent_snapshot_id,
161+
int64_t sequence_number, int64_t timestamp_ms, std::string manifest_list,
162+
std::shared_ptr<Summary> summary, std::optional<int64_t> schema_id);
163+
164+
/// \brief Get the id of the snapshot.
165+
[[nodiscard]] int64_t snapshot_id() const;
166+
167+
/// \brief Get parent snapshot id.
168+
[[nodiscard]] std::optional<int64_t> parent_snapshot_id() const;
169+
170+
/// \brief Get the sequence number of the snapshot.
171+
[[nodiscard]] int64_t sequence_number() const;
172+
173+
/// \brief Get the timestamp of the snapshot.
174+
[[nodiscard]] int64_t timestamp_ms() const;
175+
176+
/// \brief Get the manifest list of the snapshot.
177+
[[nodiscard]] const std::string& manifest_list() const;
178+
179+
/// \brief Get the summary of the snapshot.
180+
[[nodiscard]] const std::shared_ptr<Summary>& summary() const;
181+
182+
/// \brief Get the schema ID of the snapshot.
183+
[[nodiscard]] std::optional<int32_t> schema_id() const;
184+
185+
std::string ToString() const override;
186+
187+
friend bool operator==(const Snapshot& lhs, const Snapshot& rhs) {
188+
return lhs.Equals(rhs);
189+
}
190+
191+
friend bool operator!=(const Snapshot& lhs, const Snapshot& rhs) {
192+
return !(lhs == rhs);
193+
}
194+
195+
private:
196+
/// \brief Compare two snapshots for equality.
197+
[[nodiscard]] bool Equals(const Snapshot& other) const;
198+
199+
/// A unqiue long ID.
200+
int64_t snapshot_id_;
201+
/// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent.
202+
std::optional<int64_t> parent_snapshot_id_;
203+
/// A monotonically increasing long that tracks the order of changes to a table.
204+
int64_t sequence_number_;
205+
/// A timestamp when the snapshot was created, used for garbage collection and table
206+
/// inspection.
207+
int64_t timestamp_ms_;
208+
/// The location of a manifest list for this snapshot that tracks manifest files with
209+
/// additional metadata.
210+
std::string manifest_list_;
211+
/// A string map that summaries the snapshot changes, including operation.
212+
std::shared_ptr<Summary> summary_;
213+
/// ID of the table's current schema when the snapshot was created.
214+
std::optional<int32_t> schema_id_;
215+
};
216+
217+
} // namespace iceberg

0 commit comments

Comments
 (0)