Skip to content

Commit b76dc8e

Browse files
committed
feat: add table metadata definition
1 parent d05a9b2 commit b76dc8e

File tree

5 files changed

+345
-0
lines changed

5 files changed

+345
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ set(ICEBERG_SOURCES
2525
schema_internal.cc
2626
partition_field.cc
2727
partition_spec.cc
28+
statistics_file.cc
29+
table_metadata.cc
2830
transform.cc
2931
type.cc)
3032

src/iceberg/statistics_file.cc

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/statistics_file.h"
21+
22+
#include <format>
23+
24+
namespace iceberg {
25+
26+
bool BlobMetadata::Equals(const BlobMetadata& other) const {
27+
return type == other.type && source_snapshot_id == other.source_snapshot_id &&
28+
source_snapshot_sequence_number == other.source_snapshot_sequence_number &&
29+
fields == other.fields && properties == other.properties;
30+
}
31+
32+
/// \brief Render this blob metadata as a single-line string.
///
/// The result has the shape
/// `BlobMetadata[type='..',sourceSnapshotId=..,sourceSnapshotSequenceNumber=..,`
/// `fields=[a,b,..],properties=[k:v,..]]`. Properties are emitted in the iteration
/// order of the `properties` container.
///
/// \return The string representation.
std::string BlobMetadata::ToString() const {
  std::string repr = "BlobMetadata[";
  std::format_to(std::back_inserter(repr),
                 "type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},", type,
                 source_snapshot_id, source_snapshot_sequence_number);

  // Comma-separated field ids; the separator is empty for the first element only.
  repr += "fields=[";
  const char* separator = "";
  for (const auto& field_id : fields) {
    std::format_to(std::back_inserter(repr), "{}{}", separator, field_id);
    separator = ",";
  }

  // Comma-separated `key:value` pairs.
  repr += "],properties=[";
  separator = "";
  for (const auto& [key, value] : properties) {
    std::format_to(std::back_inserter(repr), "{}{}:{}", separator, key, value);
    separator = ",";
  }

  repr += "]]";
  return repr;
}
57+
58+
bool StatisticsFile::Equals(const StatisticsFile& other) const {
59+
return snapshot_id == other.snapshot_id && path == other.path &&
60+
file_size_in_bytes == other.file_size_in_bytes &&
61+
file_footer_size_in_bytes == other.file_footer_size_in_bytes &&
62+
blob_metadata == other.blob_metadata;
63+
}
64+
65+
/// \brief Render this statistics file as a single-line string.
///
/// The result has the shape
/// `StatisticsFile[snapshotId=..,path=..,fileSizeInBytes=..,fileFooterSizeInBytes=..,`
/// `blobMetadata=[<blob>,<blob>,..]]`, where each `<blob>` is the ToString() of the
/// corresponding entry of `blob_metadata`.
///
/// \return The string representation.
std::string StatisticsFile::ToString() const {
  std::string repr = std::format(
      "StatisticsFile[snapshotId={},path={},"
      "fileSizeInBytes={},fileFooterSizeInBytes={},",
      snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes);

  repr += "blobMetadata=[";
  bool is_first = true;
  for (const auto& blob : blob_metadata) {
    // Every entry except the first is preceded by a comma.
    if (!is_first) {
      repr += ',';
    }
    is_first = false;
    repr += blob.ToString();
  }

  repr += "]]";
  return repr;
}
81+
82+
} // namespace iceberg

src/iceberg/statistics_file.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/statistics_file.h
23+
/// Statistics file for Iceberg tables.
24+
25+
#include <cstdint>
26+
#include <string>
27+
#include <unordered_map>
28+
#include <vector>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/util/formattable.h"
32+
33+
namespace iceberg {
34+
35+
/// \brief A metadata about a statistics or indices blob
36+
struct ICEBERG_EXPORT BlobMetadata : public util::Formattable {
37+
/// Type of the blob
38+
std::string type;
39+
/// ID of the Iceberg table's snapshot the blob was computed from
40+
int64_t source_snapshot_id;
41+
/// Sequence number of the Iceberg table's snapshot the blob was computed from
42+
int64_t source_snapshot_sequence_number;
43+
/// Ordered list of fields the blob was calculated from
44+
std::vector<int32_t> fields;
45+
/// Additional properties of the blob, specific to the blob type
46+
std::unordered_map<std::string, std::string> properties;
47+
48+
/// \brief Compare two BlobMetadatas for equality.
49+
friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) {
50+
return lhs.Equals(rhs);
51+
}
52+
53+
/// \brief Compare two BlobMetadatas for inequality.
54+
friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) {
55+
return !(lhs == rhs);
56+
}
57+
58+
std::string ToString() const override;
59+
60+
private:
61+
bool Equals(const BlobMetadata& other) const;
62+
};
63+
64+
/// \brief Represents a statistics file in the Puffin format
65+
struct ICEBERG_EXPORT StatisticsFile : public util::Formattable {
66+
/// ID of the Iceberg table's snapshot the statistics file is associated with
67+
int64_t snapshot_id;
68+
/// Fully qualified path to the file
69+
std::string path;
70+
/// The size of the file in bytes
71+
int64_t file_size_in_bytes;
72+
/// The size of the file footer in bytes
73+
int64_t file_footer_size_in_bytes;
74+
/// List of statistics contained in the file
75+
std::vector<BlobMetadata> blob_metadata;
76+
77+
/// \brief Compare two StatisticsFiles for equality.
78+
friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) {
79+
return lhs.Equals(rhs);
80+
}
81+
82+
/// \brief Compare two StatisticsFiles for inequality.
83+
friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) {
84+
return !(lhs == rhs);
85+
}
86+
87+
std::string ToString() const override;
88+
89+
private:
90+
bool Equals(const StatisticsFile& other) const;
91+
};
92+
93+
/// \brief Represents a partition statistics file
94+
struct ICEBERG_EXPORT PartitionStatisticsFile {
95+
/// Snapshot ID of the Iceberg table's snapshot the partition statistics file is
96+
/// associated with
97+
int64_t snapshot_id;
98+
/// Fully qualified path to the file
99+
std::string path;
100+
/// The size of the partition statistics file in bytes
101+
int64_t file_size_in_bytes;
102+
};
103+
104+
} // namespace iceberg

src/iceberg/table_metadata.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/table_metadata.h"
21+
22+
#include <format>
23+
#include <string>
24+
25+
#include "iceberg/statistics_file.h"
26+
27+
namespace iceberg {
28+
29+
/// \brief Render this entry as
/// `SnapshotLogEntry[timestampMillis=<timestamp>,snapshotId=<id>]`.
///
/// `timestamp_ms` is handed to std::format with the default `{}` specifier.
/// NOTE(review): the label reads "timestampMillis" while the value is a
/// std::chrono time_point; confirm the rendered form is the intended one.
///
/// \return The string representation.
std::string SnapshotLogEntry::ToString() const {
  std::string repr = std::format("SnapshotLogEntry[timestampMillis={},", timestamp_ms);
  repr += std::format("snapshotId={}]", snapshot_id);
  return repr;
}
33+
34+
/// \brief Render this entry as
/// `MetadataLogEntry[timestampMillis=<timestamp>,file=<file>]`.
///
/// `timestamp_ms` is handed to std::format with the default `{}` specifier.
/// NOTE(review): the label reads "timestampMillis" while the value is a
/// std::chrono time_point; confirm the rendered form is the intended one.
///
/// \return The string representation.
std::string MetadataLogEntry::ToString() const {
  std::string repr = "MetadataLogEntry[";
  repr += std::format("timestampMillis={},file={}", timestamp_ms, file);
  repr += ']';
  return repr;
}
37+
38+
} // namespace iceberg

src/iceberg/table_metadata.h

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/table_metadata.h
23+
/// Table metadata for Iceberg tables.
24+
25+
#include <chrono>
26+
#include <memory>
27+
#include <string>
28+
#include <unordered_map>
29+
#include <vector>
30+
31+
#include "iceberg/iceberg_export.h"
32+
#include "iceberg/type_fwd.h"
33+
#include "iceberg/util/formattable.h"
34+
35+
namespace iceberg {
36+
37+
/// \brief Represents a snapshot log entry
38+
struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable {
39+
/// The timestamp in milliseconds of the change
40+
std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds>
41+
timestamp_ms;
42+
/// ID of the snapshot
43+
int64_t snapshot_id;
44+
45+
std::string ToString() const override;
46+
};
47+
48+
/// \brief Represents a metadata log entry
49+
struct ICEBERG_EXPORT MetadataLogEntry : public util::Formattable {
50+
/// The timestamp in milliseconds of the change
51+
std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds>
52+
timestamp_ms;
53+
/// Metadata file location
54+
std::string file;
55+
56+
std::string ToString() const override;
57+
};
58+
59+
/// \brief Represents the metadata for an Iceberg table
60+
///
61+
/// Note that it only contains table metadata from the spec. Compared to the Java
62+
/// implementation, missing pieces including: 1) Map<Integer,
63+
/// Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> 3) Map<Long, Snapshot> 4)
64+
/// Map<String, SnapshotRef>
65+
///
66+
/// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are
67+
/// implemented.
68+
struct ICEBERG_EXPORT TableMetadata {
69+
/// An integer version number for the format
70+
int8_t format_version;
71+
/// A UUID that identifies the table
72+
std::string table_uuid;
73+
/// The table's base location
74+
std::string location;
75+
/// The table's highest assigned sequence number
76+
int64_t last_sequence_number;
77+
/// Timestamp in milliseconds from the unix epoch when the table was last updated.
78+
int64_t last_updated_ms;
79+
/// The highest assigned column ID for the table
80+
int32_t last_column_id;
81+
/// A list of schemas
82+
std::vector<std::shared_ptr<Schema>> schemas;
83+
/// ID of the table's current schema
84+
int32_t current_schema_id;
85+
/// A list of partition specs
86+
std::vector<std::shared_ptr<PartitionSpec>> partition_specs;
87+
/// ID of the current partition spec that writers should use by default
88+
int32_t default_spec_id;
89+
/// The highest assigned partition field ID across all partition specs for the table
90+
int32_t last_partition_id;
91+
/// A string to string map of table properties
92+
std::unordered_map<std::string, std::string> properties;
93+
/// ID of the current table snapshot
94+
int64_t current_snapshot_id;
95+
/// A list of valid snapshots
96+
std::vector<std::shared_ptr<Snapshot>> snapshots;
97+
/// A list of timestamp and snapshot ID pairs that encodes changes to the current
98+
/// snapshot for the table
99+
std::vector<SnapshotLogEntry> snapshot_log;
100+
/// A list of timestamp and metadata file location pairs that encodes changes to the
101+
/// previous metadata files for the table
102+
std::vector<MetadataLogEntry> metadata_log;
103+
/// A list of sort orders
104+
std::vector<std::shared_ptr<SortOrder>> sort_orders;
105+
/// Default sort order id of the table
106+
int32_t default_sort_order_id;
107+
/// A map of snapshot references
108+
std::unordered_map<std::string, std::string> refs;
109+
/// A list of table statistics
110+
std::vector<std::shared_ptr<struct StatisticsFile>> statistics;
111+
/// A list of partition statistics
112+
std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics;
113+
/// whether or not to track the creation and updates to rows in the table
114+
bool row_lineage_enabled = false;
115+
/// A `long` higher than all assigned row IDs
116+
int64_t next_row_id;
117+
};
118+
119+
} // namespace iceberg

0 commit comments

Comments
 (0)