Skip to content

Commit f8ea501

Browse files
committed
feat: add table metadata definition
1 parent d05a9b2 commit f8ea501

File tree

5 files changed

+346
-0
lines changed

5 files changed

+346
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ set(ICEBERG_SOURCES
2525
schema_internal.cc
2626
partition_field.cc
2727
partition_spec.cc
28+
statistics_file.cc
29+
table_metadata.cc
2830
transform.cc
2931
type.cc)
3032

src/iceberg/statistics_file.cc

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/statistics_file.h"
21+
22+
#include <format>
23+
24+
namespace iceberg {
25+
26+
bool BlobMetadata::Equals(const BlobMetadata& other) const {
27+
return type == other.type && source_snapshot_id == other.source_snapshot_id &&
28+
source_snapshot_sequence_number == other.source_snapshot_sequence_number &&
29+
fields == other.fields && properties == other.properties;
30+
}
31+
32+
// Renders the blob metadata as
//   BlobMetadata[type='..',sourceSnapshotId=..,sourceSnapshotSequenceNumber=..,
//                fields=[a,b,..],properties=[k:v,..]]
// Properties are emitted in the iteration order of the underlying map.
std::string BlobMetadata::ToString() const {
  std::string result = "BlobMetadata[";
  std::format_to(std::back_inserter(result),
                 "type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},", type,
                 source_snapshot_id, source_snapshot_sequence_number);

  // Comma-separated field ids.
  result += "fields=[";
  bool needs_comma = false;
  for (const auto& field_id : fields) {
    if (needs_comma) {
      result += ',';
    }
    std::format_to(std::back_inserter(result), "{}", field_id);
    needs_comma = true;
  }

  // Comma-separated key:value pairs.
  result += "],properties=[";
  needs_comma = false;
  for (const auto& [key, value] : properties) {
    if (needs_comma) {
      result += ',';
    }
    std::format_to(std::back_inserter(result), "{}:{}", key, value);
    needs_comma = true;
  }

  result += "]]";
  return result;
}
57+
58+
bool StatisticsFile::Equals(const StatisticsFile& other) const {
59+
return snapshot_id == other.snapshot_id && path == other.path &&
60+
file_size_in_bytes == other.file_size_in_bytes &&
61+
file_footer_size_in_bytes == other.file_footer_size_in_bytes &&
62+
blob_metadata == other.blob_metadata;
63+
}
64+
65+
// Renders the statistics file as
//   StatisticsFile[snapshotId=..,path=..,fileSizeInBytes=..,
//                  fileFooterSizeInBytes=..,blobMetadata=[<blob>,<blob>,..]]
// where each <blob> is the ToString() of the corresponding BlobMetadata.
std::string StatisticsFile::ToString() const {
  std::string result = "StatisticsFile[";
  std::format_to(std::back_inserter(result),
                 "snapshotId={},path={},fileSizeInBytes={},fileFooterSizeInBytes={},",
                 snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes);

  result += "blobMetadata=[";
  bool needs_comma = false;
  for (const auto& blob : blob_metadata) {
    if (needs_comma) {
      result += ',';
    }
    result += blob.ToString();
    needs_comma = true;
  }

  result += "]]";
  return result;
}
81+
82+
} // namespace iceberg

src/iceberg/statistics_file.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/statistics_file.h
23+
/// Statistics file for Iceberg tables.
24+
25+
#include <cstdint>
26+
#include <string>
27+
#include <unordered_map>
28+
#include <vector>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/util/formattable.h"
32+
33+
namespace iceberg {
34+
35+
/// \brief A metadata about a statistics or indices blob
36+
struct ICEBERG_EXPORT BlobMetadata : public util::Formattable {
37+
/// Type of the blob
38+
std::string type;
39+
/// ID of the Iceberg table's snapshot the blob was computed from
40+
int64_t source_snapshot_id;
41+
/// Sequence number of the Iceberg table's snapshot the blob was computed from
42+
int64_t source_snapshot_sequence_number;
43+
/// Ordered list of fields the blob was calculated from
44+
std::vector<int32_t> fields;
45+
/// Additional properties of the blob, specific to the blob type
46+
std::unordered_map<std::string, std::string> properties;
47+
48+
/// \brief Compare two BlobMetadatas for equality.
49+
friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) {
50+
return lhs.Equals(rhs);
51+
}
52+
53+
/// \brief Compare two BlobMetadatas for inequality.
54+
friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) {
55+
return !(lhs == rhs);
56+
}
57+
58+
std::string ToString() const override;
59+
60+
private:
61+
bool Equals(const BlobMetadata& other) const;
62+
};
63+
64+
/// \brief Represents a statistics file in the Puffin format
65+
struct ICEBERG_EXPORT StatisticsFile : public util::Formattable {
66+
/// ID of the Iceberg table's snapshot the statistics file is associated with
67+
int64_t snapshot_id;
68+
/// Fully qualified path to the file
69+
std::string path;
70+
/// The size of the file in bytes
71+
int64_t file_size_in_bytes;
72+
/// The size of the file footer in bytes
73+
int64_t file_footer_size_in_bytes;
74+
/// List of statistics contained in the file
75+
std::vector<BlobMetadata> blob_metadata;
76+
77+
/// \brief Compare two StatisticsFiles for equality.
78+
friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) {
79+
return lhs.Equals(rhs);
80+
}
81+
82+
/// \brief Compare two StatisticsFiles for inequality.
83+
friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) {
84+
return !(lhs == rhs);
85+
}
86+
87+
std::string ToString() const override;
88+
89+
private:
90+
bool Equals(const StatisticsFile& other) const;
91+
};
92+
93+
/// \brief Represents a partition statistics file
94+
struct ICEBERG_EXPORT PartitionStatisticsFile {
95+
/// Snapshot ID of the Iceberg table's snapshot the partition statistics file is
96+
/// associated with
97+
int64_t snapshot_id;
98+
/// Fully qualified path to the file
99+
std::string path;
100+
/// The size of the partition statistics file in bytes
101+
int64_t file_size_in_bytes;
102+
};
103+
104+
} // namespace iceberg

src/iceberg/table_metadata.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/table_metadata.h"
21+
22+
#include <format>
23+
#include <string>
24+
25+
#include "iceberg/statistics_file.h"
26+
27+
namespace iceberg {
28+
29+
// Renders the entry as SnapshotLogEntry[timestampMillis=..,snapshotId=..].
std::string SnapshotLogEntry::ToString() const {
  static constexpr char kLayout[] =
      "SnapshotLogEntry[timestampMillis={},snapshotId={}]";
  return std::format(kLayout, timestamp_ms, snapshot_id);
}
33+
34+
// Renders the entry as MetadataLogEntry[timestampMillis=..,file=..].
std::string MetadataLogEntry::ToString() const {
  static constexpr char kLayout[] = "MetadataLogEntry[timestampMillis={},file={}]";
  return std::format(kLayout, timestamp_ms, file);
}
37+
38+
} // namespace iceberg

src/iceberg/table_metadata.h

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/table_metadata.h
23+
/// Table metadata for Iceberg tables.
24+
25+
#include <chrono>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/formattable.h"
34+
35+
namespace iceberg {
36+
37+
/// \brief A point in time on std::chrono::system_clock with millisecond resolution
using TimePointMs =
    std::chrono::time_point<std::chrono::system_clock, std::chrono::milliseconds>;
39+
40+
/// \brief Represents a snapshot log entry
41+
struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable {
42+
/// The timestamp in milliseconds of the change
43+
TimePointMs timestamp_ms;
44+
/// ID of the snapshot
45+
int64_t snapshot_id;
46+
47+
std::string ToString() const override;
48+
};
49+
50+
/// \brief Represents a metadata log entry
51+
struct ICEBERG_EXPORT MetadataLogEntry : public util::Formattable {
52+
/// The timestamp in milliseconds of the change
53+
TimePointMs timestamp_ms;
54+
/// Metadata file location
55+
std::string file;
56+
57+
std::string ToString() const override;
58+
};
59+
60+
/// \brief Represents the metadata for an Iceberg table
61+
///
62+
/// Note that it only contains table metadata from the spec. Compared to the Java
63+
/// implementation, missing pieces including: 1) Map<Integer,
64+
/// Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> 3) Map<Long, Snapshot> 4)
65+
/// Map<String, SnapshotRef>
66+
///
67+
/// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are
68+
/// implemented.
69+
struct ICEBERG_EXPORT TableMetadata {
70+
/// An integer version number for the format
71+
int8_t format_version;
72+
/// A UUID that identifies the table
73+
std::string table_uuid;
74+
/// The table's base location
75+
std::string location;
76+
/// The table's highest assigned sequence number
77+
int64_t last_sequence_number;
78+
/// Timestamp in milliseconds from the unix epoch when the table was last updated.
79+
int64_t last_updated_ms;
80+
/// The highest assigned column ID for the table
81+
int32_t last_column_id;
82+
/// A list of schemas
83+
std::vector<std::shared_ptr<Schema>> schemas;
84+
/// ID of the table's current schema
85+
int32_t current_schema_id;
86+
/// A list of partition specs
87+
std::vector<std::shared_ptr<PartitionSpec>> partition_specs;
88+
/// ID of the current partition spec that writers should use by default
89+
int32_t default_spec_id;
90+
/// The highest assigned partition field ID across all partition specs for the table
91+
int32_t last_partition_id;
92+
/// A string to string map of table properties
93+
std::unordered_map<std::string, std::string> properties;
94+
/// ID of the current table snapshot
95+
int64_t current_snapshot_id;
96+
/// A list of valid snapshots
97+
std::vector<std::shared_ptr<Snapshot>> snapshots;
98+
/// A list of timestamp and snapshot ID pairs that encodes changes to the current
99+
/// snapshot for the table
100+
std::vector<SnapshotLogEntry> snapshot_log;
101+
/// A list of timestamp and metadata file location pairs that encodes changes to the
102+
/// previous metadata files for the table
103+
std::vector<MetadataLogEntry> metadata_log;
104+
/// A list of sort orders
105+
std::vector<std::shared_ptr<SortOrder>> sort_orders;
106+
/// Default sort order id of the table
107+
int32_t default_sort_order_id;
108+
/// A map of snapshot references
109+
std::unordered_map<std::string, std::string> refs;
110+
/// A list of table statistics
111+
std::vector<std::shared_ptr<struct StatisticsFile>> statistics;
112+
/// A list of partition statistics
113+
std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics;
114+
/// whether or not to track the creation and updates to rows in the table
115+
bool row_lineage_enabled = false;
116+
/// A `long` higher than all assigned row IDs
117+
int64_t next_row_id;
118+
};
119+
120+
} // namespace iceberg

0 commit comments

Comments
 (0)