Skip to content

Commit 5871171

Browse files
committed
feat: add table metadata definition
1 parent d05a9b2 commit 5871171

File tree

5 files changed

+341
-0
lines changed

5 files changed

+341
-0
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ set(ICEBERG_SOURCES
2525
schema_internal.cc
2626
partition_field.cc
2727
partition_spec.cc
28+
statistics_file.cc
29+
table_metadata.cc
2830
transform.cc
2931
type.cc)
3032

src/iceberg/statistics_file.cc

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/statistics_file.h"
21+
22+
#include <format>
23+
24+
namespace iceberg {
25+
26+
bool BlobMetadata::Equals(const BlobMetadata& other) const {
27+
return type == other.type && source_snapshot_id == other.source_snapshot_id &&
28+
source_snapshot_sequence_number == other.source_snapshot_sequence_number &&
29+
fields == other.fields && properties == other.properties;
30+
}
31+
32+
/// \brief Render this blob metadata as a single-line string.
///
/// The result has the shape
/// `BlobMetadata[type='<type>',sourceSnapshotId=<id>,sourceSnapshotSequenceNumber=<seq>,`
/// `fields=[<f1>,<f2>,...],properties=[<k1>:<v1>,<k2>:<v2>,...]]`.
std::string BlobMetadata::ToString() const {
  std::string out = "BlobMetadata[";
  std::format_to(std::back_inserter(out),
                 "type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},", type,
                 source_snapshot_id, source_snapshot_sequence_number);

  // Comma-separated field IDs; the separator is emitted before every element
  // except the first.
  out += "fields=[";
  bool first_field = true;
  for (const auto& field_id : fields) {
    if (!first_field) {
      out += ',';
    }
    std::format_to(std::back_inserter(out), "{}", field_id);
    first_field = false;
  }

  // Comma-separated key:value pairs, in the map's iteration order.
  out += "],properties=[";
  bool first_property = true;
  for (const auto& [key, value] : properties) {
    if (!first_property) {
      out += ',';
    }
    std::format_to(std::back_inserter(out), "{}:{}", key, value);
    first_property = false;
  }

  out += "]]";
  return out;
}
57+
58+
bool StatisticsFile::Equals(const StatisticsFile& other) const {
59+
return snapshot_id == other.snapshot_id && path == other.path &&
60+
file_size_in_bytes == other.file_size_in_bytes &&
61+
file_footer_size_in_bytes == other.file_footer_size_in_bytes &&
62+
blob_metadata == other.blob_metadata;
63+
}
64+
65+
/// \brief Render this statistics file as a single-line string.
///
/// The result has the shape
/// `StatisticsFile[snapshotId=<id>,path=<path>,fileSizeInBytes=<n>,`
/// `fileFooterSizeInBytes=<n>,blobMetadata=[<blob1>,<blob2>,...]]`, where each
/// blob is rendered with BlobMetadata::ToString().
std::string StatisticsFile::ToString() const {
  std::string out = "StatisticsFile[";
  std::format_to(std::back_inserter(out),
                 "snapshotId={},path={},fileSizeInBytes={},fileFooterSizeInBytes={},",
                 snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes);

  // Comma-separated blob descriptions; the separator is emitted before every
  // element except the first.
  out += "blobMetadata=[";
  bool first_blob = true;
  for (const auto& blob : blob_metadata) {
    if (!first_blob) {
      out += ',';
    }
    out += blob.ToString();
    first_blob = false;
  }

  out += "]]";
  return out;
}
81+
82+
} // namespace iceberg

src/iceberg/statistics_file.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/statistics_file.h
23+
/// Statistics file for Iceberg tables.
24+
25+
#include <cstdint>
26+
#include <map>
27+
#include <string>
28+
#include <vector>
29+
30+
#include "iceberg/iceberg_export.h"
31+
#include "iceberg/util/formattable.h"
32+
33+
namespace iceberg {
34+
35+
/// \brief A metadata about a statistics or indices blob
36+
struct ICEBERG_EXPORT BlobMetadata : public util::Formattable {
37+
/// Type of the blob
38+
std::string type;
39+
/// ID of the Iceberg table's snapshot the blob was computed from
40+
int64_t source_snapshot_id;
41+
/// Sequence number of the Iceberg table's snapshot the blob was computed from
42+
int64_t source_snapshot_sequence_number;
43+
/// Ordered list of fields the blob was calculated from
44+
std::vector<int32_t> fields;
45+
/// Additional properties of the blob, specific to the blob type
46+
std::map<std::string, std::string> properties;
47+
48+
/// \brief Compare two BlobMetadatas for equality.
49+
friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) {
50+
return lhs.Equals(rhs);
51+
}
52+
53+
/// \brief Compare two BlobMetadatas for inequality.
54+
friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) {
55+
return !(lhs == rhs);
56+
}
57+
58+
std::string ToString() const override;
59+
60+
private:
61+
bool Equals(const BlobMetadata& other) const;
62+
};
63+
64+
/// \brief Represents a statistics file in the Puffin format
65+
struct ICEBERG_EXPORT StatisticsFile : public util::Formattable {
66+
/// ID of the Iceberg table's snapshot the statistics file is associated with
67+
int64_t snapshot_id;
68+
/// Fully qualified path to the file
69+
std::string path;
70+
/// The size of the file in bytes
71+
int64_t file_size_in_bytes;
72+
/// The size of the file footer in bytes
73+
int64_t file_footer_size_in_bytes;
74+
/// List of statistics contained in the file
75+
std::vector<BlobMetadata> blob_metadata;
76+
77+
/// \brief Compare two StatisticsFiles for equality.
78+
friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) {
79+
return lhs.Equals(rhs);
80+
}
81+
82+
/// \brief Compare two StatisticsFiles for inequality.
83+
friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) {
84+
return !(lhs == rhs);
85+
}
86+
87+
std::string ToString() const override;
88+
89+
private:
90+
bool Equals(const StatisticsFile& other) const;
91+
};
92+
93+
/// \brief Represents a partition statistics file
94+
struct ICEBERG_EXPORT PartitionStatisticsFile {
95+
/// Snapshot ID of the Iceberg table's snapshot the partition statistics file is
96+
/// associated with
97+
int64_t snapshot_id;
98+
/// Fully qualified path to the file
99+
std::string path;
100+
/// The size of the partition statistics file in bytes
101+
int64_t file_size_in_bytes;
102+
};
103+
104+
} // namespace iceberg

src/iceberg/table_metadata.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/table_metadata.h"
21+
22+
#include <format>
23+
#include <string>
24+
25+
#include "iceberg/statistics_file.h"
26+
27+
namespace iceberg {
28+
29+
/// \brief Render this entry as
/// `SnapshotLogEntry[timestampMillis=<ms>,snapshotId=<id>]`.
std::string SnapshotLogEntry::ToString() const {
  std::string out = "SnapshotLogEntry[timestampMillis=";
  out += std::to_string(timestamp_ms);
  out += ",snapshotId=";
  out += std::to_string(snapshot_id);
  out += "]";
  return out;
}
33+
34+
/// \brief Render this entry as
/// `MetadataLogEntry[timestampMillis=<ms>,file=<location>]`.
std::string MetadataLogEntry::ToString() const {
  std::string out = "MetadataLogEntry[timestampMillis=";
  out += std::to_string(timestamp_ms);
  out += ",file=";
  out += file;
  out += "]";
  return out;
}
37+
38+
} // namespace iceberg

src/iceberg/table_metadata.h

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/table_metadata.h
23+
/// Table metadata for Iceberg tables.
24+
25+
#include <map>
26+
#include <string>
27+
#include <vector>
28+
29+
#include "iceberg/iceberg_export.h"
30+
#include "iceberg/type_fwd.h"
31+
#include "iceberg/util/formattable.h"
32+
33+
namespace iceberg {
34+
35+
/// \brief Represents a snapshot log entry
36+
struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable {
37+
/// The timestamp in milliseconds of the change
38+
int64_t timestamp_ms;
39+
/// ID of the snapshot
40+
int64_t snapshot_id;
41+
42+
std::string ToString() const override;
43+
};
44+
45+
/// \brief Represents a metadata log entry
46+
struct ICEBERG_EXPORT MetadataLogEntry : public util::Formattable {
47+
/// The timestamp in milliseconds of the change
48+
int64_t timestamp_ms;
49+
/// Metadata file location
50+
std::string file;
51+
52+
std::string ToString() const override;
53+
};
54+
55+
/// \brief Represents the metadata for an Iceberg table
56+
///
57+
/// Note that it only contains table metadata from the spec. Compared to the Java
58+
/// implementation, missing pieces including: 1) Map<Integer,
59+
/// Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> 3) Map<Long, Snapshot> 4)
60+
/// Map<String, SnapshotRef>
61+
///
62+
/// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are
63+
/// implemented.
64+
struct ICEBERG_EXPORT TableMetadata {
65+
/// An integer version number for the format
66+
int8_t format_version;
67+
/// A UUID that identifies the table
68+
std::string table_uuid;
69+
/// The table's base location
70+
std::string location;
71+
/// The table's highest assigned sequence number
72+
int64_t last_sequence_number;
73+
/// Timestamp in milliseconds from the unix epoch when the table was last updated.
74+
int64_t last_updated_ms;
75+
/// The highest assigned column ID for the table
76+
int32_t last_column_id;
77+
/// A list of schemas
78+
std::vector<std::shared_ptr<Schema>> schemas;
79+
/// ID of the table's current schema
80+
int32_t current_schema_id;
81+
/// A list of partition specs
82+
std::vector<std::shared_ptr<PartitionSpec>> partition_specs;
83+
/// ID of the current partition spec that writers should use by default
84+
int32_t default_spec_id;
85+
/// The highest assigned partition field ID across all partition specs for the table
86+
int32_t last_partition_id;
87+
/// A string to string map of table properties
88+
std::map<std::string, std::string> properties;
89+
/// ID of the current table snapshot
90+
int64_t current_snapshot_id;
91+
/// A list of valid snapshots
92+
std::vector<std::shared_ptr<Snapshot>> snapshots;
93+
/// A list of timestamp and snapshot ID pairs that encodes changes to the current
94+
/// snapshot for the table
95+
std::vector<SnapshotLogEntry> snapshot_log;
96+
/// A list of timestamp and metadata file location pairs that encodes changes to the
97+
/// previous metadata files for the table
98+
std::vector<MetadataLogEntry> metadata_log;
99+
/// A list of sort orders
100+
std::vector<std::shared_ptr<SortOrder>> sort_orders;
101+
/// Default sort order id of the table
102+
int32_t default_sort_order_id;
103+
/// A map of snapshot references
104+
std::map<std::string, std::string> refs;
105+
/// A list of table statistics
106+
std::vector<std::shared_ptr<struct StatisticsFile>> statistics;
107+
/// A list of partition statistics
108+
std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics;
109+
/// whether or not to track the creation and updates to rows in the table
110+
bool row_lineage_enabled = false;
111+
/// A `long` higher than all assigned row IDs
112+
int64_t next_row_id;
113+
};
114+
115+
} // namespace iceberg

0 commit comments

Comments
 (0)