-
Notifications
You must be signed in to change notification settings - Fork 70
feat: add table metadata definition #62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #include "iceberg/statistics_file.h" | ||
|
|
||
| #include <format> | ||
|
|
||
| namespace iceberg { | ||
|
|
||
| // Member-wise equality: checks each field in declaration order and stops at | ||
| // the first one that differs. | ||
| bool BlobMetadata::Equals(const BlobMetadata& other) const { | ||
| if (type != other.type) { | ||
| return false; | ||
| } | ||
| if (source_snapshot_id != other.source_snapshot_id) { | ||
| return false; | ||
| } | ||
| if (source_snapshot_sequence_number != other.source_snapshot_sequence_number) { | ||
| return false; | ||
| } | ||
| if (fields != other.fields) { | ||
| return false; | ||
| } | ||
| return properties == other.properties; | ||
| } | ||
|
|
||
| // Renders the blob metadata on a single line, shaped like | ||
| // BlobMetadata[type='T',sourceSnapshotId=N,sourceSnapshotSequenceNumber=N,fields=[a,b],properties=[k:v,k:v]] | ||
| // Fields keep their vector order; properties follow the map's iteration order. | ||
| std::string BlobMetadata::ToString() const { | ||
| std::string text = "BlobMetadata["; | ||
| // One appending iterator is reused for every formatted fragment. | ||
| auto sink = std::back_inserter(text); | ||
| std::format_to(sink, | ||
| "type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},", type, | ||
| source_snapshot_id, source_snapshot_sequence_number); | ||
| text += "fields=["; | ||
| bool first_field = true; | ||
| for (const auto field_id : fields) { | ||
| if (first_field) { | ||
| std::format_to(sink, "{}", field_id); | ||
| first_field = false; | ||
| } else { | ||
| // Every element after the first is preceded by a comma. | ||
| std::format_to(sink, ",{}", field_id); | ||
| } | ||
| } | ||
| text += "],properties=["; | ||
| bool first_property = true; | ||
| for (const auto& [key, value] : properties) { | ||
| if (first_property) { | ||
| std::format_to(sink, "{}:{}", key, value); | ||
| first_property = false; | ||
| } else { | ||
| std::format_to(sink, ",{}:{}", key, value); | ||
| } | ||
| } | ||
| text += "]]"; | ||
| return text; | ||
| } | ||
|
|
||
| // Member-wise equality: compares the scalar fields and path first, then the | ||
| // blob metadata list, returning as soon as a mismatch is found. | ||
| bool StatisticsFile::Equals(const StatisticsFile& other) const { | ||
| if (snapshot_id != other.snapshot_id) { | ||
| return false; | ||
| } | ||
| if (path != other.path) { | ||
| return false; | ||
| } | ||
| if (file_size_in_bytes != other.file_size_in_bytes) { | ||
| return false; | ||
| } | ||
| if (file_footer_size_in_bytes != other.file_footer_size_in_bytes) { | ||
| return false; | ||
| } | ||
| return blob_metadata == other.blob_metadata; | ||
| } | ||
|
|
||
| // Renders the statistics file on a single line, shaped like | ||
| // StatisticsFile[snapshotId=N,path=P,fileSizeInBytes=N,fileFooterSizeInBytes=N,blobMetadata=[B,B]] | ||
| // where each B is the ToString() of one blob_metadata element, in vector order. | ||
| std::string StatisticsFile::ToString() const { | ||
| std::string text = "StatisticsFile["; | ||
| // One appending iterator is reused for every formatted fragment. | ||
| auto sink = std::back_inserter(text); | ||
| std::format_to(sink, | ||
| "snapshotId={},path={},fileSizeInBytes={},fileFooterSizeInBytes={},", | ||
| snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes); | ||
| text += "blobMetadata=["; | ||
| bool first_blob = true; | ||
| for (const auto& blob : blob_metadata) { | ||
| if (first_blob) { | ||
| std::format_to(sink, "{}", blob.ToString()); | ||
| first_blob = false; | ||
| } else { | ||
| // Every element after the first is preceded by a comma. | ||
| std::format_to(sink, ",{}", blob.ToString()); | ||
| } | ||
| } | ||
| text += "]]"; | ||
| return text; | ||
| } | ||
|
|
||
| } // namespace iceberg |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /// \file iceberg/statistics_file.h | ||
| /// Statistics file for Iceberg tables. | ||
|
|
||
| #include <cstdint> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "iceberg/iceberg_export.h" | ||
| #include "iceberg/util/formattable.h" | ||
|
|
||
| namespace iceberg { | ||
|
|
||
| /// \brief Metadata describing a single statistics or index blob. | ||
| /// | ||
| /// Integral members carry default member initializers so that a | ||
| /// default-constructed BlobMetadata never holds indeterminate values that | ||
| /// operator== or ToString() would then read. | ||
| struct ICEBERG_EXPORT BlobMetadata : public util::Formattable { | ||
| /// Type of the blob | ||
| std::string type; | ||
| /// ID of the Iceberg table's snapshot the blob was computed from | ||
| int64_t source_snapshot_id = 0; | ||
| /// Sequence number of the Iceberg table's snapshot the blob was computed from | ||
| int64_t source_snapshot_sequence_number = 0; | ||
| /// Ordered list of fields the blob was calculated from | ||
| std::vector<int32_t> fields; | ||
| /// Additional properties of the blob, specific to the blob type | ||
| std::unordered_map<std::string, std::string> properties; | ||
|
|
||
| /// \brief Compare two BlobMetadatas for equality (delegates to Equals). | ||
| friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) { | ||
| return lhs.Equals(rhs); | ||
| } | ||
|
|
||
| /// \brief Compare two BlobMetadatas for inequality (negation of operator==). | ||
| friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) { | ||
| return !(lhs == rhs); | ||
| } | ||
|
|
||
| /// \brief Single-line textual form; overrides the virtual inherited from | ||
| /// util::Formattable. | ||
| std::string ToString() const override; | ||
|
|
||
| private: | ||
| /// \brief Member-wise comparison backing operator==. | ||
| bool Equals(const BlobMetadata& other) const; | ||
| }; | ||
|
|
||
| /// \brief Represents a statistics file in the Puffin format | ||
| /// | ||
| /// Integral members carry default member initializers so that a | ||
| /// default-constructed StatisticsFile never holds indeterminate values that | ||
| /// operator== or ToString() would then read. | ||
| struct ICEBERG_EXPORT StatisticsFile : public util::Formattable { | ||
| /// ID of the Iceberg table's snapshot the statistics file is associated with | ||
| int64_t snapshot_id = 0; | ||
| /// Fully qualified path to the file | ||
| std::string path; | ||
| /// The size of the file in bytes | ||
| int64_t file_size_in_bytes = 0; | ||
| /// The size of the file footer in bytes | ||
| int64_t file_footer_size_in_bytes = 0; | ||
| /// List of statistics contained in the file | ||
| std::vector<BlobMetadata> blob_metadata; | ||
|
|
||
| /// \brief Compare two StatisticsFiles for equality (delegates to Equals). | ||
| friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) { | ||
| return lhs.Equals(rhs); | ||
| } | ||
|
|
||
| /// \brief Compare two StatisticsFiles for inequality (negation of operator==). | ||
| friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) { | ||
| return !(lhs == rhs); | ||
| } | ||
|
|
||
| /// \brief Single-line textual form; overrides the virtual inherited from | ||
| /// util::Formattable. | ||
| std::string ToString() const override; | ||
|
|
||
| private: | ||
| /// \brief Member-wise comparison backing operator==. | ||
| bool Equals(const StatisticsFile& other) const; | ||
| }; | ||
|
|
||
| /// \brief Represents a partition statistics file | ||
| /// | ||
| /// Integral members carry default member initializers so that a | ||
| /// default-initialized PartitionStatisticsFile never holds indeterminate values. | ||
| struct ICEBERG_EXPORT PartitionStatisticsFile { | ||
| /// Snapshot ID of the Iceberg table's snapshot the partition statistics file is | ||
| /// associated with | ||
| int64_t snapshot_id = 0; | ||
| /// Fully qualified path to the file | ||
| std::string path; | ||
| /// The size of the partition statistics file in bytes | ||
| int64_t file_size_in_bytes = 0; | ||
| }; | ||
|
|
||
| } // namespace iceberg |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #include "iceberg/table_metadata.h" | ||
|
|
||
| #include <format> | ||
| #include <string> | ||
|
|
||
| #include "iceberg/statistics_file.h" | ||
|
|
||
| namespace iceberg { | ||
|
|
||
| // Renders the entry as SnapshotLogEntry[timestampMillis=…,snapshotId=…]. | ||
| // NOTE(review): timestamp_ms is passed to std::format as-is; if it is a chrono | ||
| // time_point its rendering is chosen by the chrono formatter rather than being a | ||
| // raw millisecond count — confirm this matches the "timestampMillis" label. | ||
| std::string SnapshotLogEntry::ToString() const { | ||
| std::string rendered = std::format( | ||
| "SnapshotLogEntry[timestampMillis={},snapshotId={}]", timestamp_ms, snapshot_id); | ||
| return rendered; | ||
| } | ||
|
|
||
| // Renders the entry as MetadataLogEntry[timestampMillis=…,file=…]. | ||
| // NOTE(review): timestamp_ms is passed to std::format as-is; if it is a chrono | ||
| // time_point its rendering is chosen by the chrono formatter rather than being a | ||
| // raw millisecond count — confirm this matches the "timestampMillis" label. | ||
| std::string MetadataLogEntry::ToString() const { | ||
| std::string rendered = | ||
| std::format("MetadataLogEntry[timestampMillis={},file={}]", timestamp_ms, file); | ||
| return rendered; | ||
| } | ||
|
|
||
| } // namespace iceberg | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /// \file iceberg/table_metadata.h | ||
| /// Table metadata for Iceberg tables. | ||
|
|
||
| #include <chrono> | ||
| #include <memory> | ||
| #include <string> | ||
| #include <unordered_map> | ||
| #include <vector> | ||
|
|
||
| #include "iceberg/iceberg_export.h" | ||
| #include "iceberg/type_fwd.h" | ||
| #include "iceberg/util/formattable.h" | ||
|
|
||
| namespace iceberg { | ||
|
|
||
| /// \brief A point in time on std::chrono::system_clock with millisecond resolution. | ||
| using TimePointMs = std::chrono::sys_time<std::chrono::milliseconds>; | ||
|
|
||
| /// \brief Represents a snapshot log entry: a timestamp paired with a snapshot ID | ||
| /// | ||
| /// snapshot_id carries a default member initializer so that a default-constructed | ||
| /// entry never holds an indeterminate value that ToString() would then read. | ||
| struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable { | ||
| /// The timestamp in milliseconds of the change | ||
wgtmac marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| TimePointMs timestamp_ms; | ||
| /// ID of the snapshot | ||
| int64_t snapshot_id = 0; | ||
|
|
||
| /// \brief Single-line textual form; overrides the virtual inherited from | ||
| /// util::Formattable. | ||
| std::string ToString() const override; | ||
| }; | ||
|
|
||
| /// \brief One entry of the metadata log: a timestamp paired with a metadata file | ||
| /// location | ||
| struct ICEBERG_EXPORT MetadataLogEntry : util::Formattable { | ||
| /// When the change happened (millisecond resolution) | ||
| TimePointMs timestamp_ms; | ||
| /// Location of the metadata file | ||
| std::string file; | ||
|
|
||
| /// \brief Single-line textual form; overrides the virtual inherited from | ||
| /// util::Formattable. | ||
| std::string ToString() const override; | ||
| }; | ||
|
|
||
| /// \brief Represents the metadata for an Iceberg table | ||
| /// | ||
| /// Note that it only contains table metadata from the spec. Compared to the Java | ||
| /// implementation, the following pieces are missing: | ||
| /// 1) Map<Integer, Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> | ||
| /// 3) Map<Long, Snapshot> 4) Map<String, SnapshotRef> | ||
| /// | ||
| /// All integral members carry default member initializers so that a | ||
| /// default-initialized TableMetadata never holds indeterminate values. | ||
| /// NOTE(review): 0 is only a deterministic placeholder, not a spec-defined "unset" | ||
| /// marker — revisit together with the pending decision on optional fields. | ||
| /// NOTE(review): the fixed-width integer types used here come from <cstdint>, which | ||
| /// this header does not include directly — presumably it arrives transitively; confirm. | ||
| /// | ||
| /// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are | ||
| /// implemented. | ||
| struct ICEBERG_EXPORT TableMetadata { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you need to mark some fields as optional?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, I think we need but I'm still unclear which one should change. I will postpone this decision until we implement the json serialization from/to different table format versions. |
||
| /// An integer version number for the format | ||
| int8_t format_version = 0; | ||
| /// A UUID that identifies the table | ||
| std::string table_uuid; | ||
| /// The table's base location | ||
| std::string location; | ||
| /// The table's highest assigned sequence number | ||
| int64_t last_sequence_number = 0; | ||
| /// Timestamp in milliseconds from the unix epoch when the table was last updated. | ||
| int64_t last_updated_ms = 0; | ||
| /// The highest assigned column ID for the table | ||
| int32_t last_column_id = 0; | ||
| /// A list of schemas | ||
| std::vector<std::shared_ptr<Schema>> schemas; | ||
| /// ID of the table's current schema | ||
| int32_t current_schema_id = 0; | ||
| /// A list of partition specs | ||
| std::vector<std::shared_ptr<PartitionSpec>> partition_specs; | ||
| /// ID of the current partition spec that writers should use by default | ||
| int32_t default_spec_id = 0; | ||
| /// The highest assigned partition field ID across all partition specs for the table | ||
| int32_t last_partition_id = 0; | ||
| /// A string to string map of table properties | ||
| std::unordered_map<std::string, std::string> properties; | ||
| /// ID of the current table snapshot | ||
| int64_t current_snapshot_id = 0; | ||
| /// A list of valid snapshots | ||
| std::vector<std::shared_ptr<Snapshot>> snapshots; | ||
| /// A list of timestamp and snapshot ID pairs that encodes changes to the current | ||
| /// snapshot for the table | ||
| std::vector<SnapshotLogEntry> snapshot_log; | ||
| /// A list of timestamp and metadata file location pairs that encodes changes to the | ||
| /// previous metadata files for the table | ||
| std::vector<MetadataLogEntry> metadata_log; | ||
| /// A list of sort orders | ||
| std::vector<std::shared_ptr<SortOrder>> sort_orders; | ||
| /// Default sort order id of the table | ||
| int32_t default_sort_order_id = 0; | ||
| /// A map of snapshot references | ||
| std::unordered_map<std::string, std::string> refs; | ||
| /// A list of table statistics | ||
| std::vector<std::shared_ptr<struct StatisticsFile>> statistics; | ||
| /// A list of partition statistics | ||
| std::vector<std::shared_ptr<struct PartitionStatisticsFile>> partition_statistics; | ||
| /// A `long` higher than all assigned row IDs | ||
| int64_t next_row_id = 0; | ||
| }; | ||
|
|
||
| } // namespace iceberg | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason why this include is here and not in
`table_metadata.h`, where we seem to require the following?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just want to use forward declarations as much as possible. The implementation is not yet complete because the concrete implementations of
`Snapshot` and other classes are still missing, so it looks weird that `iceberg/statistics_file.h` is included but not used at the moment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but as those would still be part of the table metadata API wouldn't it be better to be part of the header file? I am learning C++, I just thought it would still be better there even when we add the implementation part but I might be wrong :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a common practice to use forward declaration to speed up compilation if the implementation detail is not required in the header file, though I suspect that modern compilers are smart enough to optimize this.