From f8ea5011abbaeb58df92b5ee7e3aa22030948aca Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 8 Apr 2025 15:55:59 +0800 Subject: [PATCH 1/2] feat: add table metadata definition --- src/iceberg/CMakeLists.txt | 2 + src/iceberg/statistics_file.cc | 82 ++++++++++++++++++++++ src/iceberg/statistics_file.h | 104 ++++++++++++++++++++++++++++ src/iceberg/table_metadata.cc | 38 +++++++++++ src/iceberg/table_metadata.h | 120 +++++++++++++++++++++++++++++++++ 5 files changed, 346 insertions(+) create mode 100644 src/iceberg/statistics_file.cc create mode 100644 src/iceberg/statistics_file.h create mode 100644 src/iceberg/table_metadata.cc create mode 100644 src/iceberg/table_metadata.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index fec895240..4ab6b558e 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -25,6 +25,8 @@ set(ICEBERG_SOURCES schema_internal.cc partition_field.cc partition_spec.cc + statistics_file.cc + table_metadata.cc transform.cc type.cc) diff --git a/src/iceberg/statistics_file.cc b/src/iceberg/statistics_file.cc new file mode 100644 index 000000000..07f4f531d --- /dev/null +++ b/src/iceberg/statistics_file.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/statistics_file.h" + +#include + +namespace iceberg { + +bool BlobMetadata::Equals(const BlobMetadata& other) const { + return type == other.type && source_snapshot_id == other.source_snapshot_id && + source_snapshot_sequence_number == other.source_snapshot_sequence_number && + fields == other.fields && properties == other.properties; +} + +std::string BlobMetadata::ToString() const { + std::string repr = "BlobMetadata["; + std::format_to(std::back_inserter(repr), + "type='{}',sourceSnapshotId={},sourceSnapshotSequenceNumber={},", type, + source_snapshot_id, source_snapshot_sequence_number); + std::format_to(std::back_inserter(repr), "fields=["); + for (auto iter = fields.cbegin(); iter != fields.cend(); ++iter) { + if (iter != fields.cbegin()) { + std::format_to(std::back_inserter(repr), ",{}", *iter); + } else { + std::format_to(std::back_inserter(repr), "{}", *iter); + } + } + std::format_to(std::back_inserter(repr), "],properties=["); + for (auto iter = properties.cbegin(); iter != properties.cend(); ++iter) { + const auto& [key, value] = *iter; + if (iter != properties.cbegin()) { + std::format_to(std::back_inserter(repr), ",{}:{}", key, value); + } else { + std::format_to(std::back_inserter(repr), "{}:{}", key, value); + } + } + repr += "]]"; + return repr; +} + +bool StatisticsFile::Equals(const StatisticsFile& other) const { + return snapshot_id == other.snapshot_id && path == other.path && + file_size_in_bytes == other.file_size_in_bytes && + file_footer_size_in_bytes == other.file_footer_size_in_bytes && + blob_metadata == other.blob_metadata; +} + +std::string StatisticsFile::ToString() const { + std::string repr = "StatisticsFile["; + std::format_to(std::back_inserter(repr), + "snapshotId={},path={},fileSizeInBytes={},fileFooterSizeInBytes={},", + snapshot_id, path, file_size_in_bytes, file_footer_size_in_bytes); + 
std::format_to(std::back_inserter(repr), "blobMetadata=["); + for (auto iter = blob_metadata.cbegin(); iter != blob_metadata.cend(); ++iter) { + if (iter != blob_metadata.cbegin()) { + std::format_to(std::back_inserter(repr), ",{}", iter->ToString()); + } else { + std::format_to(std::back_inserter(repr), "{}", iter->ToString()); + } + } + repr += "]]"; + return repr; +} + +} // namespace iceberg diff --git a/src/iceberg/statistics_file.h b/src/iceberg/statistics_file.h new file mode 100644 index 000000000..0de9587cd --- /dev/null +++ b/src/iceberg/statistics_file.h @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/statistics_file.h +/// Statistics file for Iceberg tables. 
+ +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/util/formattable.h" + +namespace iceberg { + +/// \brief Metadata about a statistics or indices blob +struct ICEBERG_EXPORT BlobMetadata : public util::Formattable { + /// Type of the blob + std::string type; + /// ID of the Iceberg table's snapshot the blob was computed from + int64_t source_snapshot_id; + /// Sequence number of the Iceberg table's snapshot the blob was computed from + int64_t source_snapshot_sequence_number; + /// Ordered list of fields the blob was calculated from + std::vector fields; + /// Additional properties of the blob, specific to the blob type + std::unordered_map properties; + + /// \brief Compare two BlobMetadatas for equality. + friend bool operator==(const BlobMetadata& lhs, const BlobMetadata& rhs) { + return lhs.Equals(rhs); + } + + /// \brief Compare two BlobMetadatas for inequality. + friend bool operator!=(const BlobMetadata& lhs, const BlobMetadata& rhs) { + return !(lhs == rhs); + } + + std::string ToString() const override; + + private: + bool Equals(const BlobMetadata& other) const; +}; + +/// \brief Represents a statistics file in the Puffin format +struct ICEBERG_EXPORT StatisticsFile : public util::Formattable { + /// ID of the Iceberg table's snapshot the statistics file is associated with + int64_t snapshot_id; + /// Fully qualified path to the file + std::string path; + /// The size of the file in bytes + int64_t file_size_in_bytes; + /// The size of the file footer in bytes + int64_t file_footer_size_in_bytes; + /// List of statistics contained in the file + std::vector blob_metadata; + + /// \brief Compare two StatisticsFiles for equality. + friend bool operator==(const StatisticsFile& lhs, const StatisticsFile& rhs) { + return lhs.Equals(rhs); + } + + /// \brief Compare two StatisticsFiles for inequality. 
+ friend bool operator!=(const StatisticsFile& lhs, const StatisticsFile& rhs) { + return !(lhs == rhs); + } + + std::string ToString() const override; + + private: + bool Equals(const StatisticsFile& other) const; +}; + +/// \brief Represents a partition statistics file +struct ICEBERG_EXPORT PartitionStatisticsFile { + /// Snapshot ID of the Iceberg table's snapshot the partition statistics file is + /// associated with + int64_t snapshot_id; + /// Fully qualified path to the file + std::string path; + /// The size of the partition statistics file in bytes + int64_t file_size_in_bytes; +}; + +} // namespace iceberg diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc new file mode 100644 index 000000000..c30ddecd6 --- /dev/null +++ b/src/iceberg/table_metadata.cc @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/table_metadata.h" + +#include +#include + +#include "iceberg/statistics_file.h" + +namespace iceberg { + +std::string SnapshotLogEntry::ToString() const { + return std::format("SnapshotLogEntry[timestampMillis={},snapshotId={}]", timestamp_ms, + snapshot_id); +} + +std::string MetadataLogEntry::ToString() const { + return std::format("MetadataLogEntry[timestampMillis={},file={}]", timestamp_ms, file); +} + +} // namespace iceberg diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h new file mode 100644 index 000000000..5ec1e19a0 --- /dev/null +++ b/src/iceberg/table_metadata.h @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/table_metadata.h +/// Table metadata for Iceberg tables. 
+ +#include +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/type_fwd.h" +#include "iceberg/util/formattable.h" + +namespace iceberg { + +using TimePointMs = + std::chrono::time_point; + +/// \brief Represents a snapshot log entry +struct ICEBERG_EXPORT SnapshotLogEntry : public util::Formattable { + /// The timestamp in milliseconds of the change + TimePointMs timestamp_ms; + /// ID of the snapshot + int64_t snapshot_id; + + std::string ToString() const override; +}; + +/// \brief Represents a metadata log entry +struct ICEBERG_EXPORT MetadataLogEntry : public util::Formattable { + /// The timestamp in milliseconds of the change + TimePointMs timestamp_ms; + /// Metadata file location + std::string file; + + std::string ToString() const override; +}; + +/// \brief Represents the metadata for an Iceberg table +/// +/// Note that it only contains table metadata from the spec. Compared to the Java +/// implementation, the missing pieces include: 1) Map<Integer, +/// Schema|PartitionSpec|SortOrder> 2) List<MetadataUpdate> 3) Map<Long, Snapshot> 4) +/// Map<String, SnapshotRef> +/// +/// TODO(wgtmac): Implement Equals and ToString once SortOrder and Snapshot are +/// implemented. +struct ICEBERG_EXPORT TableMetadata { + /// An integer version number for the format + int8_t format_version; + /// A UUID that identifies the table + std::string table_uuid; + /// The table's base location + std::string location; + /// The table's highest assigned sequence number + int64_t last_sequence_number; + /// Timestamp in milliseconds from the unix epoch when the table was last updated. 
+ int64_t last_updated_ms; + /// The highest assigned column ID for the table + int32_t last_column_id; + /// A list of schemas + std::vector> schemas; + /// ID of the table's current schema + int32_t current_schema_id; + /// A list of partition specs + std::vector> partition_specs; + /// ID of the current partition spec that writers should use by default + int32_t default_spec_id; + /// The highest assigned partition field ID across all partition specs for the table + int32_t last_partition_id; + /// A string to string map of table properties + std::unordered_map properties; + /// ID of the current table snapshot + int64_t current_snapshot_id; + /// A list of valid snapshots + std::vector> snapshots; + /// A list of timestamp and snapshot ID pairs that encodes changes to the current + /// snapshot for the table + std::vector snapshot_log; + /// A list of timestamp and metadata file location pairs that encodes changes to the + /// previous metadata files for the table + std::vector metadata_log; + /// A list of sort orders + std::vector> sort_orders; + /// Default sort order id of the table + int32_t default_sort_order_id; + /// A map of snapshot references + std::unordered_map refs; + /// A list of table statistics + std::vector> statistics; + /// A list of partition statistics + std::vector> partition_statistics; + /// whether or not to track the creation and updates to rows in the table + bool row_lineage_enabled = false; + /// A `long` higher than all assigned row IDs + int64_t next_row_id; +}; + +} // namespace iceberg From 4229aca85e74631cc196c7851fc9693bb18577bc Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 8 Apr 2025 22:52:33 +0800 Subject: [PATCH 2/2] Update src/iceberg/table_metadata.h --- src/iceberg/table_metadata.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 5ec1e19a0..e1665dce1 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -111,8 +111,6 @@ 
struct ICEBERG_EXPORT TableMetadata { std::vector> statistics; /// A list of partition statistics std::vector> partition_statistics; - /// whether or not to track the creation and updates to rows in the table - bool row_lineage_enabled = false; /// A `long` higher than all assigned row IDs int64_t next_row_id; };