From a21d270d2599c04752b9cc348f449e746d67c81c Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Mon, 5 May 2025 20:26:34 +0800 Subject: [PATCH 01/11] feat: add manifest related structures Add DataFile, ManifestEntry, ManifestFile, and ManifestList to Iceberg core. Support for parsing these data structures from Avro file will be added in future PRs. --- src/iceberg/CMakeLists.txt | 2 + src/iceberg/file_format.h | 11 ++ src/iceberg/manifest_entry.cc | 161 ++++++++++++++++++++++ src/iceberg/manifest_entry.h | 253 ++++++++++++++++++++++++++++++++++ src/iceberg/manifest_list.cc | 103 ++++++++++++++ src/iceberg/manifest_list.h | 203 +++++++++++++++++++++++++++ src/iceberg/manifest_reader.h | 6 +- src/iceberg/type_fwd.h | 5 + 8 files changed, 742 insertions(+), 2 deletions(-) create mode 100644 src/iceberg/manifest_entry.cc create mode 100644 src/iceberg/manifest_entry.h create mode 100644 src/iceberg/manifest_list.cc create mode 100644 src/iceberg/manifest_list.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 328befd54..85cf84d2d 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -23,6 +23,8 @@ set(ICEBERG_SOURCES expression/expression.cc file_reader.cc json_internal.cc + manifest_entry.cc + manifest_list.cc metadata_columns.cc name_mapping.cc partition_field.cc diff --git a/src/iceberg/file_format.h b/src/iceberg/file_format.h index 883782c7c..eebb76d14 100644 --- a/src/iceberg/file_format.h +++ b/src/iceberg/file_format.h @@ -25,6 +25,7 @@ #include #include "iceberg/iceberg_export.h" +#include "iceberg/result.h" namespace iceberg { @@ -50,4 +51,14 @@ ICEBERG_EXPORT inline std::string_view ToString(FileFormatType format_type) { } } +/// \brief Convert a string to a FileFormatType +ICEBERG_EXPORT constexpr Result FileFormatTypeFromString( + std::string_view str) noexcept { + if (str == "parquet") return FileFormatType::kParquet; + if (str == "avro") return FileFormatType::kAvro; + if (str == "orc") return 
FileFormatType::kOrc; + if (str == "puffin") return FileFormatType::kPuffin; + return InvalidArgument("Invalid file format type: {}", str); +} + } // namespace iceberg diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc new file mode 100644 index 000000000..017252307 --- /dev/null +++ b/src/iceberg/manifest_entry.cc @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "iceberg/manifest_entry.h" + +#include +#include +#include + +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/type.h" + +namespace iceberg { +const SchemaField DataFile::CONTENT = + SchemaField::MakeRequired(134, "content", std::make_shared()); +const SchemaField DataFile::FILE_PATH = + SchemaField::MakeRequired(100, "file_path", std::make_shared()); +const SchemaField DataFile::FILE_FORMAT = + SchemaField::MakeRequired(101, "file_format", std::make_shared()); +const SchemaField DataFile::RECORD_COUNT = + SchemaField::MakeRequired(103, "record_count", std::make_shared()); +const SchemaField DataFile::FILE_SIZE = + SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); +const SchemaField DataFile::COLUMN_SIZES = SchemaField::MakeOptional( + 108, "column_sizes", + std::make_shared( + SchemaField::MakeRequired(117, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(118, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::VALUE_COUNTS = SchemaField::MakeOptional( + 109, "value_counts", + std::make_shared( + SchemaField::MakeRequired(119, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(120, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::NULL_VALUE_COUNTS = SchemaField::MakeOptional( + 110, "null_value_counts", + std::make_shared( + SchemaField::MakeRequired(121, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(122, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::NAN_VALUE_COUNTS = SchemaField::MakeOptional( + 137, "nan_value_counts", + std::make_shared( + SchemaField::MakeRequired(138, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(139, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::LOWER_BOUNDS = 
SchemaField::MakeOptional( + 125, "lower_bounds", + std::make_shared( + SchemaField::MakeRequired(126, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(127, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::UPPER_BOUNDS = SchemaField::MakeOptional( + 128, "upper_bounds", + std::make_shared( + SchemaField::MakeRequired(129, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(130, std::string(MapType::kValueName), + std::make_shared()))); +const SchemaField DataFile::KEY_METADATA = + SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); +const SchemaField DataFile::SPLIT_OFFSETS = SchemaField::MakeOptional( + 132, "split_offsets", + std::make_shared(SchemaField::MakeRequired( + 133, std::string(ListType::kElementName), std::make_shared()))); +const SchemaField DataFile::EQUALITY_IDS = SchemaField::MakeOptional( + 135, "equality_ids", + std::make_shared(SchemaField::MakeRequired( + 136, std::string(ListType::kElementName), std::make_shared()))); +const SchemaField DataFile::SORT_ORDER_ID = + SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); +const SchemaField DataFile::FIRST_ROW_ID = + SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); +const SchemaField DataFile::REFERENCED_DATA_FILE = SchemaField::MakeOptional( + 143, "referenced_data_file", std::make_shared()); +const SchemaField DataFile::CONTENT_OFFSET = + SchemaField::MakeOptional(144, "content_offset", std::make_shared()); +const SchemaField DataFile::CONTENT_SIZE = + SchemaField::MakeOptional(145, "content_size_in_bytes", std::make_shared()); + +StructType DataFile::GetType(StructType partition_type) { + std::vector fields; + + fields.push_back(CONTENT); + fields.push_back(FILE_PATH); + fields.push_back(FILE_FORMAT); + fields.push_back(SchemaField::MakeRequired( + 102, "partition", std::make_shared(partition_type))); + fields.push_back(RECORD_COUNT); + 
fields.push_back(FILE_SIZE); + fields.push_back(COLUMN_SIZES); + fields.push_back(VALUE_COUNTS); + fields.push_back(NULL_VALUE_COUNTS); + fields.push_back(NAN_VALUE_COUNTS); + fields.push_back(LOWER_BOUNDS); + fields.push_back(UPPER_BOUNDS); + fields.push_back(KEY_METADATA); + fields.push_back(SPLIT_OFFSETS); + fields.push_back(EQUALITY_IDS); + fields.push_back(SORT_ORDER_ID); + fields.push_back(FIRST_ROW_ID); + fields.push_back(REFERENCED_DATA_FILE); + fields.push_back(CONTENT_OFFSET); + fields.push_back(CONTENT_SIZE); + + return StructType(std::move(fields)); +} + +const SchemaField ManifestEntry::STATUS = + SchemaField::MakeRequired(0, "status", std::make_shared()); +const SchemaField ManifestEntry::SNAPSHOT_ID = + SchemaField::MakeOptional(1, "snapshot_id", std::make_shared()); +const SchemaField ManifestEntry::SEQUENCE_NUMBER = + SchemaField::MakeOptional(3, "sequence_number", std::make_shared()); +const SchemaField ManifestEntry::FILE_SEQUENCE_NUMBER = + SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); + +Schema ManifestEntry::GetSchema(StructType partition_type) { + return GetSchemaFromDataFileType(DataFile::GetType(partition_type)); +} + +Schema ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) { + std::vector fields; + + fields.push_back(STATUS); + fields.push_back(SNAPSHOT_ID); + fields.push_back(SEQUENCE_NUMBER); + fields.push_back(FILE_SEQUENCE_NUMBER); + + // datafile_type is already the DataFile struct; do not re-wrap it + auto data_file_type_field = SchemaField::MakeRequired( + 2, "data_file", std::make_shared<StructType>(std::move(datafile_type))); + fields.push_back(data_file_type_field); + + return {std::move(fields), /*schema_id=*/std::nullopt}; +} + +} // namespace iceberg diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h new file mode 100644 index 000000000..f8c9a6b72 --- /dev/null +++ b/src/iceberg/manifest_entry.h @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "iceberg/file_format.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +enum class ManifestStatus { + kExisting = 0, + kAdded = 1, + kDeleted = 2, +}; + +/// \brief Get the relative manifest status type from int +ICEBERG_EXPORT constexpr Result ManifestStatusFromInt( + int status) noexcept { + switch (status) { + case 0: + return ManifestStatus::kExisting; + case 1: + return ManifestStatus::kAdded; + case 2: + return ManifestStatus::kDeleted; + default: + return InvalidArgument("Invalid manifest status: {}", status); + } +} + +enum class DataFileContent { + kData = 0, + kPositionDeletes = 1, + kEqualityDeletes = 2, +}; + +/// \brief Get the relative data file content type from int +ICEBERG_EXPORT constexpr Result DataFileContentFromInt( + int content) noexcept { + switch (content) { + case 0: + return DataFileContent::kData; + case 1: + return DataFileContent::kPositionDeletes; + case 2: + return DataFileContent::kEqualityDeletes; + default: + return InvalidArgument("Invalid data file content: {}", content); + } +} + +/// \brief DataFile carries data file path, partition tuple, 
metrics, ... +struct ICEBERG_EXPORT DataFile { + /// Field id: 134 + /// Type of content stored by the data file: data, equality deletes, or position + /// deletes (all v1 files are data files) + DataFileContent content; + /// Field id: 100 + /// Full URI for the file with FS scheme + std::string file_path; + /// Field id: 101 + /// File format type, avro, orc, parquet, or puffin + FileFormatType file_format; + /// Field id: 102 + /// Partition data tuple, schema based on the partition spec output using partition + /// field ids for the struct field ids + /// TODO(zhjwpku): use StructLike to represent partition data tuple + std::map partition; + /// Field id: 103 + /// Number of records in this file, or the cardinality of a deletion vector + int64_t record_count = 0; + /// Field id: 104 + /// Total file size in bytes + int64_t file_size_in_bytes = 0; + /// Field id: 108 + /// Key field id: 117 + /// Value field id: 118 + /// Map from column id to the total size on disk of all regions that store the column. + /// Does not include bytes necessary to read other columns, like footers. Leave null for + /// row-oriented formats (Avro) + std::unordered_map column_sizes; + /// Field id: 109 + /// Key field id: 119 + /// Value field id: 120 + /// Map from column id to number of values in the column (including null and NaN values) + std::unordered_map value_counts; + /// Field id: 110 + /// Key field id: 121 + /// Value field id: 122 + /// Map from column id to number of null values in the column + std::unordered_map null_value_counts; + /// Field id: 137 + /// Key field id: 138 + /// Value field id: 139 + /// Map from column id to number of NaN values in the column + std::unordered_map nan_value_counts; + /// Field id: 125 + /// Key field id: 126 + /// Value field id: 127 + /// Map from column id to lower bound in the column serialized as binary. + /// Each value must be less than or equal to all non-null, non-NaN values in the column + /// for the file. 
+ /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map> lower_bounds; + /// Field id: 128 + /// Key field id: 129 + /// Value field id: 130 + /// Map from column id to upper bound in the column serialized as binary. + /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// column for the file. + /// + /// Reference: + /// - [Binary single-value + /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) + std::unordered_map> upper_bounds; + /// Field id: 131 + /// Implementation-specific key metadata for encryption + std::optional> key_metadata; + /// Field id: 132 + /// Element Field id: 133 + /// Split offsets for the data file. For example, all row group offsets in a Parquet + /// file. Must be sorted ascending. + std::vector split_offsets; + /// Field id: 135 + /// Element Field id: 136 + /// Field ids used to determine row equality in equality delete files. Required when + /// content=2 and should be null otherwise. Fields with ids listed in this column must + /// be present in the delete file. + std::vector equality_ids; + /// Field id: 140 + /// ID representing sort order for this file + /// + /// If sort order ID is missing or unknown, then the order is assumed to be unsorted. + /// Only data files and equality delete files should be written with a non-null order + /// id. Position deletes are required to be sorted by file and position, not a table + /// order, and should set sort order id to null. Readers must ignore sort order id for + /// position delete files. + std::optional sort_order_id; + /// Field id: 142 + /// The _row_id for the first row in the data file. 
+ /// + /// Reference: + /// - [First Row ID + /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance) + std::optional first_row_id; + /// Field id: 143 + /// Fully qualified location (URI with FS scheme) of a data file that all deletes + /// reference. + /// + /// Position delete metadata can use referenced_data_file when all deletes tracked by + /// the entry are in a single data file. Setting the referenced file is required for + /// deletion vectors. + std::optional referenced_data_file; + /// Field id: 144 + /// The offset in the file where the content starts. + /// + /// The content_offset and content_size_in_bytes fields are used to reference a specific + /// blob for direct access to a deletion vector. For deletion vectors, these values are + /// required and must exactly match the offset and length stored in the Puffin footer + /// for the deletion vector blob. + std::optional content_offset; + /// Field id: 145 + /// The length of a referenced content stored in the file; required if content_offset is + /// present + std::optional content_size_in_bytes; + + static const SchemaField CONTENT; + static const SchemaField FILE_PATH; + static const SchemaField FILE_FORMAT; + static const SchemaField RECORD_COUNT; + static const SchemaField FILE_SIZE; + static const SchemaField COLUMN_SIZES; + static const SchemaField VALUE_COUNTS; + static const SchemaField NULL_VALUE_COUNTS; + static const SchemaField NAN_VALUE_COUNTS; + static const SchemaField LOWER_BOUNDS; + static const SchemaField UPPER_BOUNDS; + static const SchemaField KEY_METADATA; + static const SchemaField SPLIT_OFFSETS; + static const SchemaField EQUALITY_IDS; + static const SchemaField SORT_ORDER_ID; + static const SchemaField FIRST_ROW_ID; + static const SchemaField REFERENCED_DATA_FILE; + static const SchemaField CONTENT_OFFSET; + static const SchemaField CONTENT_SIZE; + + static StructType GetType(StructType partition_type); +}; + +/// \brief A 
manifest is an immutable Avro file that lists data files or delete files, +/// along with each file's partition data tuple, metrics, and tracking information. + +/// \brief The schema of a manifest file +struct ICEBERG_EXPORT ManifestEntry { + /// Field id: 0 + /// Used to track additions and deletions. Deletes are informational only and not used + /// in scans. + ManifestStatus status; + /// Field id: 1 + /// Snapshot id where the file was added, or deleted if status is 2. Inherited when + /// null. + std::optional snapshot_id; + /// Field id: 3 + /// Data sequence number of the file. Inherited when null and status is 1 (added). + std::optional sequence_number; + /// Field id: 4 + /// File sequence number indicating when the file was added. Inherited when null and + /// status is 1 (added). + std::optional file_sequence_number; + /// Field id: 2 + /// File path, partition tuple, metrics, ... + DataFile data_file; + + static const SchemaField STATUS; + static const SchemaField SNAPSHOT_ID; + static const SchemaField SEQUENCE_NUMBER; + static const SchemaField FILE_SEQUENCE_NUMBER; + + static Schema GetSchema(StructType partition_type); + static Schema GetSchemaFromDataFileType(StructType datafile_type); +}; + +} // namespace iceberg diff --git a/src/iceberg/manifest_list.cc b/src/iceberg/manifest_list.cc new file mode 100644 index 000000000..f82d5c3a8 --- /dev/null +++ b/src/iceberg/manifest_list.cc @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_list.h" + +#include "iceberg/schema.h" +#include "iceberg/type.h" + +namespace iceberg { + +const SchemaField FieldSummary::CONTAINS_NULL = + SchemaField::MakeRequired(509, "contains_null", std::make_shared()); +const SchemaField FieldSummary::CONTAINS_NAN = + SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); +const SchemaField FieldSummary::LOWER_BOUND = + SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); +const SchemaField FieldSummary::UPPER_BOUND = + SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); + +StructType FieldSummary::GetType() { + return StructType({ + CONTAINS_NULL, + CONTAINS_NAN, + LOWER_BOUND, + UPPER_BOUND, + }); +} + +const SchemaField ManifestFile::MANIFEST_PATH = + SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); +const SchemaField ManifestFile::MANIFEST_LENGTH = + SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); +const SchemaField ManifestFile::PARTITION_SPEC_ID = + SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); +const SchemaField ManifestFile::CONTENT = + SchemaField::MakeOptional(517, "content", std::make_shared()); +const SchemaField ManifestFile::SEQUENCE_NUMBER = + SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); +const SchemaField ManifestFile::MIN_SEQUENCE_NUMBER = + SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); +const SchemaField ManifestFile::ADDED_SNAPSHOT_ID = + SchemaField::MakeRequired(503, "added_snapshot_id", 
std::make_shared()); +const SchemaField ManifestFile::ADDED_FILES_COUNT = + SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); +const SchemaField ManifestFile::EXISTING_FILES_COUNT = + SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); +const SchemaField ManifestFile::DELETED_FILES_COUNT = + SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); +const SchemaField ManifestFile::ADDED_ROWS_COUNT = + SchemaField::MakeOptional(512, "added_rows_count", std::make_shared()); +const SchemaField ManifestFile::EXISTING_ROWS_COUNT = + SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); +const SchemaField ManifestFile::DELETED_ROWS_COUNT = + SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); +const SchemaField ManifestFile::PARTITIONS = SchemaField::MakeOptional( + 507, "partitions", + std::make_shared(SchemaField::MakeRequired( + 508, std::string(ListType::kElementName), + std::make_shared(FieldSummary::GetType())))); +const SchemaField ManifestFile::KEY_METADATA = + SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); +const SchemaField ManifestFile::FIRST_ROW_ID = + SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); + +Schema ManifestFile::schema() { + std::vector fields; + fields.push_back(MANIFEST_PATH); + fields.push_back(MANIFEST_LENGTH); + fields.push_back(PARTITION_SPEC_ID); + fields.push_back(CONTENT); + fields.push_back(SEQUENCE_NUMBER); + fields.push_back(MIN_SEQUENCE_NUMBER); + fields.push_back(ADDED_SNAPSHOT_ID); + fields.push_back(ADDED_FILES_COUNT); + fields.push_back(EXISTING_FILES_COUNT); + fields.push_back(DELETED_FILES_COUNT); + fields.push_back(ADDED_ROWS_COUNT); + fields.push_back(EXISTING_ROWS_COUNT); + fields.push_back(DELETED_ROWS_COUNT); + fields.push_back(PARTITIONS); + fields.push_back(KEY_METADATA); + fields.push_back(FIRST_ROW_ID); + + return {std::move(fields), /*schema_id=*/std::nullopt}; +} + +} // 
namespace iceberg diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h new file mode 100644 index 000000000..cec6025c7 --- /dev/null +++ b/src/iceberg/manifest_list.h @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +/// \file iceberg/manifest_list.h + +#include +#include +#include +#include + +#include "iceberg/iceberg_export.h" +#include "iceberg/result.h" +#include "iceberg/schema_field.h" +#include "iceberg/type_fwd.h" + +namespace iceberg { + +/// \brief The type of files tracked by the manifest, either data or delete files; 0 for +/// all v1 manifests +enum class ManifestContent { + /// The manifest content is data. + kData = 0, + /// The manifest content is deletes. 
+ kDeletes = 1, +}; + +/// \brief Get the string name of a manifest content type +ICEBERG_EXPORT constexpr std::string_view ManifestContentToString( + ManifestContent type) noexcept { + switch (type) { + case ManifestContent::kData: + return "data"; + case ManifestContent::kDeletes: + return "deletes"; + } + // Not reachable for valid enumerators; keeps every control path returning a value + return "unknown"; +} + +/// \brief Get the relative manifest content type from name +ICEBERG_EXPORT constexpr Result ManifestContentFromString( + std::string_view str) noexcept { + if (str == "data") return ManifestContent::kData; + if (str == "deletes") return ManifestContent::kDeletes; + return InvalidArgument("Invalid manifest content type: {}", str); +} + +struct ICEBERG_EXPORT FieldSummary { + /// Field id: 509 + /// Whether the manifest contains at least one partition with a null value for the field + bool contains_null = false; + /// Field id: 518 + /// Whether the manifest contains at least one partition with a NaN value for the field + std::optional contains_nan; + /// Field id: 510 + /// Lower bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional> lower_bound; + /// Field id: 511 + /// Upper bound for the non-null, non-NaN values in the partition field, or null if all + /// values are null or NaN + std::optional> upper_bound; + + static const SchemaField CONTAINS_NULL; + static const SchemaField CONTAINS_NAN; + static const SchemaField LOWER_BOUND; + static const SchemaField UPPER_BOUND; + + static StructType GetType(); +}; + +/// \brief Entry in a manifest list.
+struct ICEBERG_EXPORT ManifestFile { + /// Field id: 500 + /// Location of the manifest file + std::string manifest_path; + /// Field id: 501 + /// Length of the manifest file in bytes + int64_t manifest_length; + /// Field id: 502 + /// ID of a partition spec used to write the manifest; must be listed in table metadata + /// partition-specs + int32_t partition_spec_id; + /// Field id: 517 + /// The type of files tracked by the manifest, either data or delete files; 0 for all v1 + /// manifests + ManifestContent content; + /// Field id: 515 + /// The sequence number when the manifest was added to the table; use 0 when reading v1 + /// manifest lists + int64_t sequence_number; + /// Field id: 516 + /// The minimum data sequence number of all live data or delete files in the manifest; + /// use 0 when reading v1 manifest lists + int64_t min_sequence_number; + /// Field id: 503 + /// ID of the snapshot where the manifest file was added + int64_t added_snapshot_id; + /// Field id: 504 + /// Number of entries in the manifest that have status ADDED (1), when null this is + /// assumed to be non-zero + std::optional added_files_count; + /// Field id: 505 + /// Number of entries in the manifest that have status EXISTING (0), when null this is + /// assumed to be non-zero + std::optional existing_files_count; + /// Field id: 506 + /// Number of entries in the manifest that have status DELETED (2), when null this is + /// assumed to be non-zero + std::optional deleted_files_count; + /// Field id: 512 + /// Number of rows in all of files in the manifest that have status ADDED, when null + /// this is assumed to be non-zero + std::optional added_rows_count; + /// Field id: 513 + /// Number of rows in all of files in the manifest that have status EXISTING, when null + /// this is assumed to be non-zero + std::optional existing_rows_count; + /// Field id: 514 + /// Number of rows in all of files in the manifest that have status DELETED, when null + /// this is assumed to be 
non-zero + std::optional deleted_rows_count; + /// Field id: 507 + /// Element field id: 508 + /// A list of field summaries for each partition field in the spec. Each field in the + /// list corresponds to a field in the manifest file's partition spec. + std::vector partitions; + /// Field id: 519 + /// Implementation-specific key metadata for encryption + std::vector key_metadata; + /// Field id: 520 + /// The starting _row_id to assign to rows added by ADDED data files + int64_t first_row_id; + + /// \brief Whether the manifest may contain ADDED entries (null count means yes). + [[nodiscard]] bool has_added_files() const { + return !added_files_count.has_value() || *added_files_count > 0; + } + + /// \brief Whether the manifest may contain EXISTING entries (null count means yes). + [[nodiscard]] bool has_existing_files() const { + return !existing_files_count.has_value() || *existing_files_count > 0; + } + + /// \brief Whether the manifest may contain DELETED entries (null count means yes). + [[nodiscard]] bool has_deleted_files() const { + return !deleted_files_count.has_value() || *deleted_files_count > 0; + } + + static const SchemaField MANIFEST_PATH; + static const SchemaField MANIFEST_LENGTH; + static const SchemaField PARTITION_SPEC_ID; + static const SchemaField CONTENT; + static const SchemaField SEQUENCE_NUMBER; + static const SchemaField MIN_SEQUENCE_NUMBER; + static const SchemaField ADDED_SNAPSHOT_ID; + static const SchemaField ADDED_FILES_COUNT; + static const SchemaField EXISTING_FILES_COUNT; + static const SchemaField DELETED_FILES_COUNT; + static const SchemaField ADDED_ROWS_COUNT; + static const SchemaField EXISTING_ROWS_COUNT; + static const SchemaField DELETED_ROWS_COUNT; + static const SchemaField PARTITIONS; + static const SchemaField KEY_METADATA; + static const SchemaField FIRST_ROW_ID; + + static Schema schema(); +}; + +/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are +/// stored in a separate manifest list file.
+/// +/// A new manifest list is written for each attempt to commit a snapshot because the list +/// of manifests always changes to produce a new snapshot. When a manifest list is +/// written, the (optimistic) sequence number of the snapshot is written for all new +/// manifest files tracked by the list. +/// +/// A manifest list includes summary metadata that can be used to avoid scanning all of +/// the manifests in a snapshot when planning a table scan. This includes the number of +/// added, existing, and deleted files, and a summary of values for each field of the +/// partition spec used to write the manifest. +struct ManifestList { + /// Entries in a manifest list. + std::vector entries; +}; + +} // namespace iceberg diff --git a/src/iceberg/manifest_reader.h b/src/iceberg/manifest_reader.h index a7350362d..b0b5e5a32 100644 --- a/src/iceberg/manifest_reader.h +++ b/src/iceberg/manifest_reader.h @@ -26,13 +26,15 @@ #include #include "iceberg/file_reader.h" +#include "iceberg/iceberg_export.h" +#include "iceberg/type_fwd.h" namespace iceberg { /// \brief Read manifest entries from a manifest file. class ICEBERG_EXPORT ManifestReader { public: - virtual Result>> Entries() const = 0; + virtual Result>> Entries() const = 0; private: std::unique_ptr reader_; @@ -41,7 +43,7 @@ class ICEBERG_EXPORT ManifestReader { /// \brief Read manifest files from a manifest list file. 
class ICEBERG_EXPORT ManifestListReader { public: - virtual Result>> Files() const = 0; + virtual Result>> Files() const = 0; private: std::unique_ptr reader_; diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 39e46883e..9fc6bd6cb 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -121,4 +121,9 @@ class UpdateRequirement; class AppendFiles; class TableScan; +struct DataFile; +struct ManifestEntry; +struct ManifestFile; +struct ManifestList; + } // namespace iceberg From 58e53c2bf1b588b50d5347df8e6414f6b6f4fa4a Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 8 May 2025 21:36:44 +0800 Subject: [PATCH 02/11] fix: resolve review comments Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.cc | 8 +++----- src/iceberg/manifest_entry.h | 4 ++-- src/iceberg/manifest_list.cc | 8 +++++--- src/iceberg/manifest_list.h | 3 +-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index 017252307..14df5e0e9 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -20,10 +20,8 @@ #include "iceberg/manifest_entry.h" #include -#include #include -#include "iceberg/schema.h" #include "iceberg/schema_field.h" #include "iceberg/type.h" @@ -138,11 +136,11 @@ const SchemaField ManifestEntry::SEQUENCE_NUMBER = const SchemaField ManifestEntry::FILE_SEQUENCE_NUMBER = SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); -Schema ManifestEntry::GetSchema(StructType partition_type) { +StructType ManifestEntry::GetSchema(StructType partition_type) { return GetSchemaFromDataFileType(DataFile::GetType(partition_type)); } -Schema ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) { +StructType ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) { std::vector fields; fields.push_back(STATUS); @@ -155,7 +153,7 @@ Schema ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) { 2, "data_file", 
std::make_shared(DataFile::GetType(datafile_type))); fields.push_back(data_file_type_field); - return {std::move(fields), /*schema_id=*/std::nullopt}; + return StructType(std::move(fields)); } } // namespace iceberg diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index f8c9a6b72..eb46a7544 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -246,8 +246,8 @@ struct ICEBERG_EXPORT ManifestEntry { static const SchemaField SEQUENCE_NUMBER; static const SchemaField FILE_SEQUENCE_NUMBER; - static Schema GetSchema(StructType partition_type); - static Schema GetSchemaFromDataFileType(StructType datafile_type); + static StructType GetSchema(StructType partition_type); + static StructType GetSchemaFromDataFileType(StructType datafile_type); }; } // namespace iceberg diff --git a/src/iceberg/manifest_list.cc b/src/iceberg/manifest_list.cc index f82d5c3a8..f715ca4bc 100644 --- a/src/iceberg/manifest_list.cc +++ b/src/iceberg/manifest_list.cc @@ -19,7 +19,9 @@ #include "iceberg/manifest_list.h" -#include "iceberg/schema.h" +#include + +#include "iceberg/schema_field.h" #include "iceberg/type.h" namespace iceberg { @@ -78,7 +80,7 @@ const SchemaField ManifestFile::KEY_METADATA = const SchemaField ManifestFile::FIRST_ROW_ID = SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); -Schema ManifestFile::schema() { +StructType ManifestFile::Schema() { std::vector fields; fields.push_back(MANIFEST_PATH); fields.push_back(MANIFEST_LENGTH); @@ -97,7 +99,7 @@ Schema ManifestFile::schema() { fields.push_back(KEY_METADATA); fields.push_back(FIRST_ROW_ID); - return {std::move(fields), /*schema_id=*/std::nullopt}; + return StructType(std::move(fields)); } } // namespace iceberg diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index cec6025c7..974897d72 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -28,7 +28,6 @@ #include "iceberg/iceberg_export.h" #include 
"iceberg/result.h" -#include "iceberg/schema_field.h" #include "iceberg/type_fwd.h" namespace iceberg { @@ -180,7 +179,7 @@ struct ICEBERG_EXPORT ManifestFile { static const SchemaField KEY_METADATA; static const SchemaField FIRST_ROW_ID; - static Schema schema(); + static StructType Schema(); }; /// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are From c817b2d8c688d678abf65bd5bfbc7a21ab31ec4b Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Thu, 8 May 2025 22:00:11 +0800 Subject: [PATCH 03/11] fix: add in memory partition_spec_id Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index eb46a7544..7a6213ab0 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -166,6 +166,9 @@ struct ICEBERG_EXPORT DataFile { /// order, and should set sort order id to null. Readers must ignore sort order id for /// position delete files. std::optional sort_order_id; + /// This field is not included in spec, so it is not serialized into the manifest file. + /// It is just store in memory representation used in process. + int32_t partition_spec_id; /// Field id: 142 /// The _row_id for the first row in the data file. 
/// From fd91bb1f2589cc042e18087895e6a6c2c5ec0f71 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 10 May 2025 16:04:44 +0800 Subject: [PATCH 04/11] fix: more review comments Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.cc | 153 +++++++--------------------------- src/iceberg/manifest_entry.h | 142 +++++++++++++++++++++---------- src/iceberg/manifest_list.cc | 88 ++++--------------- src/iceberg/manifest_list.h | 95 +++++++++++++-------- src/iceberg/manifest_reader.h | 4 +- 5 files changed, 203 insertions(+), 279 deletions(-) diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index 14df5e0e9..16df2f029 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -26,134 +26,41 @@ #include "iceberg/type.h" namespace iceberg { -const SchemaField DataFile::CONTENT = - SchemaField::MakeRequired(134, "content", std::make_shared()); -const SchemaField DataFile::FILE_PATH = - SchemaField::MakeRequired(100, "file_path", std::make_shared()); -const SchemaField DataFile::FILE_FORMAT = - SchemaField::MakeRequired(101, "file_format", std::make_shared()); -const SchemaField DataFile::RECORD_COUNT = - SchemaField::MakeRequired(103, "record_count", std::make_shared()); -const SchemaField DataFile::FILE_SIZE = - SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); -const SchemaField DataFile::COLUMN_SIZES = SchemaField::MakeOptional( - 108, "column_sizes", - std::make_shared( - SchemaField::MakeRequired(117, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(118, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::VALUE_COUNTS = SchemaField::MakeOptional( - 109, "value_counts", - std::make_shared( - SchemaField::MakeRequired(119, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(120, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::NULL_VALUE_COUNTS = 
SchemaField::MakeOptional( - 110, "null_value_counts", - std::make_shared( - SchemaField::MakeRequired(121, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(122, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::NAN_VALUE_COUNTS = SchemaField::MakeOptional( - 137, "nan_value_counts", - std::make_shared( - SchemaField::MakeRequired(138, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(139, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::LOWER_BOUNDS = SchemaField::MakeOptional( - 125, "lower_bounds", - std::make_shared( - SchemaField::MakeRequired(126, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(127, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::UPPER_BOUNDS = SchemaField::MakeOptional( - 128, "upper_bounds", - std::make_shared( - SchemaField::MakeRequired(129, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(130, std::string(MapType::kValueName), - std::make_shared()))); -const SchemaField DataFile::KEY_METADATA = - SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); -const SchemaField DataFile::SPLIT_OFFSETS = SchemaField::MakeOptional( - 132, "split_offsets", - std::make_shared(SchemaField::MakeRequired( - 133, std::string(ListType::kElementName), std::make_shared()))); -const SchemaField DataFile::EQUALITY_IDS = SchemaField::MakeOptional( - 135, "equality_ids", - std::make_shared(SchemaField::MakeRequired( - 136, std::string(ListType::kElementName), std::make_shared()))); -const SchemaField DataFile::SORT_ORDER_ID = - SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); -const SchemaField DataFile::FIRST_ROW_ID = - SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); -const SchemaField DataFile::REFERENCED_DATA_FILE = SchemaField::MakeOptional( - 143, 
"referenced_data_file", std::make_shared()); -const SchemaField DataFile::CONTENT_OFFSET = - SchemaField::MakeOptional(144, "content_offset", std::make_shared()); -const SchemaField DataFile::CONTENT_SIZE = - SchemaField::MakeOptional(145, "content_size_in_bytes", std::make_shared()); -StructType DataFile::GetType(StructType partition_type) { - std::vector fields; - - fields.push_back(CONTENT); - fields.push_back(FILE_PATH); - fields.push_back(FILE_FORMAT); - fields.push_back(SchemaField::MakeRequired( - 102, "partition", std::make_shared(partition_type))); - fields.push_back(RECORD_COUNT); - fields.push_back(FILE_SIZE); - fields.push_back(COLUMN_SIZES); - fields.push_back(VALUE_COUNTS); - fields.push_back(NULL_VALUE_COUNTS); - fields.push_back(NAN_VALUE_COUNTS); - fields.push_back(LOWER_BOUNDS); - fields.push_back(UPPER_BOUNDS); - fields.push_back(KEY_METADATA); - fields.push_back(SPLIT_OFFSETS); - fields.push_back(EQUALITY_IDS); - fields.push_back(SORT_ORDER_ID); - fields.push_back(FIRST_ROW_ID); - fields.push_back(REFERENCED_DATA_FILE); - fields.push_back(CONTENT_OFFSET); - fields.push_back(CONTENT_SIZE); - - return StructType(std::move(fields)); +std::shared_ptr DataFile::Type(std::shared_ptr partition_type) { + return std::make_shared(std::vector{ + kContent, + kFilePath, + kFileFormat, + SchemaField::MakeRequired(102, "partition", std::move(partition_type)), + kRecordCount, + kFileSize, + kColumnSizes, + kValueCounts, + kNullValueCounts, + kNanValueCounts, + kLowerBounds, + kUpperBounds, + kKeyMetadata, + kSplitOffsets, + kEqualityIds, + kSortOrderId, + kFirstRowId, + kReferencedDataFile, + kContentOffset, + kContentSize}); } -const SchemaField ManifestEntry::STATUS = - SchemaField::MakeRequired(0, "status", std::make_shared()); -const SchemaField ManifestEntry::SNAPSHOT_ID = - SchemaField::MakeOptional(1, "snapshot_id", std::make_shared()); -const SchemaField ManifestEntry::SEQUENCE_NUMBER = - SchemaField::MakeOptional(3, "sequence_number", 
std::make_shared()); -const SchemaField ManifestEntry::FILE_SEQUENCE_NUMBER = - SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); - -StructType ManifestEntry::GetSchema(StructType partition_type) { - return GetSchemaFromDataFileType(DataFile::GetType(partition_type)); +std::shared_ptr ManifestEntry::TypeFromPartitionType( + std::shared_ptr partition_type) { + return TypeFromDataFileType(DataFile::Type(std::move(partition_type))); } -StructType ManifestEntry::GetSchemaFromDataFileType(StructType datafile_type) { - std::vector fields; - - fields.push_back(STATUS); - fields.push_back(SNAPSHOT_ID); - fields.push_back(SEQUENCE_NUMBER); - fields.push_back(FILE_SEQUENCE_NUMBER); - - // Add the data file schema - auto data_file_type_field = SchemaField::MakeRequired( - 2, "data_file", std::make_shared(DataFile::GetType(datafile_type))); - fields.push_back(data_file_type_field); - - return StructType(std::move(fields)); +std::shared_ptr ManifestEntry::TypeFromDataFileType( + std::shared_ptr datafile_type) { + return std::make_shared(std::vector{ + kStatus, kSnapshotId, kSequenceNumber, kFileSequenceNumber, + SchemaField::MakeRequired(2, "data_file", std::move(datafile_type))}); } } // namespace iceberg diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index 7a6213ab0..bf86fee31 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -22,15 +22,18 @@ #include #include #include +#include #include #include -#include #include +#include + #include "iceberg/file_format.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" -#include "iceberg/type_fwd.h" +#include "iceberg/schema_field.h" +#include "iceberg/type.h" namespace iceberg { @@ -92,7 +95,7 @@ struct ICEBERG_EXPORT DataFile { /// Partition data tuple, schema based on the partition spec output using partition /// field ids for the struct field ids /// TODO(zhjwpku): use StructLike to represent partition data tuple - std::map partition; + 
std::any partition; /// Field id: 103 /// Number of records in this file, or the cardinality of a deletion vector int64_t record_count = 0; @@ -105,44 +108,36 @@ struct ICEBERG_EXPORT DataFile { /// Map from column id to the total size on disk of all regions that store the column. /// Does not include bytes necessary to read other columns, like footers. Leave null for /// row-oriented formats (Avro) - std::unordered_map column_sizes; + std::map column_sizes; /// Field id: 109 /// Key field id: 119 /// Value field id: 120 /// Map from column id to number of values in the column (including null and NaN values) - std::unordered_map value_counts; + std::map value_counts; /// Field id: 110 /// Key field id: 121 /// Value field id: 122 /// Map from column id to number of null values in the column - std::unordered_map null_value_counts; + std::map null_value_counts; /// Field id: 137 /// Key field id: 138 /// Value field id: 139 /// Map from column id to number of NaN values in the column - std::unordered_map nan_value_counts; + std::map nan_value_counts; /// Field id: 125 /// Key field id: 126 /// Value field id: 127 /// Map from column id to lower bound in the column serialized as binary. /// Each value must be less than or equal to all non-null, non-NaN values in the column /// for the file. - /// - /// Reference: - /// - [Binary single-value - /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) - std::unordered_map> lower_bounds; + std::map> lower_bounds; /// Field id: 128 /// Key field id: 129 /// Value field id: 130 /// Map from column id to upper bound in the column serialized as binary. - /// Each value must be greater than or equal to all non-null, non-Nan values in the + /// Each value must be greater than or equal to all non-null, non-NaN values in the /// column for the file. 
- /// - /// Reference: - /// - [Binary single-value - /// serialization](https://iceberg.apache.org/spec/#binary-single-value-serialization) - std::unordered_map> upper_bounds; + std::map> upper_bounds; /// Field id: 131 /// Implementation-specific key metadata for encryption std::optional> key_metadata; @@ -197,27 +192,80 @@ struct ICEBERG_EXPORT DataFile { /// present std::optional content_size_in_bytes; - static const SchemaField CONTENT; - static const SchemaField FILE_PATH; - static const SchemaField FILE_FORMAT; - static const SchemaField RECORD_COUNT; - static const SchemaField FILE_SIZE; - static const SchemaField COLUMN_SIZES; - static const SchemaField VALUE_COUNTS; - static const SchemaField NULL_VALUE_COUNTS; - static const SchemaField NAN_VALUE_COUNTS; - static const SchemaField LOWER_BOUNDS; - static const SchemaField UPPER_BOUNDS; - static const SchemaField KEY_METADATA; - static const SchemaField SPLIT_OFFSETS; - static const SchemaField EQUALITY_IDS; - static const SchemaField SORT_ORDER_ID; - static const SchemaField FIRST_ROW_ID; - static const SchemaField REFERENCED_DATA_FILE; - static const SchemaField CONTENT_OFFSET; - static const SchemaField CONTENT_SIZE; + inline static const SchemaField kContent = + SchemaField::MakeRequired(134, "content", std::make_shared()); + inline static const SchemaField kFilePath = + SchemaField::MakeRequired(100, "file_path", std::make_shared()); + inline static const SchemaField kFileFormat = + SchemaField::MakeRequired(101, "file_format", std::make_shared()); + inline static const SchemaField kRecordCount = + SchemaField::MakeRequired(103, "record_count", std::make_shared()); + inline static const SchemaField kFileSize = + SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); + inline static const SchemaField kColumnSizes = SchemaField::MakeOptional( + 108, "column_sizes", + std::make_shared( + SchemaField::MakeRequired(117, std::string(MapType::kKeyName), + std::make_shared()), + 
SchemaField::MakeRequired(118, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kValueCounts = SchemaField::MakeOptional( + 109, "value_counts", + std::make_shared( + SchemaField::MakeRequired(119, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(120, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional( + 110, "null_value_counts", + std::make_shared( + SchemaField::MakeRequired(121, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(122, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional( + 137, "nan_value_counts", + std::make_shared( + SchemaField::MakeRequired(138, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(139, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kLowerBounds = SchemaField::MakeOptional( + 125, "lower_bounds", + std::make_shared( + SchemaField::MakeRequired(126, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(127, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kUpperBounds = SchemaField::MakeOptional( + 128, "upper_bounds", + std::make_shared( + SchemaField::MakeRequired(129, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(130, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kKeyMetadata = + SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); + inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional( + 132, "split_offsets", + std::make_shared(SchemaField::MakeRequired( + 133, std::string(ListType::kElementName), std::make_shared()))); + inline static const SchemaField kEqualityIds = 
SchemaField::MakeOptional( + 135, "equality_ids", + std::make_shared(SchemaField::MakeRequired( + 136, std::string(ListType::kElementName), std::make_shared()))); + inline static const SchemaField kSortOrderId = + SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); + inline static const SchemaField kFirstRowId = + SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); + inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional( + 143, "referenced_data_file", std::make_shared()); + inline static const SchemaField kContentOffset = + SchemaField::MakeOptional(144, "content_offset", std::make_shared()); + inline static const SchemaField kContentSize = SchemaField::MakeOptional( + 145, "content_size_in_bytes", std::make_shared()); - static StructType GetType(StructType partition_type); + static std::shared_ptr Type(std::shared_ptr partition_type); }; /// \brief A manifest is an immutable Avro file that lists data files or delete files, @@ -244,13 +292,19 @@ struct ICEBERG_EXPORT ManifestEntry { /// File path, partition tuple, metrics, ... 
DataFile data_file; - static const SchemaField STATUS; - static const SchemaField SNAPSHOT_ID; - static const SchemaField SEQUENCE_NUMBER; - static const SchemaField FILE_SEQUENCE_NUMBER; + inline static const SchemaField kStatus = + SchemaField::MakeRequired(0, "status", std::make_shared()); + inline static const SchemaField kSnapshotId = + SchemaField::MakeOptional(1, "snapshot_id", std::make_shared()); + inline static const SchemaField kSequenceNumber = + SchemaField::MakeOptional(3, "sequence_number", std::make_shared()); + inline static const SchemaField kFileSequenceNumber = + SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); - static StructType GetSchema(StructType partition_type); - static StructType GetSchemaFromDataFileType(StructType datafile_type); + static std::shared_ptr TypeFromPartitionType( + std::shared_ptr partition_type); + static std::shared_ptr TypeFromDataFileType( + std::shared_ptr datafile_type); }; } // namespace iceberg diff --git a/src/iceberg/manifest_list.cc b/src/iceberg/manifest_list.cc index f715ca4bc..95ad98aad 100644 --- a/src/iceberg/manifest_list.cc +++ b/src/iceberg/manifest_list.cc @@ -21,85 +21,27 @@ #include -#include "iceberg/schema_field.h" #include "iceberg/type.h" namespace iceberg { -const SchemaField FieldSummary::CONTAINS_NULL = - SchemaField::MakeRequired(509, "contains_null", std::make_shared()); -const SchemaField FieldSummary::CONTAINS_NAN = - SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); -const SchemaField FieldSummary::LOWER_BOUND = - SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); -const SchemaField FieldSummary::UPPER_BOUND = - SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); - -StructType FieldSummary::GetType() { - return StructType({ - CONTAINS_NULL, - CONTAINS_NAN, - LOWER_BOUND, - UPPER_BOUND, - }); +const StructType& PartitionFieldSummary::Type() { + static const std::shared_ptr instance{new StructType({ + 
PartitionFieldSummary::kConsTainsNull, + PartitionFieldSummary::kContainsNaN, + PartitionFieldSummary::kLowerBound, + PartitionFieldSummary::kUpperBound, + })}; + return *instance; } -const SchemaField ManifestFile::MANIFEST_PATH = - SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); -const SchemaField ManifestFile::MANIFEST_LENGTH = - SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); -const SchemaField ManifestFile::PARTITION_SPEC_ID = - SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); -const SchemaField ManifestFile::CONTENT = - SchemaField::MakeOptional(517, "content", std::make_shared()); -const SchemaField ManifestFile::SEQUENCE_NUMBER = - SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); -const SchemaField ManifestFile::MIN_SEQUENCE_NUMBER = - SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); -const SchemaField ManifestFile::ADDED_SNAPSHOT_ID = - SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared()); -const SchemaField ManifestFile::ADDED_FILES_COUNT = - SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); -const SchemaField ManifestFile::EXISTING_FILES_COUNT = - SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); -const SchemaField ManifestFile::DELETED_FILES_COUNT = - SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); -const SchemaField ManifestFile::ADDED_ROWS_COUNT = - SchemaField::MakeOptional(512, "added_rows_count", std::make_shared()); -const SchemaField ManifestFile::EXISTING_ROWS_COUNT = - SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); -const SchemaField ManifestFile::DELETED_ROWS_COUNT = - SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); -const SchemaField ManifestFile::PARTITIONS = SchemaField::MakeOptional( - 507, "partitions", - std::make_shared(SchemaField::MakeRequired( - 508, 
std::string(ListType::kElementName), - std::make_shared(FieldSummary::GetType())))); -const SchemaField ManifestFile::KEY_METADATA = - SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); -const SchemaField ManifestFile::FIRST_ROW_ID = - SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); - -StructType ManifestFile::Schema() { - std::vector fields; - fields.push_back(MANIFEST_PATH); - fields.push_back(MANIFEST_LENGTH); - fields.push_back(PARTITION_SPEC_ID); - fields.push_back(CONTENT); - fields.push_back(SEQUENCE_NUMBER); - fields.push_back(MIN_SEQUENCE_NUMBER); - fields.push_back(ADDED_SNAPSHOT_ID); - fields.push_back(ADDED_FILES_COUNT); - fields.push_back(EXISTING_FILES_COUNT); - fields.push_back(DELETED_FILES_COUNT); - fields.push_back(ADDED_ROWS_COUNT); - fields.push_back(EXISTING_ROWS_COUNT); - fields.push_back(DELETED_ROWS_COUNT); - fields.push_back(PARTITIONS); - fields.push_back(KEY_METADATA); - fields.push_back(FIRST_ROW_ID); - - return StructType(std::move(fields)); +const StructType& ManifestFile::Type() { + static const std::shared_ptr instance{new StructType( + {kManifestPath, kManifestLength, kPartitionSpecId, kContent, kSequenceNumber, + kMinSequenceNumber, kAddedSnapshotId, kAddedFilesCount, kExistingFilesCount, + kDeletedFilesCount, kAddedRowsCount, kExistingRowsCount, kDeletedRowsCount, + kPartitions, kKeyMetadata, kFirstRowId})}; + return *instance; } } // namespace iceberg diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index 974897d72..78719fe85 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -28,7 +28,8 @@ #include "iceberg/iceberg_export.h" #include "iceberg/result.h" -#include "iceberg/type_fwd.h" +#include "iceberg/schema_field.h" +#include "iceberg/type.h" namespace iceberg { @@ -60,7 +61,10 @@ ICEBERG_EXPORT constexpr Result ManifestContentFromString( return InvalidArgument("Invalid manifest content type: {}", str); } -struct ICEBERG_EXPORT FieldSummary { 
+/// \brief Field summary for partition field in the spec. +/// +/// Each field of this corresponds to a field in the manifest file's partition spec. +struct ICEBERG_EXPORT PartitionFieldSummary { /// Field id: 509 /// Whether the manifest contains at least one partition with a null value for the field bool contains_null; @@ -76,12 +80,16 @@ struct ICEBERG_EXPORT FieldSummary { /// values are null or NaN std::optional> upper_bound; - static const SchemaField CONTAINS_NULL; - static const SchemaField CONTAINS_NAN; - static const SchemaField LOWER_BOUND; - static const SchemaField UPPER_BOUND; + inline static const SchemaField kConsTainsNull = + SchemaField::MakeRequired(509, "contains_null", std::make_shared()); + inline static const SchemaField kContainsNaN = + SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); + inline static const SchemaField kLowerBound = + SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); + inline static const SchemaField kUpperBound = + SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); - static StructType GetType(); + static const StructType& Type(); }; /// \brief Entry in a manifest list. @@ -139,7 +147,7 @@ struct ICEBERG_EXPORT ManifestFile { /// Element field id: 508 /// A list of field summaries for each partition field in the spec. Each field in the /// list corresponds to a field in the manifest file's partition spec. - std::vector partitions; + std::vector partitions; /// Field id: 519 /// Implementation-specific key metadata for encryption std::vector key_metadata; @@ -148,38 +156,51 @@ struct ICEBERG_EXPORT ManifestFile { int64_t first_row_id; /// \brief Checks if this manifest file contains entries with ADDED status. 
- [[nodiscard]] bool has_added_files() const { - return added_files_count.has_value() && *added_files_count > 0; - } + bool has_added_files() const { return added_files_count.value_or(-1) > 0; } /// \brief Checks if this manifest file contains entries with EXISTING status. - [[nodiscard]] bool has_existing_files() const { - return existing_files_count.has_value() && *existing_files_count > 0; - } + bool has_existing_files() const { return existing_files_count.value_or(-1) > 0; } /// \brief Checks if this manifest file contains entries with DELETED status - [[nodiscard]] bool has_deleted_files() const { - return deleted_files_count.has_value() && *deleted_files_count > 0; - } - - static const SchemaField MANIFEST_PATH; - static const SchemaField MANIFEST_LENGTH; - static const SchemaField PARTITION_SPEC_ID; - static const SchemaField CONTENT; - static const SchemaField SEQUENCE_NUMBER; - static const SchemaField MIN_SEQUENCE_NUMBER; - static const SchemaField ADDED_SNAPSHOT_ID; - static const SchemaField ADDED_FILES_COUNT; - static const SchemaField EXISTING_FILES_COUNT; - static const SchemaField DELETED_FILES_COUNT; - static const SchemaField ADDED_ROWS_COUNT; - static const SchemaField EXISTING_ROWS_COUNT; - static const SchemaField DELETED_ROWS_COUNT; - static const SchemaField PARTITIONS; - static const SchemaField KEY_METADATA; - static const SchemaField FIRST_ROW_ID; - - static StructType Schema(); + bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } + + inline static const SchemaField kManifestPath = + SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); + inline static const SchemaField kManifestLength = + SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); + inline static const SchemaField kPartitionSpecId = + SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); + inline static const SchemaField kContent = + SchemaField::MakeOptional(517, "content", std::make_shared()); + 
inline static const SchemaField kSequenceNumber = + SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); + inline static const SchemaField kMinSequenceNumber = + SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); + inline static const SchemaField kAddedSnapshotId = + SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared()); + inline static const SchemaField kAddedFilesCount = + SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); + inline static const SchemaField kExistingFilesCount = + SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); + inline static const SchemaField kDeletedFilesCount = + SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); + inline static const SchemaField kAddedRowsCount = + SchemaField::MakeOptional(512, "added_rows_count", std::make_shared()); + inline static const SchemaField kExistingRowsCount = + SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); + inline static const SchemaField kDeletedRowsCount = + SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); + inline static const SchemaField kPartitions = SchemaField::MakeOptional( + 507, "partitions", + std::make_shared(SchemaField::MakeRequired( + 508, std::string(ListType::kElementName), + std::make_shared(PartitionFieldSummary::Type())))); + inline static const SchemaField kKeyMetadata = + SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); + inline static const SchemaField kFirstRowId = + SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); + + static const StructType& Type(); }; /// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are @@ -194,7 +215,7 @@ struct ICEBERG_EXPORT ManifestFile { /// the manifests in a snapshot when planning a table scan. 
This includes the number of /// added, existing, and deleted files, and a summary of values for each field of the /// partition spec used to write the manifest. -struct ManifestList { +struct ICEBERG_EXPORT ManifestList { /// Entries in a manifest list. std::vector entries; }; diff --git a/src/iceberg/manifest_reader.h b/src/iceberg/manifest_reader.h index b0b5e5a32..6b81eb9b0 100644 --- a/src/iceberg/manifest_reader.h +++ b/src/iceberg/manifest_reader.h @@ -34,7 +34,7 @@ namespace iceberg { /// \brief Read manifest entries from a manifest file. class ICEBERG_EXPORT ManifestReader { public: - virtual Result>> Entries() const = 0; + virtual Result>> Entries() const = 0; private: std::unique_ptr reader_; @@ -43,7 +43,7 @@ class ICEBERG_EXPORT ManifestReader { /// \brief Read manifest files from a manifest list file. class ICEBERG_EXPORT ManifestListReader { public: - virtual Result>> Files() const = 0; + virtual Result>> Files() const = 0; private: std::unique_ptr reader_; From d43ed8fddf02bc0bb2249a8828a94a5c01c63ff5 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 10 May 2025 17:09:58 +0800 Subject: [PATCH 05/11] chore: add MakeRequiredField/MakeOptionalField to resolve lengthy lines Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.h | 100 ++++++++++++++--------------------- src/iceberg/manifest_list.h | 46 ++++++++-------- src/iceberg/schema_field.h | 12 +++++ 3 files changed, 73 insertions(+), 85 deletions(-) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index bf86fee31..22f01cd71 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -192,78 +192,57 @@ struct ICEBERG_EXPORT DataFile { /// present std::optional content_size_in_bytes; - inline static const SchemaField kContent = - SchemaField::MakeRequired(134, "content", std::make_shared()); + inline static const SchemaField kContent = MakeRequiredField(134, "content"); inline static const SchemaField kFilePath = - SchemaField::MakeRequired(100, 
"file_path", std::make_shared()); + MakeRequiredField(100, "file_path"); inline static const SchemaField kFileFormat = - SchemaField::MakeRequired(101, "file_format", std::make_shared()); + MakeRequiredField(101, "file_format"); inline static const SchemaField kRecordCount = - SchemaField::MakeRequired(103, "record_count", std::make_shared()); + MakeRequiredField(103, "record_count"); inline static const SchemaField kFileSize = - SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); - inline static const SchemaField kColumnSizes = SchemaField::MakeOptional( + MakeRequiredField(104, "file_size_in_bytes"); + inline static const SchemaField kColumnSizes = MakeOptionalField( 108, "column_sizes", - std::make_shared( - SchemaField::MakeRequired(117, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(118, std::string(MapType::kValueName), - std::make_shared()))); - inline static const SchemaField kValueCounts = SchemaField::MakeOptional( + MakeRequiredField(117, std::string(MapType::kKeyName)), + MakeRequiredField(118, std::string(MapType::kValueName))); + inline static const SchemaField kValueCounts = MakeOptionalField( 109, "value_counts", - std::make_shared( - SchemaField::MakeRequired(119, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(120, std::string(MapType::kValueName), - std::make_shared()))); - inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional( + MakeRequiredField(119, std::string(MapType::kKeyName)), + MakeRequiredField(120, std::string(MapType::kValueName))); + inline static const SchemaField kNullValueCounts = MakeOptionalField( 110, "null_value_counts", - std::make_shared( - SchemaField::MakeRequired(121, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(122, std::string(MapType::kValueName), - std::make_shared()))); - inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional( + 
MakeRequiredField(121, std::string(MapType::kKeyName)), + MakeRequiredField(122, std::string(MapType::kValueName))); + inline static const SchemaField kNanValueCounts = MakeOptionalField( 137, "nan_value_counts", - std::make_shared( - SchemaField::MakeRequired(138, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(139, std::string(MapType::kValueName), - std::make_shared()))); - inline static const SchemaField kLowerBounds = SchemaField::MakeOptional( + MakeRequiredField(138, std::string(MapType::kKeyName)), + MakeRequiredField(139, std::string(MapType::kValueName))); + inline static const SchemaField kLowerBounds = MakeOptionalField( 125, "lower_bounds", - std::make_shared( - SchemaField::MakeRequired(126, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(127, std::string(MapType::kValueName), - std::make_shared()))); - inline static const SchemaField kUpperBounds = SchemaField::MakeOptional( + MakeRequiredField(126, std::string(MapType::kKeyName)), + MakeRequiredField(127, std::string(MapType::kValueName))); + inline static const SchemaField kUpperBounds = MakeOptionalField( 128, "upper_bounds", - std::make_shared( - SchemaField::MakeRequired(129, std::string(MapType::kKeyName), - std::make_shared()), - SchemaField::MakeRequired(130, std::string(MapType::kValueName), - std::make_shared()))); + MakeRequiredField(129, std::string(MapType::kKeyName)), + MakeRequiredField(130, std::string(MapType::kValueName))); inline static const SchemaField kKeyMetadata = - SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); - inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional( + MakeOptionalField(131, "key_metadata"); + inline static const SchemaField kSplitOffsets = MakeOptionalField( 132, "split_offsets", - std::make_shared(SchemaField::MakeRequired( - 133, std::string(ListType::kElementName), std::make_shared()))); - inline static const SchemaField kEqualityIds = 
SchemaField::MakeOptional( + MakeRequiredField(133, std::string(ListType::kElementName))); + inline static const SchemaField kEqualityIds = MakeOptionalField( 135, "equality_ids", - std::make_shared(SchemaField::MakeRequired( - 136, std::string(ListType::kElementName), std::make_shared()))); + MakeRequiredField(136, std::string(ListType::kElementName))); inline static const SchemaField kSortOrderId = - SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); + MakeOptionalField(140, "sort_order_id"); inline static const SchemaField kFirstRowId = - SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); - inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional( - 143, "referenced_data_file", std::make_shared()); + MakeOptionalField(142, "first_row_id"); + inline static const SchemaField kReferencedDataFile = + MakeOptionalField(143, "referenced_data_file"); inline static const SchemaField kContentOffset = - SchemaField::MakeOptional(144, "content_offset", std::make_shared()); - inline static const SchemaField kContentSize = SchemaField::MakeOptional( - 145, "content_size_in_bytes", std::make_shared()); + MakeOptionalField(144, "content_offset"); + inline static const SchemaField kContentSize = + MakeOptionalField(145, "content_size_in_bytes"); static std::shared_ptr Type(std::shared_ptr partition_type); }; @@ -292,14 +271,13 @@ struct ICEBERG_EXPORT ManifestEntry { /// File path, partition tuple, metrics, ... 
DataFile data_file; - inline static const SchemaField kStatus = - SchemaField::MakeRequired(0, "status", std::make_shared()); + inline static const SchemaField kStatus = MakeRequiredField(0, "status"); inline static const SchemaField kSnapshotId = - SchemaField::MakeOptional(1, "snapshot_id", std::make_shared()); + MakeOptionalField(1, "snapshot_id"); inline static const SchemaField kSequenceNumber = - SchemaField::MakeOptional(3, "sequence_number", std::make_shared()); + MakeOptionalField(3, "sequence_number"); inline static const SchemaField kFileSequenceNumber = - SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); + MakeOptionalField(4, "file_sequence_number"); static std::shared_ptr TypeFromPartitionType( std::shared_ptr partition_type); diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index 78719fe85..a65b6d11a 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -81,13 +81,13 @@ struct ICEBERG_EXPORT PartitionFieldSummary { std::optional> upper_bound; inline static const SchemaField kConsTainsNull = - SchemaField::MakeRequired(509, "contains_null", std::make_shared()); + MakeRequiredField(509, "contains_null"); inline static const SchemaField kContainsNaN = - SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); + MakeOptionalField(518, "contains_nan"); inline static const SchemaField kLowerBound = - SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); + MakeOptionalField(510, "lower_bound"); inline static const SchemaField kUpperBound = - SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); + MakeOptionalField(511, "upper_bound"); static const StructType& Type(); }; @@ -165,40 +165,38 @@ struct ICEBERG_EXPORT ManifestFile { bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } inline static const SchemaField kManifestPath = - SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); + MakeRequiredField(500, 
"manifest_path"); inline static const SchemaField kManifestLength = - SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); + MakeRequiredField(501, "manifest_length"); inline static const SchemaField kPartitionSpecId = - SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); - inline static const SchemaField kContent = - SchemaField::MakeOptional(517, "content", std::make_shared()); + MakeRequiredField(502, "partition_spec_id"); + inline static const SchemaField kContent = MakeOptionalField(517, "content"); inline static const SchemaField kSequenceNumber = - SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); + MakeOptionalField(515, "sequence_number"); inline static const SchemaField kMinSequenceNumber = - SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); + MakeOptionalField(516, "min_sequence_number"); inline static const SchemaField kAddedSnapshotId = - SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared()); + MakeRequiredField(503, "added_snapshot_id"); inline static const SchemaField kAddedFilesCount = - SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); + MakeOptionalField(504, "added_files_count"); inline static const SchemaField kExistingFilesCount = - SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); + MakeOptionalField(505, "existing_files_count"); inline static const SchemaField kDeletedFilesCount = - SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); + MakeOptionalField(506, "deleted_files_count"); inline static const SchemaField kAddedRowsCount = - SchemaField::MakeOptional(512, "added_rows_count", std::make_shared()); + MakeOptionalField(512, "added_rows_count"); inline static const SchemaField kExistingRowsCount = - SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); + MakeOptionalField(513, "existing_rows_count"); inline static const SchemaField 
kDeletedRowsCount = - SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); - inline static const SchemaField kPartitions = SchemaField::MakeOptional( + MakeOptionalField(514, "deleted_rows_count"); + inline static const SchemaField kPartitions = MakeOptionalField( 507, "partitions", - std::make_shared(SchemaField::MakeRequired( - 508, std::string(ListType::kElementName), - std::make_shared(PartitionFieldSummary::Type())))); + MakeRequiredField(508, std::string(ListType::kElementName), + PartitionFieldSummary::Type())); inline static const SchemaField kKeyMetadata = - SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); + MakeOptionalField(519, "key_metadata"); inline static const SchemaField kFirstRowId = - SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); + MakeOptionalField(520, "first_row_id"); static const StructType& Type(); }; diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index afef71738..a04f39796 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -91,4 +91,16 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::string doc_; }; +template +inline SchemaField MakeRequiredField(int id, std::string name, Args&&... args) { + return SchemaField::MakeRequired(id, std::move(name), + std::make_shared(std::forward(args)...)); +} + +template +inline SchemaField MakeOptionalField(int id, std::string name, Args&&... 
args) { + return SchemaField::MakeOptional(id, std::move(name), + std::make_shared(std::forward(args)...)); +} + } // namespace iceberg From 481e682fc967bcc9a25e44cce9bb8ac3351dca24 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Sat, 10 May 2025 17:22:26 +0800 Subject: [PATCH 06/11] chore: remove useless header inclusion Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index 22f01cd71..ec0ae26e7 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -27,8 +27,6 @@ #include #include -#include - #include "iceberg/file_format.h" #include "iceberg/iceberg_export.h" #include "iceberg/result.h" From d69d437ff069e8ff42ca52942e6636036cda1f53 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 13 May 2025 21:31:47 +0800 Subject: [PATCH 07/11] Revert "chore: add MakeRequiredField/MakeOptionalField to resolve lengthy lines" This reverts commit 7612e4a881966fbc7dc16a6d2ddeef181bc99c42. 
--- src/iceberg/manifest_entry.h | 100 +++++++++++++++++++++-------------- src/iceberg/manifest_list.h | 46 ++++++++-------- src/iceberg/schema_field.h | 12 ----- 3 files changed, 85 insertions(+), 73 deletions(-) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index ec0ae26e7..0d44218e0 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -190,57 +190,78 @@ struct ICEBERG_EXPORT DataFile { /// present std::optional content_size_in_bytes; - inline static const SchemaField kContent = MakeRequiredField(134, "content"); + inline static const SchemaField kContent = + SchemaField::MakeRequired(134, "content", std::make_shared()); inline static const SchemaField kFilePath = - MakeRequiredField(100, "file_path"); + SchemaField::MakeRequired(100, "file_path", std::make_shared()); inline static const SchemaField kFileFormat = - MakeRequiredField(101, "file_format"); + SchemaField::MakeRequired(101, "file_format", std::make_shared()); inline static const SchemaField kRecordCount = - MakeRequiredField(103, "record_count"); + SchemaField::MakeRequired(103, "record_count", std::make_shared()); inline static const SchemaField kFileSize = - MakeRequiredField(104, "file_size_in_bytes"); - inline static const SchemaField kColumnSizes = MakeOptionalField( + SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); + inline static const SchemaField kColumnSizes = SchemaField::MakeOptional( 108, "column_sizes", - MakeRequiredField(117, std::string(MapType::kKeyName)), - MakeRequiredField(118, std::string(MapType::kValueName))); - inline static const SchemaField kValueCounts = MakeOptionalField( + std::make_shared( + SchemaField::MakeRequired(117, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(118, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kValueCounts = SchemaField::MakeOptional( 109, "value_counts", - MakeRequiredField(119, 
std::string(MapType::kKeyName)), - MakeRequiredField(120, std::string(MapType::kValueName))); - inline static const SchemaField kNullValueCounts = MakeOptionalField( + std::make_shared( + SchemaField::MakeRequired(119, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(120, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional( 110, "null_value_counts", - MakeRequiredField(121, std::string(MapType::kKeyName)), - MakeRequiredField(122, std::string(MapType::kValueName))); - inline static const SchemaField kNanValueCounts = MakeOptionalField( + std::make_shared( + SchemaField::MakeRequired(121, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(122, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional( 137, "nan_value_counts", - MakeRequiredField(138, std::string(MapType::kKeyName)), - MakeRequiredField(139, std::string(MapType::kValueName))); - inline static const SchemaField kLowerBounds = MakeOptionalField( + std::make_shared( + SchemaField::MakeRequired(138, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(139, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kLowerBounds = SchemaField::MakeOptional( 125, "lower_bounds", - MakeRequiredField(126, std::string(MapType::kKeyName)), - MakeRequiredField(127, std::string(MapType::kValueName))); - inline static const SchemaField kUpperBounds = MakeOptionalField( + std::make_shared( + SchemaField::MakeRequired(126, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(127, std::string(MapType::kValueName), + std::make_shared()))); + inline static const SchemaField kUpperBounds = SchemaField::MakeOptional( 128, "upper_bounds", - MakeRequiredField(129, std::string(MapType::kKeyName)), - 
MakeRequiredField(130, std::string(MapType::kValueName))); + std::make_shared( + SchemaField::MakeRequired(129, std::string(MapType::kKeyName), + std::make_shared()), + SchemaField::MakeRequired(130, std::string(MapType::kValueName), + std::make_shared()))); inline static const SchemaField kKeyMetadata = - MakeOptionalField(131, "key_metadata"); - inline static const SchemaField kSplitOffsets = MakeOptionalField( + SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); + inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional( 132, "split_offsets", - MakeRequiredField(133, std::string(ListType::kElementName))); - inline static const SchemaField kEqualityIds = MakeOptionalField( + std::make_shared(SchemaField::MakeRequired( + 133, std::string(ListType::kElementName), std::make_shared()))); + inline static const SchemaField kEqualityIds = SchemaField::MakeOptional( 135, "equality_ids", - MakeRequiredField(136, std::string(ListType::kElementName))); + std::make_shared(SchemaField::MakeRequired( + 136, std::string(ListType::kElementName), std::make_shared()))); inline static const SchemaField kSortOrderId = - MakeOptionalField(140, "sort_order_id"); + SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); inline static const SchemaField kFirstRowId = - MakeOptionalField(142, "first_row_id"); - inline static const SchemaField kReferencedDataFile = - MakeOptionalField(143, "referenced_data_file"); + SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); + inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional( + 143, "referenced_data_file", std::make_shared()); inline static const SchemaField kContentOffset = - MakeOptionalField(144, "content_offset"); - inline static const SchemaField kContentSize = - MakeOptionalField(145, "content_size_in_bytes"); + SchemaField::MakeOptional(144, "content_offset", std::make_shared()); + inline static const SchemaField kContentSize = 
SchemaField::MakeOptional( + 145, "content_size_in_bytes", std::make_shared()); static std::shared_ptr Type(std::shared_ptr partition_type); }; @@ -269,13 +290,14 @@ struct ICEBERG_EXPORT ManifestEntry { /// File path, partition tuple, metrics, ... DataFile data_file; - inline static const SchemaField kStatus = MakeRequiredField(0, "status"); + inline static const SchemaField kStatus = + SchemaField::MakeRequired(0, "status", std::make_shared()); inline static const SchemaField kSnapshotId = - MakeOptionalField(1, "snapshot_id"); + SchemaField::MakeOptional(1, "snapshot_id", std::make_shared()); inline static const SchemaField kSequenceNumber = - MakeOptionalField(3, "sequence_number"); + SchemaField::MakeOptional(3, "sequence_number", std::make_shared()); inline static const SchemaField kFileSequenceNumber = - MakeOptionalField(4, "file_sequence_number"); + SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared()); static std::shared_ptr TypeFromPartitionType( std::shared_ptr partition_type); diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index a65b6d11a..78719fe85 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -81,13 +81,13 @@ struct ICEBERG_EXPORT PartitionFieldSummary { std::optional> upper_bound; inline static const SchemaField kConsTainsNull = - MakeRequiredField(509, "contains_null"); + SchemaField::MakeRequired(509, "contains_null", std::make_shared()); inline static const SchemaField kContainsNaN = - MakeOptionalField(518, "contains_nan"); + SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); inline static const SchemaField kLowerBound = - MakeOptionalField(510, "lower_bound"); + SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); inline static const SchemaField kUpperBound = - MakeOptionalField(511, "upper_bound"); + SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); static const StructType& Type(); }; @@ -165,38 +165,40 @@ struct 
ICEBERG_EXPORT ManifestFile { bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } inline static const SchemaField kManifestPath = - MakeRequiredField(500, "manifest_path"); + SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); inline static const SchemaField kManifestLength = - MakeRequiredField(501, "manifest_length"); + SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); inline static const SchemaField kPartitionSpecId = - MakeRequiredField(502, "partition_spec_id"); - inline static const SchemaField kContent = MakeOptionalField(517, "content"); + SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); + inline static const SchemaField kContent = + SchemaField::MakeOptional(517, "content", std::make_shared()); inline static const SchemaField kSequenceNumber = - MakeOptionalField(515, "sequence_number"); + SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); inline static const SchemaField kMinSequenceNumber = - MakeOptionalField(516, "min_sequence_number"); + SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); inline static const SchemaField kAddedSnapshotId = - MakeRequiredField(503, "added_snapshot_id"); + SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared()); inline static const SchemaField kAddedFilesCount = - MakeOptionalField(504, "added_files_count"); + SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); inline static const SchemaField kExistingFilesCount = - MakeOptionalField(505, "existing_files_count"); + SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); inline static const SchemaField kDeletedFilesCount = - MakeOptionalField(506, "deleted_files_count"); + SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); inline static const SchemaField kAddedRowsCount = - MakeOptionalField(512, "added_rows_count"); + SchemaField::MakeOptional(512, 
"added_rows_count", std::make_shared()); inline static const SchemaField kExistingRowsCount = - MakeOptionalField(513, "existing_rows_count"); + SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); inline static const SchemaField kDeletedRowsCount = - MakeOptionalField(514, "deleted_rows_count"); - inline static const SchemaField kPartitions = MakeOptionalField( + SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); + inline static const SchemaField kPartitions = SchemaField::MakeOptional( 507, "partitions", - MakeRequiredField(508, std::string(ListType::kElementName), - PartitionFieldSummary::Type())); + std::make_shared(SchemaField::MakeRequired( + 508, std::string(ListType::kElementName), + std::make_shared(PartitionFieldSummary::Type())))); inline static const SchemaField kKeyMetadata = - MakeOptionalField(519, "key_metadata"); + SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); inline static const SchemaField kFirstRowId = - MakeOptionalField(520, "first_row_id"); + SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); static const StructType& Type(); }; diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index a04f39796..afef71738 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -91,16 +91,4 @@ class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { std::string doc_; }; -template -inline SchemaField MakeRequiredField(int id, std::string name, Args&&... args) { - return SchemaField::MakeRequired(id, std::move(name), - std::make_shared(std::forward(args)...)); -} - -template -inline SchemaField MakeOptionalField(int id, std::string name, Args&&... 
args) { - return SchemaField::MakeOptional(id, std::move(name), - std::make_shared(std::forward(args)...)); -} - } // namespace iceberg From 5841811bbaa378b540155bcb709f8e0c4e11f8b6 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 13 May 2025 22:23:42 +0800 Subject: [PATCH 08/11] fix more review comments Signed-off-by: Junwang Zhao --- src/iceberg/manifest_entry.h | 65 ++++++++++++++++++++------------- src/iceberg/manifest_list.cc | 12 +++---- src/iceberg/manifest_list.h | 70 +++++++++++++++++++++--------------- 3 files changed, 88 insertions(+), 59 deletions(-) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index 0d44218e0..c0393528d 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -138,7 +138,7 @@ struct ICEBERG_EXPORT DataFile { std::map> upper_bounds; /// Field id: 131 /// Implementation-specific key metadata for encryption - std::optional> key_metadata; + std::vector key_metadata; /// Field id: 132 /// Element Field id: 133 /// Split offsets for the data file. 
For example, all row group offsets in a Parquet @@ -190,78 +190,95 @@ struct ICEBERG_EXPORT DataFile { /// present std::optional content_size_in_bytes; - inline static const SchemaField kContent = - SchemaField::MakeRequired(134, "content", std::make_shared()); - inline static const SchemaField kFilePath = - SchemaField::MakeRequired(100, "file_path", std::make_shared()); + inline static const SchemaField kContent = SchemaField::MakeRequired( + 134, "content", std::make_shared(), + "Contents of the file: 0=data, 1=position deletes, 2=equality deletes"); + inline static const SchemaField kFilePath = SchemaField::MakeRequired( + 100, "file_path", std::make_shared(), "Location URI with FS scheme"); inline static const SchemaField kFileFormat = - SchemaField::MakeRequired(101, "file_format", std::make_shared()); - inline static const SchemaField kRecordCount = - SchemaField::MakeRequired(103, "record_count", std::make_shared()); + SchemaField::MakeRequired(101, "file_format", std::make_shared(), + "File format name: avro, orc, or parquet"); + inline static const SchemaField kRecordCount = SchemaField::MakeRequired( + 103, "record_count", std::make_shared(), "Number of records in the file"); inline static const SchemaField kFileSize = - SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared()); + SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared(), + "Total file size in bytes"); inline static const SchemaField kColumnSizes = SchemaField::MakeOptional( 108, "column_sizes", std::make_shared( SchemaField::MakeRequired(117, std::string(MapType::kKeyName), std::make_shared()), SchemaField::MakeRequired(118, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to total size on disk"); inline static const SchemaField kValueCounts = SchemaField::MakeOptional( 109, "value_counts", std::make_shared( SchemaField::MakeRequired(119, std::string(MapType::kKeyName), std::make_shared()), 
SchemaField::MakeRequired(120, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to total count, including null and NaN"); inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional( 110, "null_value_counts", std::make_shared( SchemaField::MakeRequired(121, std::string(MapType::kKeyName), std::make_shared()), SchemaField::MakeRequired(122, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to null value count"); inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional( 137, "nan_value_counts", std::make_shared( SchemaField::MakeRequired(138, std::string(MapType::kKeyName), std::make_shared()), SchemaField::MakeRequired(139, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to number of NaN values in the column"); inline static const SchemaField kLowerBounds = SchemaField::MakeOptional( 125, "lower_bounds", std::make_shared( SchemaField::MakeRequired(126, std::string(MapType::kKeyName), std::make_shared()), SchemaField::MakeRequired(127, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to lower bound"); inline static const SchemaField kUpperBounds = SchemaField::MakeOptional( 128, "upper_bounds", std::make_shared( SchemaField::MakeRequired(129, std::string(MapType::kKeyName), std::make_shared()), SchemaField::MakeRequired(130, std::string(MapType::kValueName), - std::make_shared()))); + std::make_shared())), + "Map of column id to upper bound"); inline static const SchemaField kKeyMetadata = - SchemaField::MakeOptional(131, "key_metadata", std::make_shared()); + SchemaField::MakeOptional(131, "key_metadata", std::make_shared(), + "Encryption key metadata blob"); inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional( 132, "split_offsets", std::make_shared(SchemaField::MakeRequired( - 133, 
std::string(ListType::kElementName), std::make_shared()))); + 133, std::string(ListType::kElementName), std::make_shared())), + "Splittable offsets"); inline static const SchemaField kEqualityIds = SchemaField::MakeOptional( 135, "equality_ids", std::make_shared(SchemaField::MakeRequired( - 136, std::string(ListType::kElementName), std::make_shared()))); - inline static const SchemaField kSortOrderId = - SchemaField::MakeOptional(140, "sort_order_id", std::make_shared()); + 136, std::string(ListType::kElementName), std::make_shared())), + "Equality comparison field IDs"); + inline static const SchemaField kSortOrderId = SchemaField::MakeOptional( + 140, "sort_order_id", std::make_shared(), "Sort order ID"); inline static const SchemaField kFirstRowId = - SchemaField::MakeOptional(142, "first_row_id", std::make_shared()); + SchemaField::MakeOptional(142, "first_row_id", std::make_shared(), + "Starting row ID to assign to new rows"); inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional( - 143, "referenced_data_file", std::make_shared()); + 143, "referenced_data_file", std::make_shared(), + "Fully qualified location (URI with FS scheme) of a data file that all deletes " + "reference"); inline static const SchemaField kContentOffset = - SchemaField::MakeOptional(144, "content_offset", std::make_shared()); + SchemaField::MakeOptional(144, "content_offset", std::make_shared(), + "The offset in the file where the content starts"); inline static const SchemaField kContentSize = SchemaField::MakeOptional( - 145, "content_size_in_bytes", std::make_shared()); + 145, "content_size_in_bytes", std::make_shared(), + "The length of referenced content stored in the file"); static std::shared_ptr Type(std::shared_ptr partition_type); }; diff --git a/src/iceberg/manifest_list.cc b/src/iceberg/manifest_list.cc index 95ad98aad..a0cf2053e 100644 --- a/src/iceberg/manifest_list.cc +++ b/src/iceberg/manifest_list.cc @@ -26,22 +26,22 @@ namespace iceberg { const 
StructType& PartitionFieldSummary::Type() { - static const std::shared_ptr instance{new StructType({ + static const StructType kInstance{{ PartitionFieldSummary::kConsTainsNull, PartitionFieldSummary::kContainsNaN, PartitionFieldSummary::kLowerBound, PartitionFieldSummary::kUpperBound, - })}; - return *instance; + }}; + return kInstance; } const StructType& ManifestFile::Type() { - static const std::shared_ptr instance{new StructType( + static const StructType kInstance( {kManifestPath, kManifestLength, kPartitionSpecId, kContent, kSequenceNumber, kMinSequenceNumber, kAddedSnapshotId, kAddedFilesCount, kExistingFilesCount, kDeletedFilesCount, kAddedRowsCount, kExistingRowsCount, kDeletedRowsCount, - kPartitions, kKeyMetadata, kFirstRowId})}; - return *instance; + kPartitions, kKeyMetadata, kFirstRowId}); + return kInstance; } } // namespace iceberg diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index 78719fe85..a538773b3 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -81,13 +81,17 @@ struct ICEBERG_EXPORT PartitionFieldSummary { std::optional> upper_bound; inline static const SchemaField kConsTainsNull = - SchemaField::MakeRequired(509, "contains_null", std::make_shared()); + SchemaField::MakeRequired(509, "contains_null", std::make_shared(), + "True if any file has a null partition value"); inline static const SchemaField kContainsNaN = - SchemaField::MakeOptional(518, "contains_nan", std::make_shared()); + SchemaField::MakeOptional(518, "contains_nan", std::make_shared(), + "True if any file has a nan partition value"); inline static const SchemaField kLowerBound = - SchemaField::MakeOptional(510, "lower_bound", std::make_shared()); + SchemaField::MakeOptional(510, "lower_bound", std::make_shared(), + "Partition lower bound for all files"); inline static const SchemaField kUpperBound = - SchemaField::MakeOptional(511, "upper_bound", std::make_shared()); + SchemaField::MakeOptional(511, "upper_bound", 
std::make_shared(), + "Partition upper bound for all files"); static const StructType& Type(); }; @@ -165,40 +169,48 @@ struct ICEBERG_EXPORT ManifestFile { bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } inline static const SchemaField kManifestPath = - SchemaField::MakeRequired(500, "manifest_path", std::make_shared()); - inline static const SchemaField kManifestLength = - SchemaField::MakeRequired(501, "manifest_length", std::make_shared()); - inline static const SchemaField kPartitionSpecId = - SchemaField::MakeRequired(502, "partition_spec_id", std::make_shared()); + SchemaField::MakeRequired(500, "manifest_path", std::make_shared(), + "Location URI with FS scheme"); + inline static const SchemaField kManifestLength = SchemaField::MakeRequired( + 501, "manifest_length", std::make_shared(), "Total file size in bytes"); + inline static const SchemaField kPartitionSpecId = SchemaField::MakeRequired( + 502, "partition_spec_id", std::make_shared(), "Spec ID used to write"); inline static const SchemaField kContent = - SchemaField::MakeOptional(517, "content", std::make_shared()); + SchemaField::MakeOptional(517, "content", std::make_shared(), + "Contents of the manifest: 0=data, 1=deletes"); inline static const SchemaField kSequenceNumber = - SchemaField::MakeOptional(515, "sequence_number", std::make_shared()); + SchemaField::MakeOptional(515, "sequence_number", std::make_shared(), + "Sequence number when the manifest was added"); inline static const SchemaField kMinSequenceNumber = - SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared()); + SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared(), + "Lowest sequence number in the manifest"); inline static const SchemaField kAddedSnapshotId = - SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared()); - inline static const SchemaField kAddedFilesCount = - SchemaField::MakeOptional(504, "added_files_count", std::make_shared()); - 
inline static const SchemaField kExistingFilesCount = - SchemaField::MakeOptional(505, "existing_files_count", std::make_shared()); - inline static const SchemaField kDeletedFilesCount = - SchemaField::MakeOptional(506, "deleted_files_count", std::make_shared()); - inline static const SchemaField kAddedRowsCount = - SchemaField::MakeOptional(512, "added_rows_count", std::make_shared()); - inline static const SchemaField kExistingRowsCount = - SchemaField::MakeOptional(513, "existing_rows_count", std::make_shared()); - inline static const SchemaField kDeletedRowsCount = - SchemaField::MakeOptional(514, "deleted_rows_count", std::make_shared()); + SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared(), + "Snapshot ID that added the manifest"); + inline static const SchemaField kAddedFilesCount = SchemaField::MakeOptional( + 504, "added_files_count", std::make_shared(), "Added entry count"); + inline static const SchemaField kExistingFilesCount = SchemaField::MakeOptional( + 505, "existing_files_count", std::make_shared(), "Existing entry count"); + inline static const SchemaField kDeletedFilesCount = SchemaField::MakeOptional( + 506, "deleted_files_count", std::make_shared(), "Deleted entry count"); + inline static const SchemaField kAddedRowsCount = SchemaField::MakeOptional( + 512, "added_rows_count", std::make_shared(), "Added rows count"); + inline static const SchemaField kExistingRowsCount = SchemaField::MakeOptional( + 513, "existing_rows_count", std::make_shared(), "Existing rows count"); + inline static const SchemaField kDeletedRowsCount = SchemaField::MakeOptional( + 514, "deleted_rows_count", std::make_shared(), "Deleted rows count"); inline static const SchemaField kPartitions = SchemaField::MakeOptional( 507, "partitions", std::make_shared(SchemaField::MakeRequired( 508, std::string(ListType::kElementName), - std::make_shared(PartitionFieldSummary::Type())))); + std::make_shared(PartitionFieldSummary::Type()))), + "Summary for each 
partition"); inline static const SchemaField kKeyMetadata = - SchemaField::MakeOptional(519, "key_metadata", std::make_shared()); - inline static const SchemaField kFirstRowId = - SchemaField::MakeOptional(520, "first_row_id", std::make_shared()); + SchemaField::MakeOptional(519, "key_metadata", std::make_shared(), + "Encryption key metadata blob"); + inline static const SchemaField kFirstRowId = SchemaField::MakeOptional( + 520, "first_row_id", std::make_shared(), + "Starting row ID to assign to new rows in ADDED data files"); static const StructType& Type(); }; From 83db4149a698cdecf497e46377b206d3be335347 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 20 May 2025 18:36:20 +0800 Subject: [PATCH 09/11] When we don't know added files count, we assume that there are added files. Co-authored-by: Fokko Driesprong --- src/iceberg/manifest_list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index a538773b3..8caf9b4ba 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -160,7 +160,7 @@ struct ICEBERG_EXPORT ManifestFile { int64_t first_row_id; /// \brief Checks if this manifest file contains entries with ADDED status. - bool has_added_files() const { return added_files_count.value_or(-1) > 0; } + bool has_added_files() const { return added_files_count.value_or(1) > 0; } /// \brief Checks if this manifest file contains entries with EXISTING status. 
bool has_existing_files() const { return existing_files_count.value_or(-1) > 0; } From b3b65503b96fa31921f564b6123ccd0f2cae5d56 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 20 May 2025 18:36:44 +0800 Subject: [PATCH 10/11] When we don't know existing files count, we assume that there are existing files Co-authored-by: Fokko Driesprong --- src/iceberg/manifest_list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index 8caf9b4ba..e76ef133c 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -163,7 +163,7 @@ struct ICEBERG_EXPORT ManifestFile { bool has_added_files() const { return added_files_count.value_or(1) > 0; } /// \brief Checks if this manifest file contains entries with EXISTING status. - bool has_existing_files() const { return existing_files_count.value_or(-1) > 0; } + bool has_existing_files() const { return existing_files_count.value_or(1) > 0; } /// \brief Checks if this manifest file contains entries with DELETED status bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } From 9b27c31611ea89cdb4c36142e08932b736699888 Mon Sep 17 00:00:00 2001 From: Junwang Zhao Date: Tue, 20 May 2025 18:38:54 +0800 Subject: [PATCH 11/11] When we don't know deleted files count, we assume that there are deleted files. 
--- src/iceberg/manifest_list.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h index e76ef133c..af2b8dbbc 100644 --- a/src/iceberg/manifest_list.h +++ b/src/iceberg/manifest_list.h @@ -166,7 +166,7 @@ struct ICEBERG_EXPORT ManifestFile { bool has_existing_files() const { return existing_files_count.value_or(1) > 0; } /// \brief Checks if this manifest file contains entries with DELETED status - bool has_deleted_files() const { return deleted_files_count.value_or(-1) > 0; } + bool has_deleted_files() const { return deleted_files_count.value_or(1) > 0; } inline static const SchemaField kManifestPath = SchemaField::MakeRequired(500, "manifest_path", std::make_shared(),