diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index 328befd54..85cf84d2d 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -23,6 +23,8 @@ set(ICEBERG_SOURCES
     expression/expression.cc
     file_reader.cc
     json_internal.cc
+    manifest_entry.cc
+    manifest_list.cc
     metadata_columns.cc
     name_mapping.cc
     partition_field.cc
diff --git a/src/iceberg/file_format.h b/src/iceberg/file_format.h
index 883782c7c..eebb76d14 100644
--- a/src/iceberg/file_format.h
+++ b/src/iceberg/file_format.h
@@ -25,6 +25,7 @@
 #include <string_view>
 
 #include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
 
 namespace iceberg {
 
@@ -50,4 +51,33 @@ ICEBERG_EXPORT inline std::string_view ToString(FileFormatType format_type) {
   }
 }
 
+/// \brief Convert a string to a FileFormatType
+///
+/// The comparison ignores ASCII case, so "parquet", "Parquet" and "PARQUET" all map to
+/// FileFormatType::kParquet.
+///
+/// \param str Name of the file format.
+/// \return The matching FileFormatType, or an InvalidArgument error when `str` is not
+/// one of parquet, avro, orc or puffin.
+ICEBERG_EXPORT constexpr Result<FileFormatType> FileFormatTypeFromString(
+    std::string_view str) noexcept {
+  // Allocation-free, ASCII-only case folding: upper-case letters of the input are
+  // lowered on the fly and compared against the lower-case format name.
+  constexpr auto matches = [](std::string_view input,
+                              std::string_view lower_name) noexcept {
+    if (input.size() != lower_name.size()) return false;
+    for (std::string_view::size_type i = 0; i < input.size(); ++i) {
+      const char c = input[i];
+      const char folded =
+          (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
+      if (folded != lower_name[i]) return false;
+    }
+    return true;
+  };
+  if (matches(str, "parquet")) return FileFormatType::kParquet;
+  if (matches(str, "avro")) return FileFormatType::kAvro;
+  if (matches(str, "orc")) return FileFormatType::kOrc;
+  if (matches(str, "puffin")) return FileFormatType::kPuffin;
+  return InvalidArgument("Invalid file format type: {}", str);
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc
new file mode 100644
index 000000000..16df2f029
--- /dev/null
+++ b/src/iceberg/manifest_entry.cc
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/manifest_entry.h"
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "iceberg/schema_field.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+// Builds the data_file struct type: the fixed DataFile fields in the order listed
+// below, with the caller-supplied partition struct inserted as required field 102.
+std::shared_ptr<StructType> DataFile::Type(std::shared_ptr<StructType> partition_type) {
+  return std::make_shared<StructType>(std::vector<SchemaField>{
+      kContent,
+      kFilePath,
+      kFileFormat,
+      SchemaField::MakeRequired(102, "partition", std::move(partition_type)),
+      kRecordCount,
+      kFileSize,
+      kColumnSizes,
+      kValueCounts,
+      kNullValueCounts,
+      kNanValueCounts,
+      kLowerBounds,
+      kUpperBounds,
+      kKeyMetadata,
+      kSplitOffsets,
+      kEqualityIds,
+      kSortOrderId,
+      kFirstRowId,
+      kReferencedDataFile,
+      kContentOffset,
+      kContentSize});
+}
+
+// Convenience overload: derives the data_file type from the partition type first, then
+// wraps it into the manifest entry type.
+std::shared_ptr<StructType> ManifestEntry::TypeFromPartitionType(
+    std::shared_ptr<StructType> partition_type) {
+  return TypeFromDataFileType(DataFile::Type(std::move(partition_type)));
+}
+
+// Builds the manifest entry struct type: the tracking fields followed by the given
+// data_file struct as required field 2.
+std::shared_ptr<StructType> ManifestEntry::TypeFromDataFileType(
+    std::shared_ptr<StructType> datafile_type) {
+  return std::make_shared<StructType>(std::vector<SchemaField>{
+      kStatus, kSnapshotId, kSequenceNumber, kFileSequenceNumber,
+      SchemaField::MakeRequired(2, "data_file", std::move(datafile_type))});
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h
new file mode 100644
index 000000000..c0393528d
--- /dev/null
+++ b/src/iceberg/manifest_entry.h
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <any>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "iceberg/file_format.h"
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+/// \brief Status of a manifest entry, stored as an integer code.
+enum class ManifestStatus {
+  kExisting = 0,
+  kAdded = 1,
+  kDeleted = 2,
+};
+
+/// \brief Convert an integer code into the corresponding ManifestStatus
+///
+/// \param status Integer code: 0 (existing), 1 (added) or 2 (deleted).
+/// \return The matching ManifestStatus, or an InvalidArgument error for any other value.
+ICEBERG_EXPORT constexpr Result<ManifestStatus> ManifestStatusFromInt(
+    int status) noexcept {
+  switch (status) {
+    case 0:
+      return ManifestStatus::kExisting;
+    case 1:
+      return ManifestStatus::kAdded;
+    case 2:
+      return ManifestStatus::kDeleted;
+    default:
+      return InvalidArgument("Invalid manifest status: {}", status);
+  }
+}
+
+/// \brief Kind of content stored in a data file, stored as an integer code.
+enum class DataFileContent {
+  kData = 0,
+  kPositionDeletes = 1,
+  kEqualityDeletes = 2,
+};
+
+/// \brief Convert an integer code into the corresponding DataFileContent
+///
+/// \param content Integer code: 0 (data), 1 (position deletes) or 2 (equality deletes).
+/// \return The matching DataFileContent, or an InvalidArgument error for any other
+/// value.
+ICEBERG_EXPORT constexpr Result<DataFileContent> DataFileContentFromInt(
+    int content) noexcept {
+  switch (content) {
+    case 0:
+      return DataFileContent::kData;
+    case 1:
+      return DataFileContent::kPositionDeletes;
+    case 2:
+      return DataFileContent::kEqualityDeletes;
+    default:
+      return InvalidArgument("Invalid data file content: {}", content);
+  }
+}
+
+/// \brief DataFile carries data file path, partition tuple, metrics, ...
+struct ICEBERG_EXPORT DataFile {
+  /// Field id: 134
+  /// Type of content stored by the data file: data, equality deletes, or position
+  /// deletes (all v1 files are data files)
+  ///
+  /// Defaults to data so a default-constructed DataFile never holds an indeterminate
+  /// value.
+  DataFileContent content = DataFileContent::kData;
+  /// Field id: 100
+  /// Full URI for the file with FS scheme
+  std::string file_path;
+  /// Field id: 101
+  /// File format type, avro, orc, parquet, or puffin
+  ///
+  /// Value-initialized so the member is never read while indeterminate; callers are
+  /// still expected to set the real format.
+  FileFormatType file_format{};
+  /// Field id: 102
+  /// Partition data tuple, schema based on the partition spec output using partition
+  /// field ids for the struct field ids
+  /// TODO(zhjwpku): use StructLike to represent partition data tuple
+  std::any partition;
+  /// Field id: 103
+  /// Number of records in this file, or the cardinality of a deletion vector
+  int64_t record_count = 0;
+  /// Field id: 104
+  /// Total file size in bytes
+  int64_t file_size_in_bytes = 0;
+  /// Field id: 108
+  /// Key field id: 117
+  /// Value field id: 118
+  /// Map from column id to the total size on disk of all regions that store the column.
+  /// Does not include bytes necessary to read other columns, like footers. Leave null for
+  /// row-oriented formats (Avro)
+  std::map<int32_t, int64_t> column_sizes;
+  /// Field id: 109
+  /// Key field id: 119
+  /// Value field id: 120
+  /// Map from column id to number of values in the column (including null and NaN values)
+  std::map<int32_t, int64_t> value_counts;
+  /// Field id: 110
+  /// Key field id: 121
+  /// Value field id: 122
+  /// Map from column id to number of null values in the column
+  std::map<int32_t, int64_t> null_value_counts;
+  /// Field id: 137
+  /// Key field id: 138
+  /// Value field id: 139
+  /// Map from column id to number of NaN values in the column
+  std::map<int32_t, int64_t> nan_value_counts;
+  /// Field id: 125
+  /// Key field id: 126
+  /// Value field id: 127
+  /// Map from column id to lower bound in the column serialized as binary.
+  /// Each value must be less than or equal to all non-null, non-NaN values in the column
+  /// for the file.
+  std::map<int32_t, std::vector<uint8_t>> lower_bounds;
+  /// Field id: 128
+  /// Key field id: 129
+  /// Value field id: 130
+  /// Map from column id to upper bound in the column serialized as binary.
+  /// Each value must be greater than or equal to all non-null, non-NaN values in the
+  /// column for the file.
+  std::map<int32_t, std::vector<uint8_t>> upper_bounds;
+  /// Field id: 131
+  /// Implementation-specific key metadata for encryption
+  std::vector<uint8_t> key_metadata;
+  /// Field id: 132
+  /// Element Field id: 133
+  /// Split offsets for the data file. For example, all row group offsets in a Parquet
+  /// file. Must be sorted ascending.
+  std::vector<int64_t> split_offsets;
+  /// Field id: 135
+  /// Element Field id: 136
+  /// Field ids used to determine row equality in equality delete files. Required when
+  /// content=2 and should be null otherwise. Fields with ids listed in this column must
+  /// be present in the delete file.
+  std::vector<int32_t> equality_ids;
+  /// Field id: 140
+  /// ID representing sort order for this file
+  ///
+  /// If sort order ID is missing or unknown, then the order is assumed to be unsorted.
+  /// Only data files and equality delete files should be written with a non-null order
+  /// id. Position deletes are required to be sorted by file and position, not a table
+  /// order, and should set sort order id to null. Readers must ignore sort order id for
+  /// position delete files.
+  std::optional<int32_t> sort_order_id;
+  /// This field is not part of the spec, so it is not serialized into the manifest file.
+  /// It only exists in the in-memory representation used during processing.
+  int32_t partition_spec_id = 0;
+  /// Field id: 142
+  /// The _row_id for the first row in the data file.
+  ///
+  /// Reference:
+  /// - [First Row ID
+  /// Inheritance](https://github.com/apache/iceberg/blob/main/format/spec.md#first-row-id-inheritance)
+  std::optional<int64_t> first_row_id;
+  /// Field id: 143
+  /// Fully qualified location (URI with FS scheme) of a data file that all deletes
+  /// reference.
+  ///
+  /// Position delete metadata can use referenced_data_file when all deletes tracked by
+  /// the entry are in a single data file. Setting the referenced file is required for
+  /// deletion vectors.
+  std::optional<std::string> referenced_data_file;
+  /// Field id: 144
+  /// The offset in the file where the content starts.
+  ///
+  /// The content_offset and content_size_in_bytes fields are used to reference a specific
+  /// blob for direct access to a deletion vector. For deletion vectors, these values are
+  /// required and must exactly match the offset and length stored in the Puffin footer
+  /// for the deletion vector blob.
+  std::optional<int64_t> content_offset;
+  /// Field id: 145
+  /// The length of a referenced content stored in the file; required if content_offset is
+  /// present
+  std::optional<int64_t> content_size_in_bytes;
+
+  inline static const SchemaField kContent = SchemaField::MakeRequired(
+      134, "content", std::make_shared<IntType>(),
+      "Contents of the file: 0=data, 1=position deletes, 2=equality deletes");
+  inline static const SchemaField kFilePath = SchemaField::MakeRequired(
+      100, "file_path", std::make_shared<StringType>(), "Location URI with FS scheme");
+  inline static const SchemaField kFileFormat =
+      SchemaField::MakeRequired(101, "file_format", std::make_shared<StringType>(),
+                                "File format name: avro, orc, or parquet");
+  inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
+      103, "record_count", std::make_shared<LongType>(), "Number of records in the file");
+  inline static const SchemaField kFileSize =
+      SchemaField::MakeRequired(104, "file_size_in_bytes", std::make_shared<LongType>(),
+                                "Total file size in bytes");
+  inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
+      108, "column_sizes",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(117, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(118, std::string(MapType::kValueName),
+                                    std::make_shared<LongType>())),
+      "Map of column id to total size on disk");
+  inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
+      109, "value_counts",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(119, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(120, std::string(MapType::kValueName),
+                                    std::make_shared<LongType>())),
+      "Map of column id to total count, including null and NaN");
+  inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
+      110, "null_value_counts",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(121, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(122, std::string(MapType::kValueName),
+                                    std::make_shared<LongType>())),
+      "Map of column id to null value count");
+  inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
+      137, "nan_value_counts",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(138, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(139, std::string(MapType::kValueName),
+                                    std::make_shared<LongType>())),
+      "Map of column id to number of NaN values in the column");
+  inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
+      125, "lower_bounds",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(126, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(127, std::string(MapType::kValueName),
+                                    std::make_shared<BinaryType>())),
+      "Map of column id to lower bound");
+  inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
+      128, "upper_bounds",
+      std::make_shared<MapType>(
+          SchemaField::MakeRequired(129, std::string(MapType::kKeyName),
+                                    std::make_shared<IntType>()),
+          SchemaField::MakeRequired(130, std::string(MapType::kValueName),
+                                    std::make_shared<BinaryType>())),
+      "Map of column id to upper bound");
+  inline static const SchemaField kKeyMetadata =
+      SchemaField::MakeOptional(131, "key_metadata", std::make_shared<BinaryType>(),
+                                "Encryption key metadata blob");
+  inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
+      132, "split_offsets",
+      std::make_shared<ListType>(SchemaField::MakeRequired(
+          133, std::string(ListType::kElementName), std::make_shared<LongType>())),
+      "Splittable offsets");
+  inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
+      135, "equality_ids",
+      std::make_shared<ListType>(SchemaField::MakeRequired(
+          136, std::string(ListType::kElementName), std::make_shared<IntType>())),
+      "Equality comparison field IDs");
+  inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
+      140, "sort_order_id", std::make_shared<IntType>(), "Sort order ID");
+  inline static const SchemaField kFirstRowId =
+      SchemaField::MakeOptional(142, "first_row_id", std::make_shared<LongType>(),
+                                "Starting row ID to assign to new rows");
+  inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
+      143, "referenced_data_file", std::make_shared<StringType>(),
+      "Fully qualified location (URI with FS scheme) of a data file that all deletes "
+      "reference");
+  inline static const SchemaField kContentOffset =
+      SchemaField::MakeOptional(144, "content_offset", std::make_shared<LongType>(),
+                                "The offset in the file where the content starts");
+  inline static const SchemaField kContentSize = SchemaField::MakeOptional(
+      145, "content_size_in_bytes", std::make_shared<LongType>(),
+      "The length of referenced content stored in the file");
+
+  /// \brief Build the struct type of a data file.
+  /// \param partition_type Struct type of the partition tuple, used for field 102.
+  /// \return Struct type made of the schema fields above plus the partition field.
+  static std::shared_ptr<StructType> Type(std::shared_ptr<StructType> partition_type);
+};
+
+/// \brief A manifest is an immutable Avro file that lists data files or delete files,
+/// along with each file's partition data tuple, metrics, and tracking information.
+
+/// \brief The schema of a manifest file
+struct ICEBERG_EXPORT ManifestEntry {
+  /// Field id: 0
+  /// Used to track additions and deletions. Deletes are informational only and not used
+  /// in scans.
+  ///
+  /// Defaults to added so a default-constructed entry never holds an indeterminate
+  /// value.
+  ManifestStatus status = ManifestStatus::kAdded;
+  /// Field id: 1
+  /// Snapshot id where the file was added, or deleted if status is 2. Inherited when
+  /// null.
+  std::optional<int64_t> snapshot_id;
+  /// Field id: 3
+  /// Data sequence number of the file. Inherited when null and status is 1 (added).
+  std::optional<int64_t> sequence_number;
+  /// Field id: 4
+  /// File sequence number indicating when the file was added. Inherited when null and
+  /// status is 1 (added).
+  std::optional<int64_t> file_sequence_number;
+  /// Field id: 2
+  /// File path, partition tuple, metrics, ...
+  DataFile data_file;
+
+  inline static const SchemaField kStatus =
+      SchemaField::MakeRequired(0, "status", std::make_shared<IntType>());
+  inline static const SchemaField kSnapshotId =
+      SchemaField::MakeOptional(1, "snapshot_id", std::make_shared<LongType>());
+  inline static const SchemaField kSequenceNumber =
+      SchemaField::MakeOptional(3, "sequence_number", std::make_shared<LongType>());
+  inline static const SchemaField kFileSequenceNumber =
+      SchemaField::MakeOptional(4, "file_sequence_number", std::make_shared<LongType>());
+
+  /// \brief Build the manifest entry struct type from a partition struct type.
+  /// \param partition_type Struct type of the partition tuple.
+  /// \return Struct type of a manifest entry whose data_file uses that partition type.
+  static std::shared_ptr<StructType> TypeFromPartitionType(
+      std::shared_ptr<StructType> partition_type);
+  /// \brief Build the manifest entry struct type from a data file struct type.
+  /// \param datafile_type Struct type used for the data_file field (field id 2).
+  /// \return Struct type of a manifest entry.
+  static std::shared_ptr<StructType> TypeFromDataFileType(
+      std::shared_ptr<StructType> datafile_type);
+};
+
+}  // namespace iceberg
diff --git a/src/iceberg/manifest_list.cc b/src/iceberg/manifest_list.cc
new file mode 100644
index 000000000..a0cf2053e
--- /dev/null
+++ b/src/iceberg/manifest_list.cc
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/manifest_list.h"
+
+#include
+
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+// Returns the struct type of a partition field summary, made of the contains_null,
+// contains_nan, lower_bound and upper_bound schema fields. The instance is a
+// function-local static: it is built on first call and the same object is returned to
+// every caller.
+const StructType& PartitionFieldSummary::Type() {
+  static const StructType kInstance{{
+      PartitionFieldSummary::kConsTainsNull,
+      PartitionFieldSummary::kContainsNaN,
+      PartitionFieldSummary::kLowerBound,
+      PartitionFieldSummary::kUpperBound,
+  }};
+  return kInstance;
+}
+
+// Returns the struct type of a manifest list entry, made of the ManifestFile schema
+// fields in the order listed below. Like PartitionFieldSummary::Type(), the instance is
+// a function-local static shared by all callers.
+const StructType& ManifestFile::Type() {
+  static const StructType kInstance(
+      {kManifestPath, kManifestLength, kPartitionSpecId, kContent, kSequenceNumber,
+       kMinSequenceNumber, kAddedSnapshotId, kAddedFilesCount, kExistingFilesCount,
+       kDeletedFilesCount, kAddedRowsCount, kExistingRowsCount, kDeletedRowsCount,
+       kPartitions, kKeyMetadata, kFirstRowId});
+  return kInstance;
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/manifest_list.h b/src/iceberg/manifest_list.h
new file mode 100644
index 000000000..af2b8dbbc
--- /dev/null
+++ b/src/iceberg/manifest_list.h
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+/// \file iceberg/manifest_list.h
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "iceberg/iceberg_export.h"
+#include "iceberg/result.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+/// \brief The type of files tracked by the manifest, either data or delete files; 0 for
+/// all v1 manifests
+enum class ManifestContent {
+  /// The manifest content is data.
+  kData = 0,
+  /// The manifest content is deletes.
+  kDeletes = 1,
+};
+
+/// \brief Get the name of a manifest content type
+///
+/// \param type Manifest content type to name.
+/// \return "data" or "deletes"; "unknown" if `type` holds a value that is not one of
+/// the enumerators.
+ICEBERG_EXPORT constexpr std::string_view ManifestContentToString(
+    ManifestContent type) noexcept {
+  switch (type) {
+    case ManifestContent::kData:
+      return "data";
+    case ManifestContent::kDeletes:
+      return "deletes";
+  }
+  // Only reachable when `type` was produced by casting an out-of-range integer.
+  // Returning here keeps every control path well-defined instead of flowing off the end
+  // of a value-returning function.
+  return "unknown";
+}
+
+/// \brief Get the manifest content type from its name
+///
+/// \param str Name of the content type; must be exactly "data" or "deletes".
+/// \return The matching ManifestContent, or an InvalidArgument error for any other
+/// string.
+ICEBERG_EXPORT constexpr Result<ManifestContent> ManifestContentFromString(
+    std::string_view str) noexcept {
+  if (str == "data") return ManifestContent::kData;
+  if (str == "deletes") return ManifestContent::kDeletes;
+  return InvalidArgument("Invalid manifest content type: {}", str);
+}
+
+/// \brief Field summary for partition field in the spec.
+///
+/// Each field of this corresponds to a field in the manifest file's partition spec.
+struct ICEBERG_EXPORT PartitionFieldSummary {
+  /// Field id: 509
+  /// Whether the manifest contains at least one partition with a null value for the field
+  bool contains_null = false;
+  /// Field id: 518
+  /// Whether the manifest contains at least one partition with a NaN value for the field
+  std::optional<bool> contains_nan;
+  /// Field id: 510
+  /// Lower bound for the non-null, non-NaN values in the partition field, or null if all
+  /// values are null or NaN
+  std::optional<std::vector<uint8_t>> lower_bound;
+  /// Field id: 511
+  /// Upper bound for the non-null, non-NaN values in the partition field, or null if all
+  /// values are null or NaN
+  std::optional<std::vector<uint8_t>> upper_bound;
+
+  inline static const SchemaField kContainsNull =
+      SchemaField::MakeRequired(509, "contains_null", std::make_shared<BooleanType>(),
+                                "True if any file has a null partition value");
+  /// Misspelled former name of kContainsNull; kept as a copy so that code referring to
+  /// the old spelling keeps compiling. Prefer kContainsNull in new code.
+  inline static const SchemaField kConsTainsNull = kContainsNull;
+  inline static const SchemaField kContainsNaN =
+      SchemaField::MakeOptional(518, "contains_nan", std::make_shared<BooleanType>(),
+                                "True if any file has a nan partition value");
+  inline static const SchemaField kLowerBound =
+      SchemaField::MakeOptional(510, "lower_bound", std::make_shared<BinaryType>(),
+                                "Partition lower bound for all files");
+  inline static const SchemaField kUpperBound =
+      SchemaField::MakeOptional(511, "upper_bound", std::make_shared<BinaryType>(),
+                                "Partition upper bound for all files");
+
+  /// \brief Struct type made of the four schema fields of a partition field summary.
+  static const StructType& Type();
+};
+
+/// \brief Entry in a manifest list.
+struct ICEBERG_EXPORT ManifestFile {
+  /// Field id: 500
+  /// Location of the manifest file
+  std::string manifest_path;
+  /// Field id: 501
+  /// Length of the manifest file in bytes
+  int64_t manifest_length = 0;
+  /// Field id: 502
+  /// ID of a partition spec used to write the manifest; must be listed in table metadata
+  /// partition-specs
+  int32_t partition_spec_id = 0;
+  /// Field id: 517
+  /// The type of files tracked by the manifest, either data or delete files; 0 for all v1
+  /// manifests
+  ManifestContent content = ManifestContent::kData;
+  /// Field id: 515
+  /// The sequence number when the manifest was added to the table; use 0 when reading v1
+  /// manifest lists
+  int64_t sequence_number = 0;
+  /// Field id: 516
+  /// The minimum data sequence number of all live data or delete files in the manifest;
+  /// use 0 when reading v1 manifest lists
+  int64_t min_sequence_number = 0;
+  /// Field id: 503
+  /// ID of the snapshot where the manifest file was added
+  int64_t added_snapshot_id = 0;
+  /// Field id: 504
+  /// Number of entries in the manifest that have status ADDED (1), when null this is
+  /// assumed to be non-zero
+  std::optional<int32_t> added_files_count;
+  /// Field id: 505
+  /// Number of entries in the manifest that have status EXISTING (0), when null this is
+  /// assumed to be non-zero
+  std::optional<int32_t> existing_files_count;
+  /// Field id: 506
+  /// Number of entries in the manifest that have status DELETED (2), when null this is
+  /// assumed to be non-zero
+  std::optional<int32_t> deleted_files_count;
+  /// Field id: 512
+  /// Number of rows in all of files in the manifest that have status ADDED, when null
+  /// this is assumed to be non-zero
+  std::optional<int64_t> added_rows_count;
+  /// Field id: 513
+  /// Number of rows in all of files in the manifest that have status EXISTING, when null
+  /// this is assumed to be non-zero
+  std::optional<int64_t> existing_rows_count;
+  /// Field id: 514
+  /// Number of rows in all of files in the manifest that have status DELETED, when null
+  /// this is assumed to be non-zero
+  std::optional<int64_t> deleted_rows_count;
+  /// Field id: 507
+  /// Element field id: 508
+  /// A list of field summaries for each partition field in the spec. Each field in the
+  /// list corresponds to a field in the manifest file's partition spec.
+  std::vector<PartitionFieldSummary> partitions;
+  /// Field id: 519
+  /// Implementation-specific key metadata for encryption
+  std::vector<uint8_t> key_metadata;
+  /// Field id: 520
+  /// The starting _row_id to assign to rows added by ADDED data files
+  ///
+  /// NOTE(review): the schema field kFirstRowId is optional while this member cannot
+  /// represent "absent" -- confirm whether it should become std::optional.
+  int64_t first_row_id = 0;
+
+  /// \brief Checks if this manifest file contains entries with ADDED status.
+  ///
+  /// A missing count is treated as non-zero, so the answer is true when unknown.
+  bool has_added_files() const { return added_files_count.value_or(1) > 0; }
+
+  /// \brief Checks if this manifest file contains entries with EXISTING status.
+  ///
+  /// A missing count is treated as non-zero, so the answer is true when unknown.
+  bool has_existing_files() const { return existing_files_count.value_or(1) > 0; }
+
+  /// \brief Checks if this manifest file contains entries with DELETED status
+  ///
+  /// A missing count is treated as non-zero, so the answer is true when unknown.
+  bool has_deleted_files() const { return deleted_files_count.value_or(1) > 0; }
+
+  inline static const SchemaField kManifestPath =
+      SchemaField::MakeRequired(500, "manifest_path", std::make_shared<StringType>(),
+                                "Location URI with FS scheme");
+  inline static const SchemaField kManifestLength = SchemaField::MakeRequired(
+      501, "manifest_length", std::make_shared<LongType>(), "Total file size in bytes");
+  inline static const SchemaField kPartitionSpecId = SchemaField::MakeRequired(
+      502, "partition_spec_id", std::make_shared<IntType>(), "Spec ID used to write");
+  inline static const SchemaField kContent =
+      SchemaField::MakeOptional(517, "content", std::make_shared<IntType>(),
+                                "Contents of the manifest: 0=data, 1=deletes");
+  inline static const SchemaField kSequenceNumber =
+      SchemaField::MakeOptional(515, "sequence_number", std::make_shared<LongType>(),
+                                "Sequence number when the manifest was added");
+  inline static const SchemaField kMinSequenceNumber =
+      SchemaField::MakeOptional(516, "min_sequence_number", std::make_shared<LongType>(),
+                                "Lowest sequence number in the manifest");
+  inline static const SchemaField kAddedSnapshotId =
+      SchemaField::MakeRequired(503, "added_snapshot_id", std::make_shared<LongType>(),
+                                "Snapshot ID that added the manifest");
+  inline static const SchemaField kAddedFilesCount = SchemaField::MakeOptional(
+      504, "added_files_count", std::make_shared<IntType>(), "Added entry count");
+  inline static const SchemaField kExistingFilesCount = SchemaField::MakeOptional(
+      505, "existing_files_count", std::make_shared<IntType>(), "Existing entry count");
+  inline static const SchemaField kDeletedFilesCount = SchemaField::MakeOptional(
+      506, "deleted_files_count", std::make_shared<IntType>(), "Deleted entry count");
+  inline static const SchemaField kAddedRowsCount = SchemaField::MakeOptional(
+      512, "added_rows_count", std::make_shared<LongType>(), "Added rows count");
+  inline static const SchemaField kExistingRowsCount = SchemaField::MakeOptional(
+      513, "existing_rows_count", std::make_shared<LongType>(), "Existing rows count");
+  inline static const SchemaField kDeletedRowsCount = SchemaField::MakeOptional(
+      514, "deleted_rows_count", std::make_shared<LongType>(), "Deleted rows count");
+  inline static const SchemaField kPartitions = SchemaField::MakeOptional(
+      507, "partitions",
+      std::make_shared<ListType>(SchemaField::MakeRequired(
+          508, std::string(ListType::kElementName),
+          std::make_shared<StructType>(PartitionFieldSummary::Type()))),
+      "Summary for each partition");
+  inline static const SchemaField kKeyMetadata =
+      SchemaField::MakeOptional(519, "key_metadata", std::make_shared<BinaryType>(),
+                                "Encryption key metadata blob");
+  inline static const SchemaField kFirstRowId = SchemaField::MakeOptional(
+      520, "first_row_id", std::make_shared<LongType>(),
+      "Starting row ID to assign to new rows in ADDED data files");
+
+  /// \brief Struct type made of the schema fields of a manifest list entry.
+  static const StructType& Type();
+};
+
+/// Snapshots are embedded in table metadata, but the list of manifests for a snapshot are
+/// stored in a separate manifest list file.
+///
+/// A new manifest list is written for each attempt to commit a snapshot because the list
+/// of manifests always changes to produce a new snapshot.
When a manifest list is
+/// written, the (optimistic) sequence number of the snapshot is written for all new
+/// manifest files tracked by the list.
+///
+/// A manifest list includes summary metadata that can be used to avoid scanning all of
+/// the manifests in a snapshot when planning a table scan. This includes the number of
+/// added, existing, and deleted files, and a summary of values for each field of the
+/// partition spec used to write the manifest.
+struct ICEBERG_EXPORT ManifestList {
+  /// Entries in a manifest list.
+  std::vector<ManifestFile> entries;
+};
+
+}  // namespace iceberg
diff --git a/src/iceberg/manifest_reader.h b/src/iceberg/manifest_reader.h
index a7350362d..6b81eb9b0 100644
--- a/src/iceberg/manifest_reader.h
+++ b/src/iceberg/manifest_reader.h
@@ -26,13 +26,17 @@
 #include
 
 #include "iceberg/file_reader.h"
+#include "iceberg/iceberg_export.h"
+#include "iceberg/type_fwd.h"
 
 namespace iceberg {
 
 /// \brief Read manifest entries from a manifest file.
 class ICEBERG_EXPORT ManifestReader {
  public:
+  /// Virtual so implementations can be destroyed through a ManifestReader pointer.
+  virtual ~ManifestReader() = default;
-  virtual Result>> Entries() const = 0;
+  virtual Result>> Entries() const = 0;
 
  private:
   std::unique_ptr reader_;
@@ -41,7 +45,9 @@ class ICEBERG_EXPORT ManifestReader {
 /// \brief Read manifest files from a manifest list file.
 class ICEBERG_EXPORT ManifestListReader {
  public:
+  /// Virtual so implementations can be destroyed through a ManifestListReader pointer.
+  virtual ~ManifestListReader() = default;
-  virtual Result>> Files() const = 0;
+  virtual Result>> Files() const = 0;
 
  private:
   std::unique_ptr reader_;
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h
index 39e46883e..9fc6bd6cb 100644
--- a/src/iceberg/type_fwd.h
+++ b/src/iceberg/type_fwd.h
@@ -121,4 +121,9 @@ class UpdateRequirement;
 class AppendFiles;
 class TableScan;
 
+struct DataFile;
+struct ManifestEntry;
+struct ManifestFile;
+struct ManifestList;
+
 }  // namespace iceberg