From ced1f26bf1256413bb08eeae63bc7231e3de2845 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Thu, 4 Sep 2025 19:21:42 +0800 Subject: [PATCH 01/17] feat: manifest writer and adapter impl part2 1 parse v1v2v3 manifest schema in adapter 2 convert ManifestFile&ManifestEntry into ArrowArray 3 add e2e case --- src/iceberg/CMakeLists.txt | 4 + src/iceberg/manifest_adapter.cc | 702 ++++++++++++++++++ src/iceberg/manifest_adapter.h | 81 +- src/iceberg/manifest_reader_internal.cc | 5 - src/iceberg/manifest_writer.cc | 101 ++- src/iceberg/manifest_writer.h | 19 +- src/iceberg/test/CMakeLists.txt | 4 +- ...cc => manifest_list_reader_writer_test.cc} | 83 ++- ...test.cc => manifest_reader_writer_test.cc} | 74 +- src/iceberg/type_fwd.h | 1 + src/iceberg/util/macros.h | 10 +- src/iceberg/v1_metadata.cc | 91 +++ src/iceberg/v1_metadata.h | 36 +- src/iceberg/v2_metadata.cc | 124 ++++ src/iceberg/v2_metadata.h | 44 +- src/iceberg/v3_metadata.cc | 176 +++++ src/iceberg/v3_metadata.h | 61 +- 17 files changed, 1476 insertions(+), 140 deletions(-) create mode 100644 src/iceberg/manifest_adapter.cc rename src/iceberg/test/{manifest_list_reader_test.cc => manifest_list_reader_writer_test.cc} (78%) rename src/iceberg/test/{manifest_reader_test.cc => manifest_reader_writer_test.cc} (75%) create mode 100644 src/iceberg/v1_metadata.cc create mode 100644 src/iceberg/v2_metadata.cc create mode 100644 src/iceberg/v3_metadata.cc diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 8c9d6a9fb..244b232c1 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -54,6 +54,10 @@ set(ICEBERG_SOURCES manifest_reader.cc manifest_reader_internal.cc manifest_writer.cc + manifest_adapter.cc + v1_metadata.cc + v2_metadata.cc + v3_metadata.cc arrow_c_data_guard_internal.cc util/conversions.cc util/decimal.cc diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc new file mode 100644 index 000000000..e595fc038 --- /dev/null +++ b/src/iceberg/manifest_adapter.cc @@ -0,0 +1,702 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/manifest_adapter.h" + +#include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" +#include "iceberg/schema.h" +#include "iceberg/schema_internal.h" +#include "iceberg/util/macros.h" +#include "nanoarrow/nanoarrow.h" + +namespace iceberg { + +Status ManifestAdapter::StartAppending() { + if (size_ > 0) { + return InvalidArgument("Adapter buffer not empty, cannot start appending."); + } + if (is_initialized_) { + // reset buffer + ArrowArrayRelease(&array_); + } + array_ = {}; + size_ = 0; + ArrowError error; + auto status = ArrowArrayInitFromSchema(&array_, &schema_, &error); + NANOARROW_RETURN_IF_NOT_OK(status, error); + status = ArrowArrayStartAppending(&array_); + NANOARROW_RETURN_IF_NOT_OK(status, error); + is_initialized_ = true; + return {}; +} + +Result ManifestAdapter::FinishAppending() { + ArrowError error; + auto status = ArrowArrayFinishBuildingDefault(&array_, &error); + NANOARROW_RETURN_IF_NOT_OK(status, error); + return array_; +} + +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, int64_t value) { + auto status = ArrowArrayAppendInt(arrowArray, value); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, uint64_t value) { + auto status = ArrowArrayAppendUInt(arrowArray, value); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, double value) { + auto status = ArrowArrayAppendDouble(arrowArray, value); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, std::string_view value) { + ArrowStringView view(value.data(), value.size()); + auto status = ArrowArrayAppendString(arrowArray, view); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, + const std::vector& value) { + ArrowBufferViewData data; + data.as_char = reinterpret_cast(value.data()); + ArrowBufferView view(data, value.size()); + auto status = ArrowArrayAppendBytes(arrowArray, view); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +ManifestEntryAdapter::~ManifestEntryAdapter() { + // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the + // internal array, so we have no need to release it here. + // if (is_initialized_) { + // ArrowArrayRelease(&array_); + // } +} + +std::shared_ptr ManifestEntryAdapter::GetManifestEntryStructType() { + return ManifestEntry::TypeFromPartitionType(std::move(partition_schema_)); +} + +Status ManifestEntryAdapter::AppendPartitions( + ArrowArray* arrow_array, const std::shared_ptr& partition_type, + const std::vector& partitions) { + if (arrow_array->n_children != partition_type->fields().size()) { + return InvalidManifest("Partition arrow not match partition type."); + } + auto fields = partition_type->fields(); + + for (const auto& partition : partitions) { + for (int32_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + auto array = arrow_array->children[i]; + if (partition.IsNull()) { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + continue; + } + switch (field.type()->type_id()) { + case TypeId::kBoolean: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast( + std::get(partition.value()) == true ? 1L : 0L))); + break; + case TypeId::kInt: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(std::get(partition.value())))); + break; + case TypeId::kLong: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kFloat: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(std::get(partition.value())))); + break; + case TypeId::kDouble: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kString: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kBinary: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get>(partition.value()))); + break; + case TypeId::kDate: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(std::get(partition.value())))); + break; + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kDecimal: + case TypeId::kUuid: + case TypeId::kFixed: + // TODO(xiao.dong) currently literal does not support those types + default: + return InvalidManifest("Unsupported partition type: {}", field.ToString()); + } + } + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, + const std::vector& list_value) { + auto list_array = arrow_array->children[0]; + for (const auto& value : list_value) { + auto status = ArrowArrayAppendInt(list_array, static_cast(value)); + NANOARROW_RETURN_IF_FAILED(status); + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, + const std::vector& list_value) { + auto list_array = arrow_array->children[0]; + for (const auto& value : list_value) { + auto status = ArrowArrayAppendInt(list_array, value); + NANOARROW_RETURN_IF_FAILED(status); + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestEntryAdapter::AppendMap(ArrowArray* arrow_array, + const std::map& map_value) { + auto map_array = arrow_array->children[0]; + if (map_array->n_children != 2) { + return InvalidManifest("Invalid map array."); + } + for (const auto& [key, value] : map_value) { + auto key_array = map_array->children[0]; + auto value_array = map_array->children[1]; + ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); + ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestEntryAdapter::AppendMap( + ArrowArray* arrow_array, const std::map>& map_value) { + auto map_array = arrow_array->children[0]; + if (map_array->n_children != 2) { + return InvalidManifest("Invalid map array."); + } + for (const auto& [key, value] : map_value) { + auto key_array = map_array->children[0]; + auto value_array = map_array->children[1]; + ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); + ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Status ManifestEntryAdapter::AppendDataFile( + ArrowArray* arrow_array, const std::shared_ptr& data_file_type, + const std::shared_ptr& file) { + auto fields = data_file_type->fields(); + for (int32_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + auto array = arrow_array->children[i]; + + switch (field.field_id()) { + case 134: // content (optional int32) + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(file->content))); + break; + case 100: // file_path (required string) + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->file_path)); + break; + case 101: // file_format (required string) + ICEBERG_RETURN_UNEXPECTED(AppendField(array, ToString(file->file_format))); + break; + case 102: // partition (required struct) + { + auto partition_type = std::dynamic_pointer_cast(field.type()); + ICEBERG_RETURN_UNEXPECTED( + AppendPartitions(array, partition_type, file->partition)); + } break; + case 103: // record_count (required int64) + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->record_count)); + break; + case 104: // file_size_in_bytes (required int64) + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->file_size_in_bytes)); + break; + case 105: // block_size_in_bytes (compatible in v1) + // always 64MB for v1 + static const int64_t kBlockSizeInBytes = 64 * 1024 * 1024L; + ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytes)); + break; + case 108: // column_sizes (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->column_sizes)); + break; + case 109: // value_counts (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->value_counts)); + break; + case 110: // null_value_counts (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->null_value_counts)); + break; + case 137: // nan_value_counts (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->nan_value_counts)); + break; + case 125: // lower_bounds (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->lower_bounds)); + break; + case 128: // upper_bounds (optional map) + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->upper_bounds)); + break; + case 131: // key_metadata (optional binary) + if (!file->key_metadata.empty()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->key_metadata)); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 132: // split_offsets (optional list) + ICEBERG_RETURN_UNEXPECTED(AppendList(array, file->split_offsets)); + break; + case 135: // equality_ids (optional list) + ICEBERG_RETURN_UNEXPECTED(AppendList(array, file->equality_ids)); + break; + case 140: // sort_order_id (optional int32) + if (file->sort_order_id.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(file->sort_order_id.value()))); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 142: // first_row_id (optional int64) + if (file->first_row_id.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->first_row_id.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 143: // referenced_data_file (optional string) + { + ICEBERG_ASSIGN_OR_RAISE(auto referenced_data_file, + GetWrappedReferenceDataFile(file)); + if (referenced_data_file.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, referenced_data_file.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + case 144: // content_offset (optional int64) + if (file->content_offset.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->content_offset.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 145: // content_size_in_bytes (optional int64) + if (file->content_size_in_bytes.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, file->content_size_in_bytes.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + default: + return InvalidManifest("Unknown data file field id: {} ", field.field_id()); + } + } + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Result> ManifestEntryAdapter::GetWrappedSequenceNumber( + const iceberg::ManifestEntry& entry) { + return entry.sequence_number; +} + +Result> ManifestEntryAdapter::GetWrappedReferenceDataFile( + const std::shared_ptr& file) { + return file->referenced_data_file; +} + +Result> ManifestEntryAdapter::GetWrappedFirstRowId( + const std::shared_ptr& file) { + return file->first_row_id; +} + +Result> ManifestEntryAdapter::GetWrappedContentOffset( + const std::shared_ptr& file) { + return file->content_offset; +} + +Result> ManifestEntryAdapter::GetWrappedContentSizeInBytes( + const std::shared_ptr& file) { + return file->content_size_in_bytes; +} + +Status ManifestEntryAdapter::AppendInternal(const iceberg::ManifestEntry& entry) { + const auto& fields = manifest_schema_->fields(); + for (int32_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + auto array = array_.children[i]; + + switch (field.field_id()) { + case 0: // status (required int32) + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(static_cast(entry.status)))); + break; + case 1: // snapshot_id (optional int64) + if (entry.snapshot_id.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, entry.snapshot_id.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 2: // data_file (required struct) + if (entry.data_file) { + // Get the data file type from the field + auto data_file_type = std::dynamic_pointer_cast(field.type()); + ICEBERG_RETURN_UNEXPECTED( + AppendDataFile(array, data_file_type, entry.data_file)); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 3: // sequence_number (optional int64) + { + ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetWrappedSequenceNumber(entry)); + if (sequence_num.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + case 4: // file_sequence_number (optional int64) + if (entry.file_sequence_number.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, entry.file_sequence_number.value())); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + default: + return InvalidManifest("Unknown manifest entry field id: {}", field.field_id()); + } + } + + auto status = ArrowArrayFinishElement(&array_); + NANOARROW_RETURN_IF_FAILED(status); + size_++; + return {}; +} + +Status ManifestEntryAdapter::InitSchema(const std::unordered_set& fields_ids) { + auto manifest_entry_schema = GetManifestEntryStructType(); + auto fields_span = manifest_entry_schema->fields(); + std::vector fields; + // TODO(xiao.dong) make this a common function to recursive handle + // all nested fields in schema + for (const auto& field : fields_span) { + if (field.field_id() == 2) { + // handle data_file field + auto data_file_struct = std::dynamic_pointer_cast(field.type()); + std::vector data_file_fields; + for (const auto& data_file_field : data_file_struct->fields()) { + if (fields_ids.contains(data_file_field.field_id())) { + data_file_fields.emplace_back(data_file_field); + } + } + auto type = std::make_shared(data_file_fields); + auto data_file_field = SchemaField::MakeRequired( + field.field_id(), std::string(field.name()), std::move(type)); + fields.emplace_back(std::move(data_file_field)); + } else { + if (fields_ids.contains(field.field_id())) { + fields.emplace_back(field); + } + } + } + manifest_schema_ = std::make_shared(fields); + ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_schema_, &schema_)); + return {}; +} + +ManifestFileAdapter::~ManifestFileAdapter() { + // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the + // internal array, so we have no need to release it here. + // if (is_initialized_) { + // ArrowArrayRelease(&array_); + // } +} + +Status ManifestFileAdapter::AppendPartitions( + ArrowArray* arrow_array, const std::shared_ptr& partition_type, + const std::vector& partitions) { + auto& array = arrow_array->children[0]; + if (array->n_children != 4) { + return InvalidManifestList("Invalid partition array."); + } + auto partition_struct = + std::dynamic_pointer_cast(partition_type->fields()[0].type()); + auto fields = partition_struct->fields(); + for (const auto& partition : partitions) { + for (const auto& field : fields) { + switch (field.field_id()) { + case 509: // contains_null (required bool) + ICEBERG_RETURN_UNEXPECTED( + AppendField(array->children[0], + static_cast(partition.contains_null ? 1 : 0))); + break; + case 518: // contains_nan (optional bool) + { + auto field_array = array->children[1]; + if (partition.contains_nan.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField( + field_array, + static_cast(partition.contains_nan.value() ? 1 : 0))); + } else { + auto status = ArrowArrayAppendNull(field_array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + case 510: // lower_bound (optional binary) + { + auto field_array = array->children[2]; + if (partition.lower_bound.has_value() && !partition.lower_bound->empty()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(field_array, partition.lower_bound.value())); + } else { + auto status = ArrowArrayAppendNull(field_array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + case 511: // upper_bound (optional binary) + { + auto field_array = array->children[3]; + if (partition.upper_bound.has_value() && !partition.upper_bound->empty()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(field_array, partition.upper_bound.value())); + } else { + auto status = ArrowArrayAppendNull(field_array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + default: + return InvalidManifestList("Unknown field id: {}", field.field_id()); + } + } + auto status = ArrowArrayFinishElement(array); + NANOARROW_RETURN_IF_FAILED(status); + } + + auto status = ArrowArrayFinishElement(arrow_array); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + +Result ManifestFileAdapter::GetWrappedSequenceNumber( + const iceberg::ManifestFile& file) { + return file.sequence_number; +} + +Result ManifestFileAdapter::GetWrappedMinSequenceNumber( + const iceberg::ManifestFile& file) { + return file.min_sequence_number; +} + +Result> ManifestFileAdapter::GetWrappedFirstRowId( + const iceberg::ManifestFile& file) { + return file.first_row_id; +} + +Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { + const auto& fields = manifest_list_schema_->fields(); + for (int32_t i = 0; i < fields.size(); i++) { + const auto& field = fields[i]; + auto array = array_.children[i]; + switch (field.field_id()) { + case 500: // manifest_path + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.manifest_path)); + break; + case 501: // manifest_length + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.manifest_length)); + break; + case 502: // partition_spec_id + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(file.partition_spec_id))); + break; + case 517: // content + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(static_cast(file.content)))); + break; + case 515: // sequence_number + { + ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetWrappedSequenceNumber(file)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num)); + break; + } + case 516: // min_sequence_number + { + ICEBERG_ASSIGN_OR_RAISE(auto min_sequence_num, GetWrappedMinSequenceNumber(file)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, min_sequence_num)); + break; + } + case 503: // added_snapshot_id + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.added_snapshot_id)); + break; + case 504: // added_files_count + if (file.added_files_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(file.added_files_count.value()))); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 505: // existing_files_count + if (file.existing_files_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(file.existing_files_count.value()))); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 506: // deleted_files_count + if (file.deleted_files_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(file.deleted_files_count.value()))); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 512: // added_rows_count + if (file.added_rows_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.added_rows_count.value())); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 513: // existing_rows_count + if (file.existing_rows_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.existing_rows_count.value())); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 514: // deleted_rows_count + if (file.deleted_rows_count.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.deleted_rows_count.value())); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 507: // partitions + ICEBERG_RETURN_UNEXPECTED(AppendPartitions( + array, std::dynamic_pointer_cast(field.type()), file.partitions)); + break; + case 519: // key_metadata + if (!file.key_metadata.empty()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); + } else { + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + case 520: // first_row_id + { + ICEBERG_ASSIGN_OR_RAISE(auto first_row_id, GetWrappedFirstRowId(file)); + if (first_row_id.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, first_row_id.value())); + } else { + // Append null for optional field + auto status = ArrowArrayAppendNull(array, 1); + NANOARROW_RETURN_IF_FAILED(status); + } + break; + } + default: + return InvalidManifestList("Unknown field id: {}", field.field_id()); + } + } + auto status = ArrowArrayFinishElement(&array_); + NANOARROW_RETURN_IF_FAILED(status); + size_++; + return {}; +} + +Status ManifestFileAdapter::InitSchema(const std::unordered_set& fields_ids) { + std::vector fields; + for (const auto& field : ManifestFile::Type().fields()) { + if (fields_ids.contains(field.field_id())) { + fields.emplace_back(field); + } + } + manifest_list_schema_ = std::make_shared(fields); + ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_list_schema_, &schema_)); + return {}; +} + +} // namespace iceberg diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 2ffe51be3..0577ad326 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -22,6 +22,11 @@ /// \file iceberg/metadata_adapter.h /// Base class of adapter for v1v2v3v4 metadata. +#include +#include +#include +#include + #include "iceberg/arrow_c_data.h" #include "iceberg/result.h" #include "iceberg/type_fwd.h" @@ -33,13 +38,24 @@ class ICEBERG_EXPORT ManifestAdapter { public: ManifestAdapter() = default; virtual ~ManifestAdapter() = default; + virtual Status Init() = 0; - virtual Status StartAppending() = 0; - virtual Result FinishAppending() = 0; + Status StartAppending(); + Result FinishAppending(); int64_t size() const { return size_; } + virtual std::shared_ptr schema() const = 0; + + protected: + Status AppendField(ArrowArray* arrowArray, int64_t value); + Status AppendField(ArrowArray* arrowArray, uint64_t value); + Status AppendField(ArrowArray* arrowArray, double value); + Status AppendField(ArrowArray* arrowArray, std::string_view value); + Status AppendField(ArrowArray* arrowArray, const std::vector& value); protected: + bool is_initialized_ = false; ArrowArray array_; + ArrowSchema schema_; // converted from manifest_schema_ or manifest_list_schema_ int64_t size_ = 0; }; @@ -47,10 +63,48 @@ class ICEBERG_EXPORT ManifestAdapter { // append a list of `ManifestEntry`s to an `ArrowArray`. class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { public: - ManifestEntryAdapter() = default; - ~ManifestEntryAdapter() override = default; + explicit ManifestEntryAdapter(std::shared_ptr partition_schema, + std::shared_ptr partition_spec) + : partition_spec_(std::move(partition_spec)), + partition_schema_(std::move(partition_schema)) {} + ~ManifestEntryAdapter() override; virtual Status Append(const ManifestEntry& entry) = 0; + + std::shared_ptr schema() const override { return manifest_schema_; } + + protected: + virtual std::shared_ptr GetManifestEntryStructType(); + Status InitSchema(const std::unordered_set& fields_ids); + Status AppendInternal(const ManifestEntry& entry); + Status AppendDataFile(ArrowArray* arrow_array, + const std::shared_ptr& data_file_type, + const std::shared_ptr& file); + Status AppendPartitions(ArrowArray* arrow_array, + const std::shared_ptr& partition_type, + const std::vector& partitions); + Status AppendList(ArrowArray* arrow_array, const std::vector& list_value); + Status AppendList(ArrowArray* arrow_array, const std::vector& list_value); + Status AppendMap(ArrowArray* arrow_array, const std::map& map_value); + Status AppendMap(ArrowArray* arrow_array, + const std::map>& map_value); + + virtual Result> GetWrappedSequenceNumber( + const ManifestEntry& entry); + virtual Result> GetWrappedReferenceDataFile( + const std::shared_ptr& file); + virtual Result> GetWrappedFirstRowId( + const std::shared_ptr& file); + virtual Result> GetWrappedContentOffset( + const std::shared_ptr& file); + virtual Result> GetWrappedContentSizeInBytes( + const std::shared_ptr& file); + + protected: + std::shared_ptr partition_spec_; + std::shared_ptr partition_schema_; + std::shared_ptr manifest_schema_; + std::unordered_map metadata_; }; // \brief Implemented by different versions with different schemas to @@ -58,9 +112,26 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { public: ManifestFileAdapter() = default; - ~ManifestFileAdapter() override = default; + ~ManifestFileAdapter() override; virtual Status Append(const ManifestFile& file) = 0; + + std::shared_ptr schema() const override { return manifest_list_schema_; } + + protected: + Status InitSchema(const std::unordered_set& fields_ids); + Status AppendInternal(const ManifestFile& file); + Status AppendPartitions(ArrowArray* arrow_array, + const std::shared_ptr& partition_type, + const std::vector& partitions); + + virtual Result GetWrappedSequenceNumber(const ManifestFile& file); + virtual Result GetWrappedMinSequenceNumber(const ManifestFile& file); + virtual Result> GetWrappedFirstRowId(const ManifestFile& file); + + protected: + std::shared_ptr manifest_list_schema_; + std::unordered_map metadata_; }; } // namespace iceberg diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index feff82f1b..82eeb3476 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -32,11 +32,6 @@ namespace iceberg { -#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error: {}", error.message); \ - } - #define PARSE_PRIMITIVE_FIELD(item, array_view, type) \ for (int64_t row_idx = 0; row_idx < array_view->length; row_idx++) { \ if (!ArrowArrayViewIsNull(array_view, row_idx)) { \ diff --git a/src/iceberg/manifest_writer.cc b/src/iceberg/manifest_writer.cc index 27fd3f767..02fbd3269 100644 --- a/src/iceberg/manifest_writer.cc +++ b/src/iceberg/manifest_writer.cc @@ -50,7 +50,7 @@ Status ManifestWriter::Close() { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); } - return {}; + return writer_->Close(); } Result> OpenFileWriter(std::string_view location, @@ -66,48 +66,44 @@ Result> OpenFileWriter(std::string_view location, Result> ManifestWriter::MakeV1Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema) { - // TODO(xiao.dong) parse v1 schema - auto manifest_entry_schema = - ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); - auto fields_span = manifest_entry_schema->fields(); - std::vector fields(fields_span.begin(), fields_span.end()); - auto schema = std::make_shared(fields); - ICEBERG_ASSIGN_OR_RAISE(auto writer, - OpenFileWriter(manifest_location, schema, std::move(file_io))); - auto adapter = std::make_unique(snapshot_id, std::move(schema)); + std::shared_ptr file_io, std::shared_ptr partition_schema, + std::shared_ptr partition_spec) { + auto adapter = std::make_unique( + snapshot_id, std::move(partition_schema), std::move(partition_spec)); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } Result> ManifestWriter::MakeV2Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema) { - // TODO(xiao.dong) parse v2 schema - auto manifest_entry_schema = - ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); - auto fields_span = manifest_entry_schema->fields(); - std::vector fields(fields_span.begin(), fields_span.end()); - auto schema = std::make_shared(fields); - ICEBERG_ASSIGN_OR_RAISE(auto writer, - OpenFileWriter(manifest_location, schema, std::move(file_io))); - auto adapter = std::make_unique(snapshot_id, std::move(schema)); + std::shared_ptr file_io, std::shared_ptr partition_schema, + std::shared_ptr partition_spec) { + auto adapter = std::make_unique( + snapshot_id, std::move(partition_schema), std::move(partition_spec)); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } Result> ManifestWriter::MakeV3Writer( std::optional snapshot_id, std::optional first_row_id, std::string_view manifest_location, std::shared_ptr file_io, - std::shared_ptr partition_schema) { - // TODO(xiao.dong) parse v3 schema - auto manifest_entry_schema = - ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); - auto fields_span = manifest_entry_schema->fields(); - std::vector fields(fields_span.begin(), fields_span.end()); - auto schema = std::make_shared(fields); - ICEBERG_ASSIGN_OR_RAISE(auto writer, - OpenFileWriter(manifest_location, schema, std::move(file_io))); - auto adapter = std::make_unique(snapshot_id, first_row_id, - std::move(schema)); + std::shared_ptr partition_schema, + std::shared_ptr partition_spec) { + auto adapter = std::make_unique( + snapshot_id, first_row_id, std::move(partition_schema), std::move(partition_spec)); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + ICEBERG_ASSIGN_OR_RAISE( + auto writer, + OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -132,20 +128,18 @@ Status ManifestListWriter::Close() { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); } - return {}; + return writer_->Close(); } Result> ManifestListWriter::MakeV1Writer( int64_t snapshot_id, std::optional parent_snapshot_id, std::string_view manifest_list_location, std::shared_ptr file_io) { - // TODO(xiao.dong) parse v1 schema - std::vector fields(ManifestFile::Type().fields().begin(), - ManifestFile::Type().fields().end()); - auto schema = std::make_shared(fields); + auto adapter = std::make_unique(snapshot_id, parent_snapshot_id); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( - auto writer, OpenFileWriter(manifest_list_location, schema, std::move(file_io))); - auto adapter = std::make_unique(snapshot_id, parent_snapshot_id, - std::move(schema)); + auto writer, + OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -153,14 +147,14 @@ Result> ManifestListWriter::MakeV2Writer( int64_t snapshot_id, std::optional parent_snapshot_id, int64_t sequence_number, std::string_view manifest_list_location, std::shared_ptr file_io) { - // TODO(xiao.dong) parse v2 schema - std::vector fields(ManifestFile::Type().fields().begin(), - ManifestFile::Type().fields().end()); - auto schema = std::make_shared(fields); + auto adapter = std::make_unique(snapshot_id, parent_snapshot_id, + sequence_number); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( - auto writer, OpenFileWriter(manifest_list_location, schema, std::move(file_io))); - auto adapter = std::make_unique( - snapshot_id, parent_snapshot_id, sequence_number, std::move(schema)); + auto writer, + OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); + return std::make_unique(std::move(writer), std::move(adapter)); } @@ -168,14 +162,13 @@ Result> ManifestListWriter::MakeV3Writer( int64_t snapshot_id, std::optional parent_snapshot_id, int64_t sequence_number, std::optional first_row_id, std::string_view manifest_list_location, std::shared_ptr file_io) { - // TODO(xiao.dong) parse v3 schema - std::vector fields(ManifestFile::Type().fields().begin(), - ManifestFile::Type().fields().end()); - auto schema = std::make_shared(fields); + auto adapter = std::make_unique(snapshot_id, parent_snapshot_id, + sequence_number, first_row_id); + ICEBERG_RETURN_UNEXPECTED(adapter->Init()); + ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( - auto writer, OpenFileWriter(manifest_list_location, schema, std::move(file_io))); - auto adapter = std::make_unique( - snapshot_id, parent_snapshot_id, sequence_number, first_row_id, std::move(schema)); + auto writer, + OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } diff --git a/src/iceberg/manifest_writer.h b/src/iceberg/manifest_writer.h index c9ec30702..a9403b732 100644 --- a/src/iceberg/manifest_writer.h +++ b/src/iceberg/manifest_writer.h @@ -39,7 +39,7 @@ class ICEBERG_EXPORT ManifestWriter { std::unique_ptr adapter) : writer_(std::move(writer)), adapter_(std::move(adapter)) {} - virtual ~ManifestWriter() = default; + ~ManifestWriter() = default; /// \brief Write manifest entry to file. /// \param entry Manifest entry to write. @@ -58,30 +58,39 @@ class ICEBERG_EXPORT ManifestWriter { /// \param snapshot_id ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. + /// \param partition_schema Schema of the partition columns. + /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV1Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema); + std::shared_ptr file_io, std::shared_ptr partition_schema, + std::shared_ptr partition_spec); /// \brief Creates a writer for a manifest file. /// \param snapshot_id ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. + /// \param partition_schema Schema of the partition columns. + /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV2Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema); + std::shared_ptr file_io, std::shared_ptr partition_schema, + std::shared_ptr partition_spec); /// \brief Creates a writer for a manifest file. /// \param snapshot_id ID of the snapshot. /// \param first_row_id First row ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. + /// \param partition_schema Schema of the partition columns. + /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV3Writer( std::optional snapshot_id, std::optional first_row_id, std::string_view manifest_location, std::shared_ptr file_io, - std::shared_ptr partition_schema); + std::shared_ptr partition_schema, + std::shared_ptr partition_spec); private: static constexpr int64_t kBatchSize = 1024; @@ -96,7 +105,7 @@ class ICEBERG_EXPORT ManifestListWriter { std::unique_ptr adapter) : writer_(std::move(writer)), adapter_(std::move(adapter)) {} - virtual ~ManifestListWriter() = default; + ~ManifestListWriter() = default; /// \brief Write manifest file to manifest list file. /// \param file Manifest file to write. diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt index e687f97cb..7c62a2a55 100644 --- a/src/iceberg/test/CMakeLists.txt +++ b/src/iceberg/test/CMakeLists.txt @@ -117,8 +117,8 @@ if(ICEBERG_BUILD_BUNDLE) avro_test.cc avro_schema_test.cc avro_stream_test.cc - manifest_list_reader_test.cc - manifest_reader_test.cc + manifest_list_reader_writer_test.cc + manifest_reader_writer_test.cc test_common.cc) add_iceberg_test(arrow_test diff --git a/src/iceberg/test/manifest_list_reader_test.cc b/src/iceberg/test/manifest_list_reader_writer_test.cc similarity index 78% rename from src/iceberg/test/manifest_list_reader_test.cc rename to src/iceberg/test/manifest_list_reader_writer_test.cc index 9fd6e4c16..da91d5415 100644 --- a/src/iceberg/test/manifest_list_reader_test.cc +++ b/src/iceberg/test/manifest_list_reader_writer_test.cc @@ -26,12 +26,14 @@ #include "iceberg/expression/literal.h" #include "iceberg/manifest_list.h" #include "iceberg/manifest_reader.h" +#include "iceberg/manifest_writer.h" +#include "matchers.h" #include "temp_file_test_base.h" #include "test_common.h" namespace iceberg { -class ManifestListReaderTestBase : public TempFileTestBase { +class ManifestListReaderWriterTestBase : public TempFileTestBase { protected: static void SetUpTestSuite() { avro::RegisterAll(); } @@ -44,6 +46,11 @@ class ManifestListReaderTestBase : public TempFileTestBase { void TestManifestListReading(const std::string& resource_name, const std::vector& expected_manifest_list) { std::string path = GetResourcePath(resource_name); + TestManifestListReadingByPath(path, expected_manifest_list); + } + + void TestManifestListReadingByPath( + const std::string& path, const std::vector& expected_manifest_list) { auto manifest_reader_result = ManifestListReader::Make(path, file_io_); ASSERT_EQ(manifest_reader_result.has_value(), true); @@ -66,7 +73,7 @@ class ManifestListReaderTestBase : public TempFileTestBase { std::shared_ptr file_io_; }; -class ManifestListReaderV1Test : public ManifestListReaderTestBase { +class ManifestListReaderWriterV1Test : public ManifestListReaderWriterTestBase { protected: std::vector PreparePartitionedTestData() { std::vector paths = { @@ -202,9 +209,20 @@ class ManifestListReaderV1Test : public ManifestListReaderTestBase { .lower_bound = lower_bounds[3], .upper_bound = upper_bounds[3]}}}}; } + + void TestWriteManifestList(const std::string& manifest_list_path, + const std::vector& manifest_files) { + auto result = ManifestListWriter::MakeV1Writer(1, 0, manifest_list_path, file_io_); + ASSERT_TRUE(result.has_value()) << result.error().message; + auto writer = std::move(result.value()); + auto status = writer->AddAll(manifest_files); + EXPECT_THAT(status, IsOk()); + status = writer->Close(); + EXPECT_THAT(status, IsOk()); + } }; -class ManifestListReaderV2Test : public ManifestListReaderTestBase { +class ManifestListReaderWriterV2Test : public ManifestListReaderWriterTestBase { protected: std::vector PreparePartitionedTestData() { std::vector manifest_files; @@ -280,39 +298,71 @@ class ManifestListReaderV2Test : public ManifestListReaderTestBase { } return manifest_files; } + + void TestWriteManifestList(const std::string& manifest_list_path, + const std::vector& manifest_files) { + auto result = ManifestListWriter::MakeV2Writer(1, 0, 4, manifest_list_path, file_io_); + ASSERT_TRUE(result.has_value()) << result.error().message; + auto writer = std::move(result.value()); + auto status = writer->AddAll(manifest_files); + EXPECT_THAT(status, IsOk()); + status = writer->Close(); + EXPECT_THAT(status, IsOk()); + } }; // V1 Tests -TEST_F(ManifestListReaderV1Test, PartitionedTest) { +TEST_F(ManifestListReaderWriterV1Test, PartitionedTest) { auto expected_manifest_list = PreparePartitionedTestData(); TestManifestListReading( "snap-7532614258660258098-1-eafd2972-f58e-4185-9237-6378f564787e.avro", expected_manifest_list); } -TEST_F(ManifestListReaderV1Test, ComplexTypeTest) { +TEST_F(ManifestListReaderWriterV1Test, ComplexTypeTest) { auto expected_manifest_list = PrepareComplexTypeTestData(); TestManifestListReading( "snap-4134160420377642835-1-aeffe099-3bac-4011-bc17-5875210d8dc0.avro", expected_manifest_list); } -TEST_F(ManifestListReaderV1Test, ComplexPartitionedTest) { +TEST_F(ManifestListReaderWriterV1Test, ComplexPartitionedTest) { auto expected_manifest_list = PrepareComplexPartitionedTestData(); TestManifestListReading( "snap-7522296285847100621-1-5d690750-8fb4-4cd1-8ae7-85c7b39abe14.avro", expected_manifest_list); } +TEST_F(ManifestListReaderWriterV1Test, WritePartitionedTest) { + auto expected_manifest_list = PreparePartitionedTestData(); + auto write_manifest_list_path = CreateNewTempFilePath(); + TestWriteManifestList(write_manifest_list_path, expected_manifest_list); + TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); +} + +TEST_F(ManifestListReaderWriterV1Test, WriteComplexTypeTest) { + auto expected_manifest_list = PrepareComplexTypeTestData(); + auto write_manifest_list_path = CreateNewTempFilePath(); + TestWriteManifestList(write_manifest_list_path, expected_manifest_list); + TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); +} + +TEST_F(ManifestListReaderWriterV1Test, WriteComplexPartitionedTest) { + auto expected_manifest_list = PrepareComplexPartitionedTestData(); + auto write_manifest_list_path = CreateNewTempFilePath(); + TestWriteManifestList(write_manifest_list_path, expected_manifest_list); + TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); +} + // V2 Tests -TEST_F(ManifestListReaderV2Test, PartitionedTest) { +TEST_F(ManifestListReaderWriterV2Test, PartitionedTest) { auto expected_manifest_list = PreparePartitionedTestData(); TestManifestListReading( "snap-7412193043800610213-1-2bccd69e-d642-4816-bba0-261cd9bd0d93.avro", expected_manifest_list); } -TEST_F(ManifestListReaderV2Test, NonPartitionedTest) { +TEST_F(ManifestListReaderWriterV2Test, NonPartitionedTest) { auto expected_manifest_list = PrepareNonPartitionedTestData(); TestManifestListReading( "snap-251167482216575399-1-ccb6dbcb-0611-48da-be68-bd506ea63188.avro", @@ -322,4 +372,21 @@ TEST_F(ManifestListReaderV2Test, NonPartitionedTest) { TestNonPartitionedManifests(expected_manifest_list); } +TEST_F(ManifestListReaderWriterV2Test, WritePartitionedTest) { + auto expected_manifest_list = PreparePartitionedTestData(); + auto write_manifest_list_path = CreateNewTempFilePath(); + TestWriteManifestList(write_manifest_list_path, expected_manifest_list); + TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); +} + +TEST_F(ManifestListReaderWriterV2Test, WriteNonPartitionedTest) { + auto expected_manifest_list = PrepareNonPartitionedTestData(); + auto write_manifest_list_path = CreateNewTempFilePath(); + TestWriteManifestList(write_manifest_list_path, expected_manifest_list); + TestManifestListReadingByPath(write_manifest_list_path, expected_manifest_list); + + // Additional verification: ensure all manifests are truly non-partitioned + TestNonPartitionedManifests(expected_manifest_list); +} + } // namespace iceberg diff --git a/src/iceberg/test/manifest_reader_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc similarity index 75% rename from src/iceberg/test/manifest_reader_test.cc rename to src/iceberg/test/manifest_reader_writer_test.cc index 7381b2981..54165455f 100644 --- a/src/iceberg/test/manifest_reader_test.cc +++ b/src/iceberg/test/manifest_reader_writer_test.cc @@ -17,8 +17,6 @@ * under the License. */ -#include "iceberg/manifest_reader.h" - #include #include @@ -28,7 +26,12 @@ #include "iceberg/avro/avro_register.h" #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" +#include "iceberg/manifest_reader.h" +#include "iceberg/manifest_writer.h" +#include "iceberg/partition_spec.h" #include "iceberg/schema.h" +#include "iceberg/transform.h" +#include "matchers.h" #include "temp_file_test_base.h" #include "test_common.h" @@ -48,6 +51,12 @@ class ManifestReaderTestBase : public TempFileTestBase { const std::vector& expected_entries, std::shared_ptr partition_schema = nullptr) { std::string path = GetResourcePath(resource_name); + TestManifestReadingByPath(path, expected_entries, partition_schema); + } + + void TestManifestReadingByPath(const std::string& path, + const std::vector& expected_entries, + std::shared_ptr partition_schema = nullptr) { auto manifest_reader_result = ManifestReader::Make(path, file_io_, partition_schema); ASSERT_TRUE(manifest_reader_result.has_value()) << manifest_reader_result.error().message; @@ -142,6 +151,22 @@ class ManifestReaderV1Test : public ManifestReaderTestBase { } return manifest_entries; } + + void TestWriteManifest(const std::string& manifest_list_path, + std::shared_ptr partition_schema, + std::shared_ptr partition_spec, + const std::vector& manifest_entries) { + std::cout << "Writing manifest list to " << manifest_list_path << std::endl; + auto result = ManifestWriter::MakeV1Writer(1, manifest_list_path, file_io_, + std::move(partition_schema), + std::move(partition_spec)); + ASSERT_TRUE(result.has_value()) << result.error().message; + auto writer = std::move(result.value()); + auto status = writer->AddAll(manifest_entries); + EXPECT_THAT(status, IsOk()); + status = writer->Close(); + EXPECT_THAT(status, IsOk()); + } }; TEST_F(ManifestReaderV1Test, PartitionedTest) { @@ -153,6 +178,21 @@ TEST_F(ManifestReaderV1Test, PartitionedTest) { partition_schema); } +TEST_F(ManifestReaderV1Test, WritePartitionedTest) { + iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); + auto partition_schema = + std::make_shared(std::vector({partition_field})); + auto identity_transform = Transform::Identity(); + std::vector fields{ + PartitionField(1, 1000, "order_ts_hour", identity_transform)}; + auto partition_spec = std::make_shared(partition_schema, 1, fields); + + auto expected_entries = PreparePartitionedTestData(); + auto write_manifest_path = CreateNewTempFilePath(); + TestWriteManifest(write_manifest_path, partition_schema, partition_spec, + expected_entries); +} + class ManifestReaderV2Test : public ManifestReaderTestBase { protected: std::vector CreateV2TestData( @@ -218,6 +258,22 @@ class ManifestReaderV2Test : public ManifestReaderTestBase { std::vector PrepareMetadataInheritanceTestData() { return CreateV2TestData(/*sequence_number=*/15, /*partition_spec_id*/ 12); } + + void TestWriteManifest(int64_t snapshot_id, const std::string& manifest_list_path, + std::shared_ptr partition_schema, + std::shared_ptr partition_spec, + const std::vector& manifest_entries) { + std::cout << "Writing manifest list to " << manifest_list_path << std::endl; + auto result = ManifestWriter::MakeV2Writer(snapshot_id, manifest_list_path, file_io_, + std::move(partition_schema), + std::move(partition_spec)); + ASSERT_TRUE(result.has_value()) << result.error().message; + auto writer = std::move(result.value()); + auto status = writer->AddAll(manifest_entries); + EXPECT_THAT(status, IsOk()); + status = writer->Close(); + EXPECT_THAT(status, IsOk()); + } }; TEST_F(ManifestReaderV2Test, NonPartitionedTest) { @@ -239,4 +295,18 @@ TEST_F(ManifestReaderV2Test, MetadataInheritanceTest) { TestManifestReadingWithManifestFile(manifest_file, expected_entries); } +TEST_F(ManifestReaderV2Test, WriteNonPartitionedTest) { + auto expected_entries = PrepareNonPartitionedTestData(); + auto write_manifest_path = CreateNewTempFilePath(); + TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, nullptr, + expected_entries); +} + +TEST_F(ManifestReaderV2Test, WriteInheritancePartitionedTest) { + auto expected_entries = PrepareMetadataInheritanceTestData(); + auto write_manifest_path = CreateNewTempFilePath(); + TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, nullptr, + expected_entries); +} + } // namespace iceberg diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h index 41061d399..bdc5c1e35 100644 --- a/src/iceberg/type_fwd.h +++ b/src/iceberg/type_fwd.h @@ -128,6 +128,7 @@ struct DataFile; struct ManifestEntry; struct ManifestFile; struct ManifestList; +struct PartitionFieldSummary; class ManifestListReader; class ManifestListWriter; diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index 278035d3f..5e5e3de86 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -39,4 +39,12 @@ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) -#define ICEBERG_DCHECK(expr, message) assert((expr) && (message)) +#define NANOARROW_RETURN_IF_FAILED(status) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error code: {}", status); \ + } + +#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error msg: {}", error.message); \ + } diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc new file mode 100644 index 000000000..2fda59100 --- /dev/null +++ b/src/iceberg/v1_metadata.cc @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/v1_metadata.h" + +#include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" + +namespace iceberg { + +Status ManifestEntryAdapterV1::Init() { + static std::unordered_set compatible_fields{ + 0, 1, 2, 100, 101, 102, 103, 104, 105, 108, 109, 110, 137, 125, 128, 131, 132, 140, + }; + // TODO(xiao.dong) schema to json + metadata_["schema"] = "{}"; + // TODO(xiao.dong) partition spec to json + metadata_["partition-spec"] = "{}"; + if (partition_spec_ != nullptr) { + metadata_["partition-spec-id"] = std::to_string(partition_spec_->spec_id()); + } + metadata_["format-version"] = "1"; + return InitSchema(compatible_fields); +} + +Status ManifestEntryAdapterV1::Append(const iceberg::ManifestEntry& entry) { + return AppendInternal(entry); +} + +std::shared_ptr ManifestEntryAdapterV1::GetManifestEntryStructType() { + // V1 compatible fields. Official suggestions: + // Deprecated. Always write a default in v1. Do not write in v2 or v3. + static const SchemaField kBlockSizeInBytes = SchemaField::MakeRequired( + 105, "block_size_in_bytes", iceberg::int64(), "Block size in bytes"); + std::shared_ptr partition_type = partition_schema_; + if (!partition_type) { + partition_type = PartitionSpec::Unpartitioned()->schema(); + } + auto datafile_type = std::make_shared(std::vector{ + DataFile::kFilePath, DataFile::kFileFormat, + SchemaField::MakeRequired(102, DataFile::kPartitionField, + std::move(partition_type)), + DataFile::kRecordCount, DataFile::kFileSize, kBlockSizeInBytes, + DataFile::kColumnSizes, DataFile::kValueCounts, DataFile::kNullValueCounts, + DataFile::kNanValueCounts, DataFile::kLowerBounds, DataFile::kUpperBounds, + DataFile::kKeyMetadata, DataFile::kSplitOffsets, DataFile::kSortOrderId}); + + return std::make_shared( + std::vector{ManifestEntry::kStatus, ManifestEntry::kSnapshotId, + SchemaField::MakeRequired(2, ManifestEntry::kDataFileField, + std::move(datafile_type))}); +} + +Status ManifestFileAdapterV1::Init() { + static std::unordered_set compatible_fields{ + 500, 501, 502, 503, 504, 505, 506, 512, 513, 514, 507, 519, + }; + metadata_["snapshot-id"] = std::to_string(snapshot_id_); + metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() + ? std::to_string(parent_snapshot_id_.value()) + : "null"; + metadata_["format-version"] = "1"; + return InitSchema(compatible_fields); +} + +Status ManifestFileAdapterV1::Append(const iceberg::ManifestFile& file) { + if (file.content != ManifestFile::Content::kData) { + return InvalidManifestList("Cannot store delete manifests in a v1 table"); + } + return AppendInternal(file); +} + +} // namespace iceberg diff --git a/src/iceberg/v1_metadata.h b/src/iceberg/v1_metadata.h index 7e91da7b3..ea31993df 100644 --- a/src/iceberg/v1_metadata.h +++ b/src/iceberg/v1_metadata.h @@ -20,9 +20,6 @@ #pragma once /// \file iceberg/v1_metadata.h - -#include - #include "iceberg/manifest_adapter.h" namespace iceberg { @@ -31,32 +28,31 @@ namespace iceberg { class ManifestEntryAdapterV1 : public ManifestEntryAdapter { public: ManifestEntryAdapterV1(std::optional snapshot_id, - std::shared_ptr schema) { - // TODO(xiao.dong): init v1 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestEntry& entry) override { return {}; } - Result FinishAppending() override { return {}; } + std::shared_ptr partition_schema, + std::shared_ptr partition_spec) + : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), + snapshot_id_(snapshot_id) {} + Status Init() override; + Status Append(const ManifestEntry& entry) override; + + protected: + std::shared_ptr GetManifestEntryStructType() override; private: - std::shared_ptr manifest_schema_; - ArrowSchema schema_; // converted from manifest_schema_ + std::optional snapshot_id_; }; /// \brief Adapter to convert V1 ManifestFile to `ArrowArray`. class ManifestFileAdapterV1 : public ManifestFileAdapter { public: - ManifestFileAdapterV1(int64_t snapshot_id, std::optional parent_snapshot_id, - std::shared_ptr schema) { - // TODO(xiao.dong): init v1 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestFile& file) override { return {}; } - Result FinishAppending() override { return {}; } + ManifestFileAdapterV1(int64_t snapshot_id, std::optional parent_snapshot_id) + : snapshot_id_(snapshot_id), parent_snapshot_id_(parent_snapshot_id) {} + Status Init() override; + Status Append(const ManifestFile& file) override; private: - std::shared_ptr manifest_list_schema_; - ArrowSchema schema_; // converted from manifest_list_schema_ + int64_t snapshot_id_; + std::optional parent_snapshot_id_; }; } // namespace iceberg diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc new file mode 100644 index 000000000..54b74f1ee --- /dev/null +++ b/src/iceberg/v2_metadata.cc @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/v2_metadata.h" + +#include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" +#include "iceberg/schema.h" + +namespace iceberg { + +Status ManifestEntryAdapterV2::Init() { + static std::unordered_set compatible_fields{ + 0, 1, 2, 3, 4, 134, 100, 101, 102, 103, 104, + 108, 109, 110, 137, 125, 128, 131, 132, 135, 140, 143, + }; + // TODO(xiao.dong) schema to json + metadata_["schema"] = "{}"; + // TODO(xiao.dong) partition spec to json + metadata_["partition-spec"] = "{}"; + if (partition_spec_ != nullptr) { + metadata_["partition-spec-id"] = std::to_string(partition_spec_->spec_id()); + } + metadata_["format-version"] = "2"; + metadata_["content"] = "data"; + return InitSchema(compatible_fields); +} + +Status ManifestEntryAdapterV2::Append(const iceberg::ManifestEntry& entry) { + return AppendInternal(entry); +} + +Result> ManifestEntryAdapterV2::GetWrappedSequenceNumber( + const iceberg::ManifestEntry& entry) { + if (!entry.sequence_number.has_value()) { + // if the entry's data sequence number is null, + // then it will inherit the sequence number of the current commit. + // to validate that this is correct, check that the snapshot id is either null (will + // also be inherited) or that it matches the id of the current commit. + if (entry.snapshot_id.has_value() && entry.snapshot_id.value() != snapshot_id_) { + return InvalidManifest( + "Found unassigned sequence number for an entry from snapshot: {}", + entry.snapshot_id.value()); + } + + // inheritance should work only for ADDED entries + if (entry.status != ManifestStatus::kAdded) { + return InvalidManifest( + "Only entries with status ADDED can have null sequence number"); + } + + return std::nullopt; + } + return entry.sequence_number; +} + +Result> ManifestEntryAdapterV2::GetWrappedReferenceDataFile( + const std::shared_ptr& file) { + if (file->content == DataFile::Content::kPositionDeletes) { + return file->referenced_data_file; + } + return std::nullopt; +} + +Status ManifestFileAdapterV2::Init() { + static std::unordered_set compatible_fields{ + 500, 501, 502, 517, 515, 516, 503, 504, 505, 506, 512, 513, 514, 507, 519, + }; + metadata_["snapshot-id"] = std::to_string(snapshot_id_); + metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() + ? std::to_string(parent_snapshot_id_.value()) + : "null"; + metadata_["sequence-number"] = std::to_string(sequence_number_); + metadata_["format-version"] = "2"; + return InitSchema(compatible_fields); +} + +Status ManifestFileAdapterV2::Append(const iceberg::ManifestFile& file) { + return AppendInternal(file); +} + +Result ManifestFileAdapterV2::GetWrappedSequenceNumber( + const iceberg::ManifestFile& file) { + if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { + if (snapshot_id_ != file.added_snapshot_id) { + return InvalidManifestList( + "Found unassigned sequence number for a manifest from snapshot: %s", + file.added_snapshot_id); + } + return sequence_number_; + } + return file.sequence_number; +} + +Result ManifestFileAdapterV2::GetWrappedMinSequenceNumber( + const iceberg::ManifestFile& file) { + if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { + if (snapshot_id_ != file.added_snapshot_id) { + return InvalidManifestList( + "Found unassigned sequence number for a manifest from snapshot: %s", + file.added_snapshot_id); + } + return sequence_number_; + } + return file.min_sequence_number; +} + +} // namespace iceberg diff --git a/src/iceberg/v2_metadata.h b/src/iceberg/v2_metadata.h index d6ff6aa3a..788295427 100644 --- a/src/iceberg/v2_metadata.h +++ b/src/iceberg/v2_metadata.h @@ -21,8 +21,6 @@ /// \file iceberg/v2_metadata.h -#include - #include "iceberg/manifest_adapter.h" namespace iceberg { @@ -31,32 +29,42 @@ namespace iceberg { class ManifestEntryAdapterV2 : public ManifestEntryAdapter { public: ManifestEntryAdapterV2(std::optional snapshot_id, - std::shared_ptr schema) { - // TODO(xiao.dong): init v2 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestEntry& entry) override { return {}; } - Result FinishAppending() override { return {}; } + std::shared_ptr partition_schema, + std::shared_ptr partition_spec) + : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), + snapshot_id_(snapshot_id) {} + Status Init() override; + Status Append(const ManifestEntry& entry) override; + + protected: + Result> GetWrappedSequenceNumber( + const ManifestEntry& entry) override; + Result> GetWrappedReferenceDataFile( + const std::shared_ptr& file) override; private: - std::shared_ptr manifest_schema_; - ArrowSchema schema_; // converted from manifest_schema_ + std::optional snapshot_id_; }; /// \brief Adapter to convert V2 ManifestFile to `ArrowArray`. class ManifestFileAdapterV2 : public ManifestFileAdapter { public: ManifestFileAdapterV2(int64_t snapshot_id, std::optional parent_snapshot_id, - int64_t sequence_number, std::shared_ptr schema) { - // TODO(xiao.dong): init v2 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestFile& file) override { return {}; } - Result FinishAppending() override { return {}; } + int64_t sequence_number) + : snapshot_id_(snapshot_id), + parent_snapshot_id_(parent_snapshot_id), + sequence_number_(sequence_number) {} + Status Init() override; + Status Append(const ManifestFile& file) override; + + protected: + Result GetWrappedSequenceNumber(const ManifestFile& file) override; + Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; private: - std::shared_ptr manifest_list_schema_; - ArrowSchema schema_; // converted from manifest_list_schema_ + int64_t snapshot_id_; + std::optional parent_snapshot_id_; + int64_t sequence_number_; }; } // namespace iceberg diff --git a/src/iceberg/v3_metadata.cc b/src/iceberg/v3_metadata.cc new file mode 100644 index 000000000..534abe48d --- /dev/null +++ b/src/iceberg/v3_metadata.cc @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include "iceberg/v3_metadata.h" + +#include "iceberg/manifest_entry.h" +#include "iceberg/manifest_list.h" +#include "iceberg/schema.h" +#include "iceberg/util/macros.h" + +namespace iceberg { + +Status ManifestEntryAdapterV3::Init() { + static std::unordered_set compatible_fields{ + 0, 1, 2, 3, 4, 134, 100, 101, 102, 103, 104, 108, 109, + 110, 137, 125, 128, 131, 132, 135, 140, 142, 143, 144, 145, + }; + // TODO(xiao.dong) schema to json + metadata_["schema"] = "{}"; + // TODO(xiao.dong) partition spec to json + metadata_["partition-spec"] = "{}"; + if (partition_spec_ != nullptr) { + metadata_["partition-spec-id"] = std::to_string(partition_spec_->spec_id()); + } + metadata_["format-version"] = "3"; + metadata_["content"] = "data"; + return InitSchema(compatible_fields); +} + +Status ManifestEntryAdapterV3::Append(const iceberg::ManifestEntry& entry) { + return AppendInternal(entry); +} + +Result> ManifestEntryAdapterV3::GetWrappedSequenceNumber( + const iceberg::ManifestEntry& entry) { + if (!entry.sequence_number.has_value()) { + // if the entry's data sequence number is null, + // then it will inherit the sequence number of the current commit. + // to validate that this is correct, check that the snapshot id is either null (will + // also be inherited) or that it matches the id of the current commit. + if (entry.snapshot_id.has_value() && entry.snapshot_id.value() != snapshot_id_) { + return InvalidManifest( + "Found unassigned sequence number for an entry from snapshot: {}", + entry.snapshot_id.value()); + } + + // inheritance should work only for ADDED entries + if (entry.status != ManifestStatus::kAdded) { + return InvalidManifest( + "Only entries with status ADDED can have null sequence number"); + } + + return std::nullopt; + } + return entry.sequence_number; +} + +Result> ManifestEntryAdapterV3::GetWrappedReferenceDataFile( + const std::shared_ptr& file) { + if (file->content == DataFile::Content::kPositionDeletes) { + return file->referenced_data_file; + } + return std::nullopt; +} + +Result> ManifestEntryAdapterV3::GetWrappedFirstRowId( + const std::shared_ptr& file) { + if (file->content == DataFile::Content::kData) { + return file->first_row_id; + } + return std::nullopt; +} + +Result> ManifestEntryAdapterV3::GetWrappedContentOffset( + const std::shared_ptr& file) { + if (file->content == DataFile::Content::kPositionDeletes) { + return file->content_offset; + } + return std::nullopt; +} + +Result> ManifestEntryAdapterV3::GetWrappedContentSizeInBytes( + const std::shared_ptr& file) { + if (file->content == DataFile::Content::kPositionDeletes) { + return file->content_size_in_bytes; + } + return std::nullopt; +} + +Status ManifestFileAdapterV3::Init() { + static std::unordered_set compatible_fields{ + 500, 501, 502, 517, 515, 516, 503, 504, 505, 506, 512, 513, 514, 507, 519, 520, + }; + metadata_["snapshot-id"] = std::to_string(snapshot_id_); + metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() + ? std::to_string(parent_snapshot_id_.value()) + : "null"; + metadata_["sequence-number"] = std::to_string(sequence_number_); + metadata_["first-row-id"] = + next_row_id_.has_value() ? std::to_string(next_row_id_.value()) : "null"; + metadata_["format-version"] = "3"; + return InitSchema(compatible_fields); +} + +Status ManifestFileAdapterV3::Append(const iceberg::ManifestFile& file) { + auto status = AppendInternal(file); + ICEBERG_RETURN_UNEXPECTED(status); + if (WrappedFirstRowId(file) && next_row_id_.has_value()) { + next_row_id_ = next_row_id_.value() + file.existing_rows_count.value_or(0) + + file.added_rows_count.value_or(0); + } + return status; +} + +Result ManifestFileAdapterV3::GetWrappedSequenceNumber( + const iceberg::ManifestFile& file) { + if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { + if (snapshot_id_ != file.added_snapshot_id) { + return InvalidManifestList( + "Found unassigned sequence number for a manifest from snapshot: %s", + file.added_snapshot_id); + } + return sequence_number_; + } + return file.sequence_number; +} + +Result ManifestFileAdapterV3::GetWrappedMinSequenceNumber( + const iceberg::ManifestFile& file) { + if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { + if (snapshot_id_ != file.added_snapshot_id) { + return InvalidManifestList( + "Found unassigned sequence number for a manifest from snapshot: %s", + file.added_snapshot_id); + } + return sequence_number_; + } + return file.min_sequence_number; +} + +Result> ManifestFileAdapterV3::GetWrappedFirstRowId( + const iceberg::ManifestFile& file) { + if (WrappedFirstRowId(file)) { + return next_row_id_; + } else if (file.content != ManifestFile::Content::kData) { + return std::nullopt; + } else { + if (!file.first_row_id.has_value()) { + return InvalidManifestList("Found unassigned first-row-id for file:{}", + file.manifest_path); + } + return file.first_row_id.value(); + } +} + +bool ManifestFileAdapterV3::WrappedFirstRowId(const iceberg::ManifestFile& file) { + return file.content == ManifestFile::Content::kData && !file.first_row_id.has_value(); +} + +} // namespace iceberg diff --git a/src/iceberg/v3_metadata.h b/src/iceberg/v3_metadata.h index e7bcc3552..d27116416 100644 --- a/src/iceberg/v3_metadata.h +++ b/src/iceberg/v3_metadata.h @@ -21,44 +21,65 @@ /// \file iceberg/v3_metadata.h -#include - #include "iceberg/manifest_adapter.h" namespace iceberg { -/// \brief Adapter to convert V3ManifestEntry to `ArrowArray`. +/// \brief Adapter to convert V3 ManifestEntry to `ArrowArray`. class ManifestEntryAdapterV3 : public ManifestEntryAdapter { public: ManifestEntryAdapterV3(std::optional snapshot_id, std::optional first_row_id, - std::shared_ptr schema) { - // TODO(xiao.dong): init v3 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestEntry& entry) override { return {}; } - Result FinishAppending() override { return {}; } + std::shared_ptr partition_schema, + std::shared_ptr partition_spec) + : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), + snapshot_id_(snapshot_id), + first_row_id_(first_row_id) {} + Status Init() override; + Status Append(const ManifestEntry& entry) override; + + protected: + Result> GetWrappedSequenceNumber( + const ManifestEntry& entry) override; + Result> GetWrappedReferenceDataFile( + const std::shared_ptr& file) override; + Result> GetWrappedFirstRowId( + const std::shared_ptr& file) override; + Result> GetWrappedContentOffset( + const std::shared_ptr& file) override; + Result> GetWrappedContentSizeInBytes( + const std::shared_ptr& file) override; private: - std::shared_ptr manifest_schema_; - ArrowSchema schema_; // converted from manifest_schema_ + std::optional snapshot_id_; + std::optional first_row_id_; }; /// \brief Adapter to convert V3 ManifestFile to `ArrowArray`. class ManifestFileAdapterV3 : public ManifestFileAdapter { public: ManifestFileAdapterV3(int64_t snapshot_id, std::optional parent_snapshot_id, - int64_t sequence_number, std::optional first_row_id, - std::shared_ptr schema) { - // TODO(xiao.dong): init v3 schema - } - Status StartAppending() override { return {}; } - Status Append(const ManifestFile& file) override { return {}; } - Result FinishAppending() override { return {}; } + int64_t sequence_number, std::optional first_row_id) + : snapshot_id_(snapshot_id), + parent_snapshot_id_(parent_snapshot_id), + sequence_number_(sequence_number), + next_row_id_(first_row_id) {} + Status Init() override; + Status Append(const ManifestFile& file) override; + + protected: + Result GetWrappedSequenceNumber(const ManifestFile& file) override; + Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; + Result> GetWrappedFirstRowId(const ManifestFile& file) override; + + private: + bool WrappedFirstRowId(const ManifestFile& file); private: - std::shared_ptr manifest_list_schema_; - ArrowSchema schema_; // converted from manifest_list_schema_ + int64_t snapshot_id_; + std::optional parent_snapshot_id_; + int64_t sequence_number_; + std::optional next_row_id_; }; } // namespace iceberg From 449b4633b593d9d233fca05ea1d9909e194c6749 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 9 Sep 2025 11:26:51 +0800 Subject: [PATCH 02/17] fix warning --- src/iceberg/manifest_adapter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 0577ad326..45777061c 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "iceberg/arrow_c_data.h" #include "iceberg/result.h" From e34bb503ce43f02bba851b86779f0af826fa8886 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 9 Sep 2025 12:06:16 +0800 Subject: [PATCH 03/17] fix schema release --- src/iceberg/manifest_adapter.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index e595fc038..8419527ea 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -90,11 +90,12 @@ Status ManifestAdapter::AppendField(ArrowArray* arrowArray, } ManifestEntryAdapter::~ManifestEntryAdapter() { - // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the - // internal array, so we have no need to release it here. - // if (is_initialized_) { - // ArrowArrayRelease(&array_); - // } + if (is_initialized_) { + // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the + // internal array, so we have no need to release it here. + // ArrowArrayRelease(&array_); + ArrowSchemaRelease(&schema_); + } } std::shared_ptr ManifestEntryAdapter::GetManifestEntryStructType() { @@ -471,11 +472,12 @@ Status ManifestEntryAdapter::InitSchema(const std::unordered_set& field } ManifestFileAdapter::~ManifestFileAdapter() { - // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the - // internal array, so we have no need to release it here. - // if (is_initialized_) { - // ArrowArrayRelease(&array_); - // } + if (is_initialized_) { + // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the + // internal array, so we have no need to release it here. + // ArrowArrayRelease(&array_); + ArrowSchemaRelease(&schema_); + } } Status ManifestFileAdapter::AppendPartitions( From 470e10296e6c7e654d6d7d68562d2aeb29cd1927 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 9 Sep 2025 12:27:12 +0800 Subject: [PATCH 04/17] fix windows build --- src/iceberg/manifest_adapter.cc | 5 ++++- src/iceberg/manifest_adapter.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index 8419527ea..da99ea517 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -26,6 +26,10 @@ #include "iceberg/util/macros.h" #include "nanoarrow/nanoarrow.h" +namespace { +static constexpr int64_t kBlockSizeInBytes = 64 * 1024 * 1024L; +} + namespace iceberg { Status ManifestAdapter::StartAppending() { @@ -264,7 +268,6 @@ Status ManifestEntryAdapter::AppendDataFile( break; case 105: // block_size_in_bytes (compatible in v1) // always 64MB for v1 - static const int64_t kBlockSizeInBytes = 64 * 1024 * 1024L; ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytes)); break; case 108: // column_sizes (optional map) diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 45777061c..50210bad7 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -24,6 +24,7 @@ #include #include +#include #include #include #include From 39c7c4d906f2ac74614b5edba97c97fad2661d9d Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Tue, 9 Sep 2025 15:56:48 +0800 Subject: [PATCH 05/17] fix adapter init --- src/iceberg/manifest_adapter.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index da99ea517..f32196b85 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -36,10 +36,6 @@ Status ManifestAdapter::StartAppending() { if (size_ > 0) { return InvalidArgument("Adapter buffer not empty, cannot start appending."); } - if (is_initialized_) { - // reset buffer - ArrowArrayRelease(&array_); - } array_ = {}; size_ = 0; ArrowError error; @@ -47,7 +43,6 @@ Status ManifestAdapter::StartAppending() { NANOARROW_RETURN_IF_NOT_OK(status, error); status = ArrowArrayStartAppending(&array_); NANOARROW_RETURN_IF_NOT_OK(status, error); - is_initialized_ = true; return {}; } @@ -471,6 +466,7 @@ Status ManifestEntryAdapter::InitSchema(const std::unordered_set& field } manifest_schema_ = std::make_shared(fields); ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_schema_, &schema_)); + is_initialized_ = true; return {}; } @@ -701,6 +697,7 @@ Status ManifestFileAdapter::InitSchema(const std::unordered_set& fields } manifest_list_schema_ = std::make_shared(fields); ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_list_schema_, &schema_)); + is_initialized_ = true; return {}; } From 0fce68ab330a9e4e6d0dab260536e7f92caf190d Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 12 Sep 2025 09:23:13 +0800 Subject: [PATCH 06/17] change some func into static --- src/iceberg/manifest_adapter.h | 35 ++++++++++--------- .../test/manifest_list_reader_writer_test.cc | 1 - .../test/manifest_reader_writer_test.cc | 1 - src/iceberg/v1_metadata.cc | 4 +-- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 50210bad7..baa27771d 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -48,11 +48,11 @@ class ICEBERG_EXPORT ManifestAdapter { virtual std::shared_ptr schema() const = 0; protected: - Status AppendField(ArrowArray* arrowArray, int64_t value); - Status AppendField(ArrowArray* arrowArray, uint64_t value); - Status AppendField(ArrowArray* arrowArray, double value); - Status AppendField(ArrowArray* arrowArray, std::string_view value); - Status AppendField(ArrowArray* arrowArray, const std::vector& value); + static Status AppendField(ArrowArray* arrowArray, int64_t value); + static Status AppendField(ArrowArray* arrowArray, uint64_t value); + static Status AppendField(ArrowArray* arrowArray, double value); + static Status AppendField(ArrowArray* arrowArray, std::string_view value); + static Status AppendField(ArrowArray* arrowArray, const std::vector& value); protected: bool is_initialized_ = false; @@ -82,14 +82,17 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { Status AppendDataFile(ArrowArray* arrow_array, const std::shared_ptr& data_file_type, const std::shared_ptr& file); - Status AppendPartitions(ArrowArray* arrow_array, - const std::shared_ptr& partition_type, - const std::vector& partitions); - Status AppendList(ArrowArray* arrow_array, const std::vector& list_value); - Status AppendList(ArrowArray* arrow_array, const std::vector& list_value); - Status AppendMap(ArrowArray* arrow_array, const std::map& map_value); - Status AppendMap(ArrowArray* arrow_array, - const std::map>& map_value); + static Status AppendPartitions(ArrowArray* arrow_array, + const std::shared_ptr& partition_type, + const std::vector& partitions); + static Status AppendList(ArrowArray* arrow_array, + const std::vector& list_value); + static Status AppendList(ArrowArray* arrow_array, + const std::vector& list_value); + static Status AppendMap(ArrowArray* arrow_array, + const std::map& map_value); + static Status AppendMap(ArrowArray* arrow_array, + const std::map>& map_value); virtual Result> GetWrappedSequenceNumber( const ManifestEntry& entry); @@ -123,9 +126,9 @@ class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { protected: Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestFile& file); - Status AppendPartitions(ArrowArray* arrow_array, - const std::shared_ptr& partition_type, - const std::vector& partitions); + static Status AppendPartitions(ArrowArray* arrow_array, + const std::shared_ptr& partition_type, + const std::vector& partitions); virtual Result GetWrappedSequenceNumber(const ManifestFile& file); virtual Result GetWrappedMinSequenceNumber(const ManifestFile& file); diff --git a/src/iceberg/test/manifest_list_reader_writer_test.cc b/src/iceberg/test/manifest_list_reader_writer_test.cc index da91d5415..793dd33b5 100644 --- a/src/iceberg/test/manifest_list_reader_writer_test.cc +++ b/src/iceberg/test/manifest_list_reader_writer_test.cc @@ -18,7 +18,6 @@ */ #include -#include #include #include "iceberg/arrow/arrow_fs_file_io_internal.h" diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc index 54165455f..c9865c0cc 100644 --- a/src/iceberg/test/manifest_reader_writer_test.cc +++ b/src/iceberg/test/manifest_reader_writer_test.cc @@ -28,7 +28,6 @@ #include "iceberg/manifest_list.h" #include "iceberg/manifest_reader.h" #include "iceberg/manifest_writer.h" -#include "iceberg/partition_spec.h" #include "iceberg/schema.h" #include "iceberg/transform.h" #include "matchers.h" diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc index 2fda59100..db2e287cc 100644 --- a/src/iceberg/v1_metadata.cc +++ b/src/iceberg/v1_metadata.cc @@ -21,7 +21,6 @@ #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" -#include "iceberg/partition_spec.h" #include "iceberg/schema.h" namespace iceberg { @@ -46,7 +45,8 @@ Status ManifestEntryAdapterV1::Append(const iceberg::ManifestEntry& entry) { } std::shared_ptr ManifestEntryAdapterV1::GetManifestEntryStructType() { - // V1 compatible fields. Official suggestions: + // 'block_size_in_bytes' (ID 105) is a deprecated field that is REQUIRED + // in the v1 data_file schema for backward compatibility. // Deprecated. Always write a default in v1. Do not write in v2 or v3. static const SchemaField kBlockSizeInBytes = SchemaField::MakeRequired( 105, "block_size_in_bytes", iceberg::int64(), "Block size in bytes"); From ff018cbdbe9efd415825e7e7ff8e7c2aae016f5f Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 12 Sep 2025 09:59:54 +0800 Subject: [PATCH 07/17] fix partition literal support type list --- src/iceberg/manifest_adapter.cc | 18 +++++++++++++++++- src/iceberg/manifest_adapter.h | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index f32196b85..4f53aad44 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -88,6 +88,16 @@ Status ManifestAdapter::AppendField(ArrowArray* arrowArray, return {}; } +Status ManifestAdapter::AppendField(ArrowArray* arrowArray, + const std::array& value) { + ArrowBufferViewData data; + data.as_char = reinterpret_cast(value.data()); + ArrowBufferView view(data, value.size()); + auto status = ArrowArrayAppendBytes(arrowArray, view); + NANOARROW_RETURN_IF_FAILED(status); + return {}; +} + ManifestEntryAdapter::~ManifestEntryAdapter() { if (is_initialized_) { // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the @@ -144,6 +154,7 @@ Status ManifestEntryAdapter::AppendPartitions( ICEBERG_RETURN_UNEXPECTED( AppendField(array, std::get(partition.value()))); break; + case TypeId::kFixed: case TypeId::kBinary: ICEBERG_RETURN_UNEXPECTED( AppendField(array, std::get>(partition.value()))); @@ -159,8 +170,13 @@ Status ManifestEntryAdapter::AppendPartitions( AppendField(array, std::get(partition.value()))); break; case TypeId::kDecimal: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get>(partition.value()))); + break; case TypeId::kUuid: - case TypeId::kFixed: + case TypeId::kStruct: + case TypeId::kList: + case TypeId::kMap: // TODO(xiao.dong) currently literal does not support those types default: return InvalidManifest("Unsupported partition type: {}", field.ToString()); diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index baa27771d..1675ab1ed 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -53,6 +53,7 @@ class ICEBERG_EXPORT ManifestAdapter { static Status AppendField(ArrowArray* arrowArray, double value); static Status AppendField(ArrowArray* arrowArray, std::string_view value); static Status AppendField(ArrowArray* arrowArray, const std::vector& value); + static Status AppendField(ArrowArray* arrowArray, const std::array& value); protected: bool is_initialized_ = false; From 6fb2b91802e52fbaff415f161587394b80ffe016 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 12 Sep 2025 16:08:57 +0800 Subject: [PATCH 08/17] fix comments --- src/iceberg/CMakeLists.txt | 18 +- .../nanoarrow_error_transform_internal.h | 25 ++ src/iceberg/avro/avro_writer.cc | 4 +- src/iceberg/avro/avro_writer.h | 2 +- src/iceberg/file_writer.h | 2 +- src/iceberg/manifest_adapter.cc | 348 ++++++++---------- src/iceberg/manifest_adapter.h | 45 ++- src/iceberg/manifest_entry.cc | 10 +- src/iceberg/manifest_entry.h | 3 +- src/iceberg/manifest_reader_internal.cc | 13 +- src/iceberg/manifest_writer.cc | 27 +- src/iceberg/manifest_writer.h | 10 +- src/iceberg/parquet/parquet_writer.cc | 4 +- src/iceberg/parquet/parquet_writer.h | 2 +- src/iceberg/partition_spec.cc | 46 +++ src/iceberg/partition_spec.h | 6 + .../test/manifest_reader_writer_test.cc | 15 +- src/iceberg/util/macros.h | 10 - src/iceberg/v1_metadata.cc | 51 ++- src/iceberg/v1_metadata.h | 6 +- src/iceberg/v2_metadata.cc | 61 ++- src/iceberg/v2_metadata.h | 9 +- src/iceberg/v3_metadata.cc | 69 +++- src/iceberg/v3_metadata.h | 8 +- 24 files changed, 454 insertions(+), 340 deletions(-) create mode 100644 src/iceberg/arrow/nanoarrow_error_transform_internal.h diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt index 244b232c1..75ac9c847 100644 --- a/src/iceberg/CMakeLists.txt +++ b/src/iceberg/CMakeLists.txt @@ -18,6 +18,7 @@ set(ICEBERG_INCLUDES "$" "$") set(ICEBERG_SOURCES + arrow_c_data_guard_internal.cc catalog/memory/in_memory_catalog.cc expression/expression.cc expression/expressions.cc @@ -28,8 +29,12 @@ set(ICEBERG_SOURCES file_writer.cc inheritable_metadata.cc json_internal.cc + manifest_adapter.cc manifest_entry.cc manifest_list.cc + manifest_reader.cc + manifest_reader_internal.cc + manifest_writer.cc metadata_columns.cc name_mapping.cc partition_field.cc @@ -51,20 +56,15 @@ set(ICEBERG_SOURCES transform.cc transform_function.cc type.cc - manifest_reader.cc - manifest_reader_internal.cc - manifest_writer.cc - manifest_adapter.cc - v1_metadata.cc - v2_metadata.cc - v3_metadata.cc - arrow_c_data_guard_internal.cc util/conversions.cc util/decimal.cc util/gzip_internal.cc util/murmurhash3_internal.cc util/timepoint.cc - util/uuid.cc) + util/uuid.cc + v1_metadata.cc + v2_metadata.cc + v3_metadata.cc) set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS) set(ICEBERG_SHARED_BUILD_INTERFACE_LIBS) diff --git a/src/iceberg/arrow/nanoarrow_error_transform_internal.h b/src/iceberg/arrow/nanoarrow_error_transform_internal.h new file mode 100644 index 000000000..2b32fea0c --- /dev/null +++ b/src/iceberg/arrow/nanoarrow_error_transform_internal.h @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error msg: {}", error.message); \ + } diff --git a/src/iceberg/avro/avro_writer.cc b/src/iceberg/avro/avro_writer.cc index 4d82fb9a3..bfa82c762 100644 --- a/src/iceberg/avro/avro_writer.cc +++ b/src/iceberg/avro/avro_writer.cc @@ -75,7 +75,7 @@ class AvroWriter::Impl { return {}; } - Status Write(ArrowArray data) { + Status Write(ArrowArray& data) { ICEBERG_ARROW_ASSIGN_OR_RETURN(auto result, ::arrow::ImportArray(&data, &arrow_schema_)); @@ -119,7 +119,7 @@ class AvroWriter::Impl { AvroWriter::~AvroWriter() = default; -Status AvroWriter::Write(ArrowArray data) { return impl_->Write(data); } +Status AvroWriter::Write(ArrowArray& data) { return impl_->Write(data); } Status AvroWriter::Open(const WriterOptions& options) { impl_ = std::make_unique(); diff --git a/src/iceberg/avro/avro_writer.h b/src/iceberg/avro/avro_writer.h index 57499d8ed..e539a7588 100644 --- a/src/iceberg/avro/avro_writer.h +++ b/src/iceberg/avro/avro_writer.h @@ -35,7 +35,7 @@ class ICEBERG_BUNDLE_EXPORT AvroWriter : public Writer { Status Close() final; - Status Write(ArrowArray data) final; + Status Write(ArrowArray& data) final; std::optional metrics() final; diff --git a/src/iceberg/file_writer.h b/src/iceberg/file_writer.h index 135151afb..874c983fa 100644 --- a/src/iceberg/file_writer.h +++ b/src/iceberg/file_writer.h @@ -65,7 +65,7 @@ class ICEBERG_EXPORT Writer { /// \brief Write arrow data to the file. /// /// \return Status of write results. - virtual Status Write(ArrowArray data) = 0; + virtual Status Write(ArrowArray& data) = 0; /// \brief Get the file statistics. /// Only valid after the file is closed. diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index 4f53aad44..71296f8c0 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -19,15 +19,22 @@ #include "iceberg/manifest_adapter.h" +#include "iceberg/arrow/nanoarrow_error_transform_internal.h" #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" #include "iceberg/schema.h" #include "iceberg/schema_internal.h" +#include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" #include "nanoarrow/nanoarrow.h" +#define NANOARROW_RETURN_IF_FAILED(status) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("Nanoarrow error code: {}", status); \ + } + namespace { -static constexpr int64_t kBlockSizeInBytes = 64 * 1024 * 1024L; +static constexpr int64_t kBlockSizeInBytesV1 = 64 * 1024 * 1024L; } namespace iceberg { @@ -36,155 +43,143 @@ Status ManifestAdapter::StartAppending() { if (size_ > 0) { return InvalidArgument("Adapter buffer not empty, cannot start appending."); } - array_ = {}; + array_ = std::make_shared(); size_ = 0; ArrowError error; - auto status = ArrowArrayInitFromSchema(&array_, &schema_, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); - status = ArrowArrayStartAppending(&array_); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK( + ArrowArrayInitFromSchema(array_.get(), &schema_, &error), error); + NANOARROW_RETURN_IF_FAILED(ArrowArrayStartAppending(array_.get())); return {}; } -Result ManifestAdapter::FinishAppending() { +Result> ManifestAdapter::FinishAppending() { ArrowError error; - auto status = ArrowArrayFinishBuildingDefault(&array_, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK( + ArrowArrayFinishBuildingDefault(array_.get(), &error), error); return array_; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, int64_t value) { - auto status = ArrowArrayAppendInt(arrowArray, value); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendInt(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, uint64_t value) { - auto status = ArrowArrayAppendUInt(arrowArray, value); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendUInt(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, double value) { - auto status = ArrowArrayAppendDouble(arrowArray, value); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendDouble(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, std::string_view value) { ArrowStringView view(value.data(), value.size()); - auto status = ArrowArrayAppendString(arrowArray, view); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendString(arrowArray, view)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, - const std::vector& value) { + const std::span& value) { ArrowBufferViewData data; data.as_char = reinterpret_cast(value.data()); ArrowBufferView view(data, value.size()); - auto status = ArrowArrayAppendBytes(arrowArray, view); - NANOARROW_RETURN_IF_FAILED(status); - return {}; -} - -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, - const std::array& value) { - ArrowBufferViewData data; - data.as_char = reinterpret_cast(value.data()); - ArrowBufferView view(data, value.size()); - auto status = ArrowArrayAppendBytes(arrowArray, view); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendBytes(arrowArray, view)); return {}; } ManifestEntryAdapter::~ManifestEntryAdapter() { - if (is_initialized_) { - // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the - // internal array, so we have no need to release it here. - // ArrowArrayRelease(&array_); + if (array_ != nullptr && array_->release != nullptr) { + ArrowArrayRelease(array_.get()); + } + if (schema_.release != nullptr) { ArrowSchemaRelease(&schema_); } } -std::shared_ptr ManifestEntryAdapter::GetManifestEntryStructType() { - return ManifestEntry::TypeFromPartitionType(std::move(partition_schema_)); +Result> ManifestEntryAdapter::GetManifestEntryStructType() { + if (partition_spec_ == nullptr) { + return ManifestEntry::TypeFromPartitionType(nullptr); + } + ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->partition_schema()); + return ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); } -Status ManifestEntryAdapter::AppendPartitions( +Status ManifestEntryAdapter::AppendPartition( ArrowArray* arrow_array, const std::shared_ptr& partition_type, const std::vector& partitions) { - if (arrow_array->n_children != partition_type->fields().size()) { - return InvalidManifest("Partition arrow not match partition type."); + if (arrow_array->n_children != partition_type->fields().size()) [[unlikely]] { + return InvalidManifest("Arrow array of partition does not match partition type."); + } + if (partitions.size() != partition_type->fields().size()) [[unlikely]] { + return InvalidManifest("Literal list of partition does not match partition type."); } auto fields = partition_type->fields(); - for (const auto& partition : partitions) { - for (int32_t i = 0; i < fields.size(); i++) { - const auto& field = fields[i]; - auto array = arrow_array->children[i]; - if (partition.IsNull()) { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); - continue; - } - switch (field.type()->type_id()) { - case TypeId::kBoolean: - ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast( - std::get(partition.value()) == true ? 1L : 0L))); - break; - case TypeId::kInt: - ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast(std::get(partition.value())))); - break; - case TypeId::kLong: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); - break; - case TypeId::kFloat: - ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast(std::get(partition.value())))); - break; - case TypeId::kDouble: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); - break; - case TypeId::kString: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); - break; - case TypeId::kFixed: - case TypeId::kBinary: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get>(partition.value()))); - break; - case TypeId::kDate: - ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast(std::get(partition.value())))); - break; - case TypeId::kTime: - case TypeId::kTimestamp: - case TypeId::kTimestampTz: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); - break; - case TypeId::kDecimal: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get>(partition.value()))); - break; - case TypeId::kUuid: - case TypeId::kStruct: - case TypeId::kList: - case TypeId::kMap: - // TODO(xiao.dong) currently literal does not support those types - default: - return InvalidManifest("Unsupported partition type: {}", field.ToString()); - } + for (int32_t i = 0; i < fields.size(); i++) { + const auto& partition = partitions[i]; + const auto& field = fields[i]; + auto array = arrow_array->children[i]; + if (partition.IsNull()) { + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + continue; + } + switch (field.type()->type_id()) { + case TypeId::kBoolean: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, + static_cast(std::get(partition.value()) == true ? 1L : 0L))); + break; + case TypeId::kInt: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(std::get(partition.value())))); + break; + case TypeId::kLong: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kFloat: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, static_cast(std::get(partition.value())))); + break; + case TypeId::kDouble: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kString: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kFixed: + case TypeId::kBinary: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get>(partition.value()))); + break; + case TypeId::kDate: + ICEBERG_RETURN_UNEXPECTED(AppendField( + array, static_cast(std::get(partition.value())))); + break; + case TypeId::kTime: + case TypeId::kTimestamp: + case TypeId::kTimestampTz: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get(partition.value()))); + break; + case TypeId::kDecimal: + ICEBERG_RETURN_UNEXPECTED( + AppendField(array, std::get>(partition.value()))); + break; + case TypeId::kUuid: + case TypeId::kStruct: + case TypeId::kList: + case TypeId::kMap: + // TODO(xiao.dong) currently literal does not support those types + default: + return InvalidManifest("Unsupported partition type: {}", field.ToString()); } } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -192,11 +187,10 @@ Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, const std::vector& list_value) { auto list_array = arrow_array->children[0]; for (const auto& value : list_value) { - auto status = ArrowArrayAppendInt(list_array, static_cast(value)); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED( + ArrowArrayAppendInt(list_array, static_cast(value))); } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -204,11 +198,9 @@ Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, const std::vector& list_value) { auto list_array = arrow_array->children[0]; for (const auto& value : list_value) { - auto status = ArrowArrayAppendInt(list_array, value); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendInt(list_array, value)); } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -224,8 +216,7 @@ Status ManifestEntryAdapter::AppendMap(ArrowArray* arrow_array, ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -241,8 +232,7 @@ Status ManifestEntryAdapter::AppendMap( ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -267,9 +257,9 @@ Status ManifestEntryAdapter::AppendDataFile( break; case 102: // partition (required struct) { - auto partition_type = std::dynamic_pointer_cast(field.type()); + auto partition_type = internal::checked_pointer_cast(field.type()); ICEBERG_RETURN_UNEXPECTED( - AppendPartitions(array, partition_type, file->partition)); + AppendPartition(array, partition_type, file->partition)); } break; case 103: // record_count (required int64) ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->record_count)); @@ -279,7 +269,7 @@ Status ManifestEntryAdapter::AppendDataFile( break; case 105: // block_size_in_bytes (compatible in v1) // always 64MB for v1 - ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytes)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytesV1)); break; case 108: // column_sizes (optional map) ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->column_sizes)); @@ -303,8 +293,7 @@ Status ManifestEntryAdapter::AppendDataFile( if (!file->key_metadata.empty()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->key_metadata)); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 132: // split_offsets (optional list) @@ -318,16 +307,14 @@ Status ManifestEntryAdapter::AppendDataFile( ICEBERG_RETURN_UNEXPECTED( AppendField(array, static_cast(file->sort_order_id.value()))); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 142: // first_row_id (optional int64) if (file->first_row_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->first_row_id.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 143: // referenced_data_file (optional string) @@ -337,8 +324,7 @@ Status ManifestEntryAdapter::AppendDataFile( if (referenced_data_file.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, referenced_data_file.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; } @@ -346,8 +332,7 @@ Status ManifestEntryAdapter::AppendDataFile( if (file->content_offset.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->content_offset.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 145: // content_size_in_bytes (optional int64) @@ -355,21 +340,19 @@ Status ManifestEntryAdapter::AppendDataFile( ICEBERG_RETURN_UNEXPECTED( AppendField(array, file->content_size_in_bytes.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; default: return InvalidManifest("Unknown data file field id: {} ", field.field_id()); } } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } -Result> ManifestEntryAdapter::GetWrappedSequenceNumber( - const iceberg::ManifestEntry& entry) { +Result> ManifestEntryAdapter::GetSequenceNumber( + const ManifestEntry& entry) { return entry.sequence_number; } @@ -393,11 +376,11 @@ Result> ManifestEntryAdapter::GetWrappedContentSizeInByte return file->content_size_in_bytes; } -Status ManifestEntryAdapter::AppendInternal(const iceberg::ManifestEntry& entry) { +Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { const auto& fields = manifest_schema_->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - auto array = array_.children[i]; + auto array = array_->children[i]; switch (field.field_id()) { case 0: // status (required int32) @@ -408,29 +391,26 @@ Status ManifestEntryAdapter::AppendInternal(const iceberg::ManifestEntry& entry) if (entry.snapshot_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, entry.snapshot_id.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 2: // data_file (required struct) if (entry.data_file) { // Get the data file type from the field - auto data_file_type = std::dynamic_pointer_cast(field.type()); + auto data_file_type = internal::checked_pointer_cast(field.type()); ICEBERG_RETURN_UNEXPECTED( AppendDataFile(array, data_file_type, entry.data_file)); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 3: // sequence_number (optional int64) { - ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetWrappedSequenceNumber(entry)); + ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetSequenceNumber(entry)); if (sequence_num.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; } @@ -439,8 +419,7 @@ Status ManifestEntryAdapter::AppendInternal(const iceberg::ManifestEntry& entry) ICEBERG_RETURN_UNEXPECTED( AppendField(array, entry.file_sequence_number.value())); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; default: @@ -448,14 +427,13 @@ Status ManifestEntryAdapter::AppendInternal(const iceberg::ManifestEntry& entry) } } - auto status = ArrowArrayFinishElement(&array_); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array_.get())); size_++; return {}; } Status ManifestEntryAdapter::InitSchema(const std::unordered_set& fields_ids) { - auto manifest_entry_schema = GetManifestEntryStructType(); + ICEBERG_ASSIGN_OR_RAISE(auto manifest_entry_schema, GetManifestEntryStructType()) auto fields_span = manifest_entry_schema->fields(); std::vector fields; // TODO(xiao.dong) make this a common function to recursive handle @@ -463,7 +441,7 @@ Status ManifestEntryAdapter::InitSchema(const std::unordered_set& field for (const auto& field : fields_span) { if (field.field_id() == 2) { // handle data_file field - auto data_file_struct = std::dynamic_pointer_cast(field.type()); + auto data_file_struct = internal::checked_pointer_cast(field.type()); std::vector data_file_fields; for (const auto& data_file_field : data_file_struct->fields()) { if (fields_ids.contains(data_file_field.field_id())) { @@ -482,15 +460,14 @@ Status ManifestEntryAdapter::InitSchema(const std::unordered_set& field } manifest_schema_ = std::make_shared(fields); ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_schema_, &schema_)); - is_initialized_ = true; return {}; } ManifestFileAdapter::~ManifestFileAdapter() { - if (is_initialized_) { - // arrow::ImportedArrayData::Release() bridge.cc:1478 will release the - // internal array, so we have no need to release it here. - // ArrowArrayRelease(&array_); + if (array_ != nullptr && array_->release != nullptr) { + ArrowArrayRelease(array_.get()); + } + if (schema_.release != nullptr) { ArrowSchemaRelease(&schema_); } } @@ -503,7 +480,7 @@ Status ManifestFileAdapter::AppendPartitions( return InvalidManifestList("Invalid partition array."); } auto partition_struct = - std::dynamic_pointer_cast(partition_type->fields()[0].type()); + internal::checked_pointer_cast(partition_type->fields()[0].type()); auto fields = partition_struct->fields(); for (const auto& partition : partitions) { for (const auto& field : fields) { @@ -521,8 +498,7 @@ Status ManifestFileAdapter::AppendPartitions( field_array, static_cast(partition.contains_nan.value() ? 1 : 0))); } else { - auto status = ArrowArrayAppendNull(field_array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); } break; } @@ -533,8 +509,7 @@ Status ManifestFileAdapter::AppendPartitions( ICEBERG_RETURN_UNEXPECTED( AppendField(field_array, partition.lower_bound.value())); } else { - auto status = ArrowArrayAppendNull(field_array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); } break; } @@ -545,8 +520,7 @@ Status ManifestFileAdapter::AppendPartitions( ICEBERG_RETURN_UNEXPECTED( AppendField(field_array, partition.upper_bound.value())); } else { - auto status = ArrowArrayAppendNull(field_array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); } break; } @@ -554,35 +528,32 @@ Status ManifestFileAdapter::AppendPartitions( return InvalidManifestList("Unknown field id: {}", field.field_id()); } } - auto status = ArrowArrayFinishElement(array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array)); } - auto status = ArrowArrayFinishElement(arrow_array); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; } -Result ManifestFileAdapter::GetWrappedSequenceNumber( - const iceberg::ManifestFile& file) { +Result ManifestFileAdapter::GetSequenceNumber(const ManifestFile& file) { return file.sequence_number; } Result ManifestFileAdapter::GetWrappedMinSequenceNumber( - const iceberg::ManifestFile& file) { + const ManifestFile& file) { return file.min_sequence_number; } Result> ManifestFileAdapter::GetWrappedFirstRowId( - const iceberg::ManifestFile& file) { + const ManifestFile& file) { return file.first_row_id; } -Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { +Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { const auto& fields = manifest_list_schema_->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - auto array = array_.children[i]; + auto array = array_->children[i]; switch (field.field_id()) { case 500: // manifest_path ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.manifest_path)); @@ -600,7 +571,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { break; case 515: // sequence_number { - ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetWrappedSequenceNumber(file)); + ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetSequenceNumber(file)); ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num)); break; } @@ -619,8 +590,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { AppendField(array, static_cast(file.added_files_count.value()))); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 505: // existing_files_count @@ -629,8 +599,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { array, static_cast(file.existing_files_count.value()))); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 506: // deleted_files_count @@ -639,8 +608,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { AppendField(array, static_cast(file.deleted_files_count.value()))); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 512: // added_rows_count @@ -648,8 +616,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.added_rows_count.value())); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 513: // existing_rows_count @@ -657,8 +624,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.existing_rows_count.value())); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 514: // deleted_rows_count @@ -666,20 +632,19 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.deleted_rows_count.value())); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 507: // partitions ICEBERG_RETURN_UNEXPECTED(AppendPartitions( - array, std::dynamic_pointer_cast(field.type()), file.partitions)); + array, internal::checked_pointer_cast(field.type()), + file.partitions)); break; case 519: // key_metadata if (!file.key_metadata.empty()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); } else { - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; case 520: // first_row_id @@ -689,8 +654,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, first_row_id.value())); } else { // Append null for optional field - auto status = ArrowArrayAppendNull(array, 1); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); } break; } @@ -698,8 +662,7 @@ Status ManifestFileAdapter::AppendInternal(const iceberg::ManifestFile& file) { return InvalidManifestList("Unknown field id: {}", field.field_id()); } } - auto status = ArrowArrayFinishElement(&array_); - NANOARROW_RETURN_IF_FAILED(status); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array_.get())); size_++; return {}; } @@ -713,7 +676,6 @@ Status ManifestFileAdapter::InitSchema(const std::unordered_set& fields } manifest_list_schema_ = std::make_shared(fields); ICEBERG_RETURN_UNEXPECTED(ToArrowSchema(*manifest_list_schema_, &schema_)); - is_initialized_ = true; return {}; } diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 1675ab1ed..4f3a758f9 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,21 +44,20 @@ class ICEBERG_EXPORT ManifestAdapter { virtual Status Init() = 0; Status StartAppending(); - Result FinishAppending(); + Result> FinishAppending(); int64_t size() const { return size_; } - virtual std::shared_ptr schema() const = 0; + virtual const std::shared_ptr& schema() const = 0; protected: static Status AppendField(ArrowArray* arrowArray, int64_t value); static Status AppendField(ArrowArray* arrowArray, uint64_t value); static Status AppendField(ArrowArray* arrowArray, double value); static Status AppendField(ArrowArray* arrowArray, std::string_view value); - static Status AppendField(ArrowArray* arrowArray, const std::vector& value); - static Status AppendField(ArrowArray* arrowArray, const std::array& value); + static Status AppendField(ArrowArray* arrowArray, + const std::span& value); protected: - bool is_initialized_ = false; - ArrowArray array_; + std::shared_ptr array_; ArrowSchema schema_; // converted from manifest_schema_ or manifest_list_schema_ int64_t size_ = 0; }; @@ -66,26 +66,29 @@ class ICEBERG_EXPORT ManifestAdapter { // append a list of `ManifestEntry`s to an `ArrowArray`. class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { public: - explicit ManifestEntryAdapter(std::shared_ptr partition_schema, - std::shared_ptr partition_spec) - : partition_spec_(std::move(partition_spec)), - partition_schema_(std::move(partition_schema)) {} + explicit ManifestEntryAdapter(std::shared_ptr partition_spec) + : partition_spec_(std::move(partition_spec)) {} ~ManifestEntryAdapter() override; virtual Status Append(const ManifestEntry& entry) = 0; - std::shared_ptr schema() const override { return manifest_schema_; } + const std::shared_ptr& schema() const override { return manifest_schema_; } protected: - virtual std::shared_ptr GetManifestEntryStructType(); + virtual Result> GetManifestEntryStructType(); + + /// \brief Init version-specific schema for each version. + /// + /// \param fields_ids each version of manifest schema has schema, we will init this + /// schema based on the fields_ids. Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestEntry& entry); Status AppendDataFile(ArrowArray* arrow_array, const std::shared_ptr& data_file_type, const std::shared_ptr& file); - static Status AppendPartitions(ArrowArray* arrow_array, - const std::shared_ptr& partition_type, - const std::vector& partitions); + static Status AppendPartition(ArrowArray* arrow_array, + const std::shared_ptr& partition_type, + const std::vector& partitions); static Status AppendList(ArrowArray* arrow_array, const std::vector& list_value); static Status AppendList(ArrowArray* arrow_array, @@ -95,8 +98,7 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { static Status AppendMap(ArrowArray* arrow_array, const std::map>& map_value); - virtual Result> GetWrappedSequenceNumber( - const ManifestEntry& entry); + virtual Result> GetSequenceNumber(const ManifestEntry& entry); virtual Result> GetWrappedReferenceDataFile( const std::shared_ptr& file); virtual Result> GetWrappedFirstRowId( @@ -108,7 +110,6 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { protected: std::shared_ptr partition_spec_; - std::shared_ptr partition_schema_; std::shared_ptr manifest_schema_; std::unordered_map metadata_; }; @@ -122,16 +123,20 @@ class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { virtual Status Append(const ManifestFile& file) = 0; - std::shared_ptr schema() const override { return manifest_list_schema_; } + const std::shared_ptr& schema() const override { return manifest_list_schema_; } protected: + /// \brief Init version-specific schema for each version. + /// + /// \param fields_ids each version of manifest schema has schema, we will init this + /// schema based on the fields_ids. Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestFile& file); static Status AppendPartitions(ArrowArray* arrow_array, const std::shared_ptr& partition_type, const std::vector& partitions); - virtual Result GetWrappedSequenceNumber(const ManifestFile& file); + virtual Result GetSequenceNumber(const ManifestFile& file); virtual Result GetWrappedMinSequenceNumber(const ManifestFile& file); virtual Result> GetWrappedFirstRowId(const ManifestFile& file); diff --git a/src/iceberg/manifest_entry.cc b/src/iceberg/manifest_entry.cc index d387aad68..5a963b5d6 100644 --- a/src/iceberg/manifest_entry.cc +++ b/src/iceberg/manifest_entry.cc @@ -44,7 +44,8 @@ std::shared_ptr DataFile::Type(std::shared_ptr partition kContent, kFilePath, kFileFormat, - SchemaField::MakeRequired(102, kPartitionField, std::move(partition_type)), + SchemaField::MakeRequired(kPartitionFieldId, kPartitionField, + std::move(partition_type)), kRecordCount, kFileSize, kColumnSizes, @@ -70,9 +71,10 @@ std::shared_ptr ManifestEntry::TypeFromPartitionType( std::shared_ptr ManifestEntry::TypeFromDataFileType( std::shared_ptr datafile_type) { - return std::make_shared(std::vector{ - kStatus, kSnapshotId, kSequenceNumber, kFileSequenceNumber, - SchemaField::MakeRequired(2, kDataFileField, std::move(datafile_type))}); + return std::make_shared( + std::vector{kStatus, kSnapshotId, kSequenceNumber, kFileSequenceNumber, + SchemaField::MakeRequired(kDataFileFieldId, kDataFileField, + std::move(datafile_type))}); } } // namespace iceberg diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index 0aa697aa0..ebfcf300b 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -183,6 +183,7 @@ struct ICEBERG_EXPORT DataFile { 100, "file_path", iceberg::string(), "Location URI with FS scheme"); inline static const SchemaField kFileFormat = SchemaField::MakeRequired( 101, "file_format", iceberg::string(), "File format name: avro, orc, or parquet"); + inline static const int32_t kPartitionFieldId = 102; inline static const std::string kPartitionField = "partition"; inline static const SchemaField kRecordCount = SchemaField::MakeRequired( 103, "record_count", iceberg::int64(), "Number of records in the file"); @@ -300,8 +301,8 @@ struct ICEBERG_EXPORT ManifestEntry { SchemaField::MakeOptional(3, "sequence_number", iceberg::int64()); inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(4, "file_sequence_number", iceberg::int64()); + inline static const int32_t kDataFileFieldId = 2; inline static const std::string kDataFileField = "data_file"; - bool operator==(const ManifestEntry& other) const; static std::shared_ptr TypeFromPartitionType( diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 82eeb3476..7a2ce0804 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -21,6 +21,7 @@ #include +#include "iceberg/arrow/nanoarrow_error_transform_internal.h" #include "iceberg/arrow_c_data_guard_internal.h" #include "iceberg/file_format.h" #include "iceberg/manifest_entry.h" @@ -203,12 +204,12 @@ Result> ParseManifestList(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); std::vector manifest_files; manifest_files.resize(array_in->length); @@ -466,12 +467,12 @@ Result> ParseManifestEntry(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); std::vector manifest_entries; manifest_entries.resize(array_in->length); diff --git a/src/iceberg/manifest_writer.cc b/src/iceberg/manifest_writer.cc index 02fbd3269..278d9907f 100644 --- a/src/iceberg/manifest_writer.cc +++ b/src/iceberg/manifest_writer.cc @@ -32,7 +32,7 @@ namespace iceberg { Status ManifestWriter::Add(const ManifestEntry& entry) { if (adapter_->size() >= kBatchSize) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); ICEBERG_RETURN_UNEXPECTED(adapter_->StartAppending()); } return adapter_->Append(entry); @@ -48,7 +48,7 @@ Status ManifestWriter::AddAll(const std::vector& entries) { Status ManifestWriter::Close() { if (adapter_->size() > 0) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); } return writer_->Close(); } @@ -66,10 +66,9 @@ Result> OpenFileWriter(std::string_view location, Result> ManifestWriter::MakeV1Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema, - std::shared_ptr partition_spec) { - auto adapter = std::make_unique( - snapshot_id, std::move(partition_schema), std::move(partition_spec)); + std::shared_ptr file_io, std::shared_ptr partition_spec) { + auto adapter = + std::make_unique(snapshot_id, std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( @@ -80,10 +79,9 @@ Result> ManifestWriter::MakeV1Writer( Result> ManifestWriter::MakeV2Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema, - std::shared_ptr partition_spec) { - auto adapter = std::make_unique( - snapshot_id, std::move(partition_schema), std::move(partition_spec)); + std::shared_ptr file_io, std::shared_ptr partition_spec) { + auto adapter = + std::make_unique(snapshot_id, std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( @@ -95,10 +93,9 @@ Result> ManifestWriter::MakeV2Writer( Result> ManifestWriter::MakeV3Writer( std::optional snapshot_id, std::optional first_row_id, std::string_view manifest_location, std::shared_ptr file_io, - std::shared_ptr partition_schema, std::shared_ptr partition_spec) { - auto adapter = std::make_unique( - snapshot_id, first_row_id, std::move(partition_schema), std::move(partition_spec)); + auto adapter = std::make_unique(snapshot_id, first_row_id, + std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); ICEBERG_ASSIGN_OR_RAISE( @@ -110,7 +107,7 @@ Result> ManifestWriter::MakeV3Writer( Status ManifestListWriter::Add(const ManifestFile& file) { if (adapter_->size() >= kBatchSize) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); ICEBERG_RETURN_UNEXPECTED(adapter_->StartAppending()); } return adapter_->Append(file); @@ -126,7 +123,7 @@ Status ManifestListWriter::AddAll(const std::vector& files) { Status ManifestListWriter::Close() { if (adapter_->size() > 0) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); } return writer_->Close(); } diff --git a/src/iceberg/manifest_writer.h b/src/iceberg/manifest_writer.h index a9403b732..69e191219 100644 --- a/src/iceberg/manifest_writer.h +++ b/src/iceberg/manifest_writer.h @@ -58,38 +58,32 @@ class ICEBERG_EXPORT ManifestWriter { /// \param snapshot_id ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. - /// \param partition_schema Schema of the partition columns. /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV1Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema, - std::shared_ptr partition_spec); + std::shared_ptr file_io, std::shared_ptr partition_spec); /// \brief Creates a writer for a manifest file. /// \param snapshot_id ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. - /// \param partition_schema Schema of the partition columns. /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV2Writer( std::optional snapshot_id, std::string_view manifest_location, - std::shared_ptr file_io, std::shared_ptr partition_schema, - std::shared_ptr partition_spec); + std::shared_ptr file_io, std::shared_ptr partition_spec); /// \brief Creates a writer for a manifest file. /// \param snapshot_id ID of the snapshot. /// \param first_row_id First row ID of the snapshot. /// \param manifest_location Path to the manifest file. /// \param file_io File IO implementation to use. - /// \param partition_schema Schema of the partition columns. /// \param partition_spec Partition spec for the manifest. /// \return A Result containing the writer or an error. static Result> MakeV3Writer( std::optional snapshot_id, std::optional first_row_id, std::string_view manifest_location, std::shared_ptr file_io, - std::shared_ptr partition_schema, std::shared_ptr partition_spec); private: diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc index 2d2cd5406..2f34b142c 100644 --- a/src/iceberg/parquet/parquet_writer.cc +++ b/src/iceberg/parquet/parquet_writer.cc @@ -76,7 +76,7 @@ class ParquetWriter::Impl { return {}; } - Status Write(ArrowArray array) { + Status Write(ArrowArray& array) { ICEBERG_ARROW_ASSIGN_OR_RETURN(auto batch, ::arrow::ImportRecordBatch(&array, arrow_schema_)); @@ -132,7 +132,7 @@ Status ParquetWriter::Open(const WriterOptions& options) { return impl_->Open(options); } -Status ParquetWriter::Write(ArrowArray array) { return impl_->Write(array); } +Status ParquetWriter::Write(ArrowArray& array) { return impl_->Write(array); } Status ParquetWriter::Close() { return impl_->Close(); } diff --git a/src/iceberg/parquet/parquet_writer.h b/src/iceberg/parquet/parquet_writer.h index 5371f3810..630f60b68 100644 --- a/src/iceberg/parquet/parquet_writer.h +++ b/src/iceberg/parquet/parquet_writer.h @@ -35,7 +35,7 @@ class ICEBERG_BUNDLE_EXPORT ParquetWriter : public Writer { Status Close() final; - Status Write(ArrowArray array) final; + Status Write(ArrowArray& array) final; std::optional metrics() final; diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 2b41950b4..8c084e00b 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -24,7 +24,10 @@ #include #include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/transform.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep +#include "iceberg/util/macros.h" namespace iceberg { @@ -57,6 +60,49 @@ int32_t PartitionSpec::spec_id() const { return spec_id_; } std::span PartitionSpec::fields() const { return fields_; } +Result> PartitionSpec::partition_schema() { + if (fields_.empty()) { + return nullptr; + } + { + std::lock_guard lock(mutex_); + if (partition_schema_ != nullptr) { + return partition_schema_; + } + } + + std::vector partition_fields; + for (const auto& partition_field : fields_) { + // Get the source field from the original schema by source_id + ICEBERG_ASSIGN_OR_RAISE(auto source_field, + schema_->FindFieldById(partition_field.source_id())); + if (!source_field.has_value()) { + return InvalidSchema("Cannot find source field for partition field:{}", + partition_field.field_id()); + } + auto source_field_type = source_field.value().get().type(); + // Bind the transform to the source field type to get the result type + ICEBERG_ASSIGN_OR_RAISE(auto transform_function, + partition_field.transform()->Bind(source_field_type)); + + auto result_type = transform_function->ResultType(); + + // Create the partition field with the transform result type + // Partition fields are always optional (can be null) + partition_fields.emplace_back(partition_field.field_id(), + std::string(partition_field.name()), + std::move(result_type), + true // optional + ); + } + + std::lock_guard lock(mutex_); + if (partition_schema_ == nullptr) { + partition_schema_ = std::make_shared(std::move(partition_fields)); + } + return partition_schema_; +} + std::string PartitionSpec::ToString() const { std::string repr = std::format("partition_spec[spec_id<{}>,\n", spec_id_); for (const auto& field : fields_) { diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index f105a27eb..bfda12ca3 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -23,6 +23,7 @@ /// Partition specs for Iceberg tables. #include +#include #include #include #include @@ -30,6 +31,7 @@ #include "iceberg/iceberg_export.h" #include "iceberg/partition_field.h" +#include "iceberg/result.h" #include "iceberg/util/formattable.h" namespace iceberg { @@ -67,6 +69,8 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Get a view of the partition fields. std::span fields() const; + Result> partition_schema(); + std::string ToString() const override; int32_t last_assigned_field_id() const { return last_assigned_field_id_; } @@ -83,6 +87,8 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { const int32_t spec_id_; std::vector fields_; int32_t last_assigned_field_id_; + std::mutex mutex_; + std::shared_ptr partition_schema_; }; } // namespace iceberg diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc index c9865c0cc..7b1c71fbe 100644 --- a/src/iceberg/test/manifest_reader_writer_test.cc +++ b/src/iceberg/test/manifest_reader_writer_test.cc @@ -152,12 +152,10 @@ class ManifestReaderV1Test : public ManifestReaderTestBase { } void TestWriteManifest(const std::string& manifest_list_path, - std::shared_ptr partition_schema, std::shared_ptr partition_spec, const std::vector& manifest_entries) { std::cout << "Writing manifest list to " << manifest_list_path << std::endl; auto result = ManifestWriter::MakeV1Writer(1, manifest_list_path, file_io_, - std::move(partition_schema), std::move(partition_spec)); ASSERT_TRUE(result.has_value()) << result.error().message; auto writer = std::move(result.value()); @@ -183,13 +181,12 @@ TEST_F(ManifestReaderV1Test, WritePartitionedTest) { std::make_shared(std::vector({partition_field})); auto identity_transform = Transform::Identity(); std::vector fields{ - PartitionField(1, 1000, "order_ts_hour", identity_transform)}; + PartitionField(1000, 1000, "order_ts_hour", identity_transform)}; auto partition_spec = std::make_shared(partition_schema, 1, fields); auto expected_entries = PreparePartitionedTestData(); auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(write_manifest_path, partition_schema, partition_spec, - expected_entries); + TestWriteManifest(write_manifest_path, partition_spec, expected_entries); } class ManifestReaderV2Test : public ManifestReaderTestBase { @@ -259,12 +256,10 @@ class ManifestReaderV2Test : public ManifestReaderTestBase { } void TestWriteManifest(int64_t snapshot_id, const std::string& manifest_list_path, - std::shared_ptr partition_schema, std::shared_ptr partition_spec, const std::vector& manifest_entries) { std::cout << "Writing manifest list to " << manifest_list_path << std::endl; auto result = ManifestWriter::MakeV2Writer(snapshot_id, manifest_list_path, file_io_, - std::move(partition_schema), std::move(partition_spec)); ASSERT_TRUE(result.has_value()) << result.error().message; auto writer = std::move(result.value()); @@ -297,15 +292,13 @@ TEST_F(ManifestReaderV2Test, MetadataInheritanceTest) { TEST_F(ManifestReaderV2Test, WriteNonPartitionedTest) { auto expected_entries = PrepareNonPartitionedTestData(); auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, nullptr, - expected_entries); + TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, expected_entries); } TEST_F(ManifestReaderV2Test, WriteInheritancePartitionedTest) { auto expected_entries = PrepareMetadataInheritanceTestData(); auto write_manifest_path = CreateNewTempFilePath(); - TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, nullptr, - expected_entries); + TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, expected_entries); } } // namespace iceberg diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index 5e5e3de86..af7da7bc4 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -38,13 +38,3 @@ #define ICEBERG_ASSIGN_OR_RAISE(lhs, rexpr) \ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) - -#define NANOARROW_RETURN_IF_FAILED(status) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error code: {}", status); \ - } - -#define NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error msg: {}", error.message); \ - } diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc index db2e287cc..f391e1535 100644 --- a/src/iceberg/v1_metadata.cc +++ b/src/iceberg/v1_metadata.cc @@ -22,12 +22,30 @@ #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" #include "iceberg/schema.h" +#include "iceberg/util/macros.h" namespace iceberg { Status ManifestEntryAdapterV1::Init() { - static std::unordered_set compatible_fields{ - 0, 1, 2, 100, 101, 102, 103, 104, 105, 108, 109, 110, 137, 125, 128, 131, 132, 140, + static std::unordered_set kManifestEntryFieldIds{ + ManifestEntry::kStatus.field_id(), + ManifestEntry::kSnapshotId.field_id(), + ManifestEntry::kDataFileFieldId, + DataFile::kFilePath.field_id(), + DataFile::kFileFormat.field_id(), + DataFile::kPartitionFieldId, + DataFile::kRecordCount.field_id(), + DataFile::kFileSize.field_id(), + 105, // kBlockSizeInBytes field id + DataFile::kColumnSizes.field_id(), + DataFile::kValueCounts.field_id(), + DataFile::kNullValueCounts.field_id(), + DataFile::kNanValueCounts.field_id(), + DataFile::kLowerBounds.field_id(), + DataFile::kUpperBounds.field_id(), + DataFile::kKeyMetadata.field_id(), + DataFile::kSplitOffsets.field_id(), + DataFile::kSortOrderId.field_id(), }; // TODO(xiao.dong) schema to json metadata_["schema"] = "{}"; @@ -37,20 +55,20 @@ Status ManifestEntryAdapterV1::Init() { metadata_["partition-spec-id"] = std::to_string(partition_spec_->spec_id()); } metadata_["format-version"] = "1"; - return InitSchema(compatible_fields); + return InitSchema(kManifestEntryFieldIds); } -Status ManifestEntryAdapterV1::Append(const iceberg::ManifestEntry& entry) { +Status ManifestEntryAdapterV1::Append(const ManifestEntry& entry) { return AppendInternal(entry); } -std::shared_ptr ManifestEntryAdapterV1::GetManifestEntryStructType() { +Result> ManifestEntryAdapterV1::GetManifestEntryStructType() { // 'block_size_in_bytes' (ID 105) is a deprecated field that is REQUIRED // in the v1 data_file schema for backward compatibility. // Deprecated. Always write a default in v1. Do not write in v2 or v3. static const SchemaField kBlockSizeInBytes = SchemaField::MakeRequired( - 105, "block_size_in_bytes", iceberg::int64(), "Block size in bytes"); - std::shared_ptr partition_type = partition_schema_; + 105, "block_size_in_bytes", int64(), "Block size in bytes"); + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->partition_schema()); if (!partition_type) { partition_type = PartitionSpec::Unpartitioned()->schema(); } @@ -70,18 +88,29 @@ std::shared_ptr ManifestEntryAdapterV1::GetManifestEntryStructType() } Status ManifestFileAdapterV1::Init() { - static std::unordered_set compatible_fields{ - 500, 501, 502, 503, 504, 505, 506, 512, 513, 514, 507, 519, + static std::unordered_set kManifestFileFieldIds{ + ManifestFile::kManifestPath.field_id(), + ManifestFile::kManifestLength.field_id(), + ManifestFile::kPartitionSpecId.field_id(), + ManifestFile::kAddedSnapshotId.field_id(), + ManifestFile::kAddedFilesCount.field_id(), + ManifestFile::kExistingFilesCount.field_id(), + ManifestFile::kDeletedFilesCount.field_id(), + ManifestFile::kAddedRowsCount.field_id(), + ManifestFile::kExistingRowsCount.field_id(), + ManifestFile::kDeletedRowsCount.field_id(), + ManifestFile::kPartitions.field_id(), + ManifestFile::kKeyMetadata.field_id(), }; metadata_["snapshot-id"] = std::to_string(snapshot_id_); metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() ? std::to_string(parent_snapshot_id_.value()) : "null"; metadata_["format-version"] = "1"; - return InitSchema(compatible_fields); + return InitSchema(kManifestFileFieldIds); } -Status ManifestFileAdapterV1::Append(const iceberg::ManifestFile& file) { +Status ManifestFileAdapterV1::Append(const ManifestFile& file) { if (file.content != ManifestFile::Content::kData) { return InvalidManifestList("Cannot store delete manifests in a v1 table"); } diff --git a/src/iceberg/v1_metadata.h b/src/iceberg/v1_metadata.h index ea31993df..7b2dd1d05 100644 --- a/src/iceberg/v1_metadata.h +++ b/src/iceberg/v1_metadata.h @@ -28,15 +28,13 @@ namespace iceberg { class ManifestEntryAdapterV1 : public ManifestEntryAdapter { public: ManifestEntryAdapterV1(std::optional snapshot_id, - std::shared_ptr partition_schema, std::shared_ptr partition_spec) - : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), - snapshot_id_(snapshot_id) {} + : ManifestEntryAdapter(std::move(partition_spec)), snapshot_id_(snapshot_id) {} Status Init() override; Status Append(const ManifestEntry& entry) override; protected: - std::shared_ptr GetManifestEntryStructType() override; + Result> GetManifestEntryStructType() override; private: std::optional snapshot_id_; diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc index 54b74f1ee..0671894b9 100644 --- a/src/iceberg/v2_metadata.cc +++ b/src/iceberg/v2_metadata.cc @@ -26,9 +26,29 @@ namespace iceberg { Status ManifestEntryAdapterV2::Init() { - static std::unordered_set compatible_fields{ - 0, 1, 2, 3, 4, 134, 100, 101, 102, 103, 104, - 108, 109, 110, 137, 125, 128, 131, 132, 135, 140, 143, + static std::unordered_set kManifestEntryFieldIds{ + ManifestEntry::kStatus.field_id(), + ManifestEntry::kSnapshotId.field_id(), + ManifestEntry::kDataFileFieldId, + ManifestEntry::kSequenceNumber.field_id(), + ManifestEntry::kFileSequenceNumber.field_id(), + DataFile::kContent.field_id(), + DataFile::kFilePath.field_id(), + DataFile::kFileFormat.field_id(), + DataFile::kPartitionFieldId, + DataFile::kRecordCount.field_id(), + DataFile::kFileSize.field_id(), + DataFile::kColumnSizes.field_id(), + DataFile::kValueCounts.field_id(), + DataFile::kNullValueCounts.field_id(), + DataFile::kNanValueCounts.field_id(), + DataFile::kLowerBounds.field_id(), + DataFile::kUpperBounds.field_id(), + DataFile::kKeyMetadata.field_id(), + DataFile::kSplitOffsets.field_id(), + DataFile::kEqualityIds.field_id(), + DataFile::kSortOrderId.field_id(), + DataFile::kReferencedDataFile.field_id(), }; // TODO(xiao.dong) schema to json metadata_["schema"] = "{}"; @@ -39,15 +59,15 @@ Status ManifestEntryAdapterV2::Init() { } metadata_["format-version"] = "2"; metadata_["content"] = "data"; - return InitSchema(compatible_fields); + return InitSchema(kManifestEntryFieldIds); } -Status ManifestEntryAdapterV2::Append(const iceberg::ManifestEntry& entry) { +Status ManifestEntryAdapterV2::Append(const ManifestEntry& entry) { return AppendInternal(entry); } -Result> ManifestEntryAdapterV2::GetWrappedSequenceNumber( - const iceberg::ManifestEntry& entry) { +Result> ManifestEntryAdapterV2::GetSequenceNumber( + const ManifestEntry& entry) { if (!entry.sequence_number.has_value()) { // if the entry's data sequence number is null, // then it will inherit the sequence number of the current commit. @@ -79,8 +99,22 @@ Result> ManifestEntryAdapterV2::GetWrappedReferenceDa } Status ManifestFileAdapterV2::Init() { - static std::unordered_set compatible_fields{ - 500, 501, 502, 517, 515, 516, 503, 504, 505, 506, 512, 513, 514, 507, 519, + static std::unordered_set kManifestFileFieldIds{ + ManifestFile::kManifestPath.field_id(), + ManifestFile::kManifestLength.field_id(), + ManifestFile::kPartitionSpecId.field_id(), + ManifestFile::kContent.field_id(), + ManifestFile::kSequenceNumber.field_id(), + ManifestFile::kMinSequenceNumber.field_id(), + ManifestFile::kAddedSnapshotId.field_id(), + ManifestFile::kAddedFilesCount.field_id(), + ManifestFile::kExistingFilesCount.field_id(), + ManifestFile::kDeletedFilesCount.field_id(), + ManifestFile::kAddedRowsCount.field_id(), + ManifestFile::kExistingRowsCount.field_id(), + ManifestFile::kDeletedRowsCount.field_id(), + ManifestFile::kPartitions.field_id(), + ManifestFile::kKeyMetadata.field_id(), }; metadata_["snapshot-id"] = std::to_string(snapshot_id_); metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() @@ -88,15 +122,14 @@ Status ManifestFileAdapterV2::Init() { : "null"; metadata_["sequence-number"] = std::to_string(sequence_number_); metadata_["format-version"] = "2"; - return InitSchema(compatible_fields); + return InitSchema(kManifestFileFieldIds); } -Status ManifestFileAdapterV2::Append(const iceberg::ManifestFile& file) { +Status ManifestFileAdapterV2::Append(const ManifestFile& file) { return AppendInternal(file); } -Result ManifestFileAdapterV2::GetWrappedSequenceNumber( - const iceberg::ManifestFile& file) { +Result ManifestFileAdapterV2::GetSequenceNumber(const ManifestFile& file) { if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -109,7 +142,7 @@ Result ManifestFileAdapterV2::GetWrappedSequenceNumber( } Result ManifestFileAdapterV2::GetWrappedMinSequenceNumber( - const iceberg::ManifestFile& file) { + const ManifestFile& file) { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( diff --git a/src/iceberg/v2_metadata.h b/src/iceberg/v2_metadata.h index 788295427..966eb23a4 100644 --- a/src/iceberg/v2_metadata.h +++ b/src/iceberg/v2_metadata.h @@ -29,16 +29,13 @@ namespace iceberg { class ManifestEntryAdapterV2 : public ManifestEntryAdapter { public: ManifestEntryAdapterV2(std::optional snapshot_id, - std::shared_ptr partition_schema, std::shared_ptr partition_spec) - : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), - snapshot_id_(snapshot_id) {} + : ManifestEntryAdapter(std::move(partition_spec)), snapshot_id_(snapshot_id) {} Status Init() override; Status Append(const ManifestEntry& entry) override; protected: - Result> GetWrappedSequenceNumber( - const ManifestEntry& entry) override; + Result> GetSequenceNumber(const ManifestEntry& entry) override; Result> GetWrappedReferenceDataFile( const std::shared_ptr& file) override; @@ -58,7 +55,7 @@ class ManifestFileAdapterV2 : public ManifestFileAdapter { Status Append(const ManifestFile& file) override; protected: - Result GetWrappedSequenceNumber(const ManifestFile& file) override; + Result GetSequenceNumber(const ManifestFile& file) override; Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; private: diff --git a/src/iceberg/v3_metadata.cc b/src/iceberg/v3_metadata.cc index 534abe48d..1b5f1e15b 100644 --- a/src/iceberg/v3_metadata.cc +++ b/src/iceberg/v3_metadata.cc @@ -27,9 +27,32 @@ namespace iceberg { Status ManifestEntryAdapterV3::Init() { - static std::unordered_set compatible_fields{ - 0, 1, 2, 3, 4, 134, 100, 101, 102, 103, 104, 108, 109, - 110, 137, 125, 128, 131, 132, 135, 140, 142, 143, 144, 145, + static std::unordered_set kManifestEntryFieldIds{ + ManifestEntry::kStatus.field_id(), + ManifestEntry::kSnapshotId.field_id(), + ManifestEntry::kDataFileFieldId, + ManifestEntry::kSequenceNumber.field_id(), + ManifestEntry::kFileSequenceNumber.field_id(), + DataFile::kContent.field_id(), + DataFile::kFilePath.field_id(), + DataFile::kFileFormat.field_id(), + DataFile::kPartitionFieldId, + DataFile::kRecordCount.field_id(), + DataFile::kFileSize.field_id(), + DataFile::kColumnSizes.field_id(), + DataFile::kValueCounts.field_id(), + DataFile::kNullValueCounts.field_id(), + DataFile::kNanValueCounts.field_id(), + DataFile::kLowerBounds.field_id(), + DataFile::kUpperBounds.field_id(), + DataFile::kKeyMetadata.field_id(), + DataFile::kSplitOffsets.field_id(), + DataFile::kEqualityIds.field_id(), + DataFile::kSortOrderId.field_id(), + DataFile::kFirstRowId.field_id(), + DataFile::kReferencedDataFile.field_id(), + DataFile::kContentOffset.field_id(), + DataFile::kContentSize.field_id(), }; // TODO(xiao.dong) schema to json metadata_["schema"] = "{}"; @@ -40,15 +63,15 @@ Status ManifestEntryAdapterV3::Init() { } metadata_["format-version"] = "3"; metadata_["content"] = "data"; - return InitSchema(compatible_fields); + return InitSchema(kManifestEntryFieldIds); } -Status ManifestEntryAdapterV3::Append(const iceberg::ManifestEntry& entry) { +Status ManifestEntryAdapterV3::Append(const ManifestEntry& entry) { return AppendInternal(entry); } -Result> ManifestEntryAdapterV3::GetWrappedSequenceNumber( - const iceberg::ManifestEntry& entry) { +Result> ManifestEntryAdapterV3::GetSequenceNumber( + const ManifestEntry& entry) { if (!entry.sequence_number.has_value()) { // if the entry's data sequence number is null, // then it will inherit the sequence number of the current commit. @@ -104,8 +127,23 @@ Result> ManifestEntryAdapterV3::GetWrappedContentSizeInBy } Status ManifestFileAdapterV3::Init() { - static std::unordered_set compatible_fields{ - 500, 501, 502, 517, 515, 516, 503, 504, 505, 506, 512, 513, 514, 507, 519, 520, + static std::unordered_set kManifestFileFieldIds{ + ManifestFile::kManifestPath.field_id(), + ManifestFile::kManifestLength.field_id(), + ManifestFile::kPartitionSpecId.field_id(), + ManifestFile::kContent.field_id(), + ManifestFile::kSequenceNumber.field_id(), + ManifestFile::kMinSequenceNumber.field_id(), + ManifestFile::kAddedSnapshotId.field_id(), + ManifestFile::kAddedFilesCount.field_id(), + ManifestFile::kExistingFilesCount.field_id(), + ManifestFile::kDeletedFilesCount.field_id(), + ManifestFile::kAddedRowsCount.field_id(), + ManifestFile::kExistingRowsCount.field_id(), + ManifestFile::kDeletedRowsCount.field_id(), + ManifestFile::kPartitions.field_id(), + ManifestFile::kKeyMetadata.field_id(), + ManifestFile::kFirstRowId.field_id(), }; metadata_["snapshot-id"] = std::to_string(snapshot_id_); metadata_["parent-snapshot-id"] = parent_snapshot_id_.has_value() @@ -115,10 +153,10 @@ Status ManifestFileAdapterV3::Init() { metadata_["first-row-id"] = next_row_id_.has_value() ? std::to_string(next_row_id_.value()) : "null"; metadata_["format-version"] = "3"; - return InitSchema(compatible_fields); + return InitSchema(kManifestFileFieldIds); } -Status ManifestFileAdapterV3::Append(const iceberg::ManifestFile& file) { +Status ManifestFileAdapterV3::Append(const ManifestFile& file) { auto status = AppendInternal(file); ICEBERG_RETURN_UNEXPECTED(status); if (WrappedFirstRowId(file) && next_row_id_.has_value()) { @@ -128,8 +166,7 @@ Status ManifestFileAdapterV3::Append(const iceberg::ManifestFile& file) { return status; } -Result ManifestFileAdapterV3::GetWrappedSequenceNumber( - const iceberg::ManifestFile& file) { +Result ManifestFileAdapterV3::GetSequenceNumber(const ManifestFile& file) { if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -142,7 +179,7 @@ Result ManifestFileAdapterV3::GetWrappedSequenceNumber( } Result ManifestFileAdapterV3::GetWrappedMinSequenceNumber( - const iceberg::ManifestFile& file) { + const ManifestFile& file) { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -155,7 +192,7 @@ Result ManifestFileAdapterV3::GetWrappedMinSequenceNumber( } Result> ManifestFileAdapterV3::GetWrappedFirstRowId( - const iceberg::ManifestFile& file) { + const ManifestFile& file) { if (WrappedFirstRowId(file)) { return next_row_id_; } else if (file.content != ManifestFile::Content::kData) { @@ -169,7 +206,7 @@ Result> ManifestFileAdapterV3::GetWrappedFirstRowId( } } -bool ManifestFileAdapterV3::WrappedFirstRowId(const iceberg::ManifestFile& file) { +bool ManifestFileAdapterV3::WrappedFirstRowId(const ManifestFile& file) { return file.content == ManifestFile::Content::kData && !file.first_row_id.has_value(); } diff --git a/src/iceberg/v3_metadata.h b/src/iceberg/v3_metadata.h index d27116416..03f8f95e0 100644 --- a/src/iceberg/v3_metadata.h +++ b/src/iceberg/v3_metadata.h @@ -30,17 +30,15 @@ class ManifestEntryAdapterV3 : public ManifestEntryAdapter { public: ManifestEntryAdapterV3(std::optional snapshot_id, std::optional first_row_id, - std::shared_ptr partition_schema, std::shared_ptr partition_spec) - : ManifestEntryAdapter(std::move(partition_schema), std::move(partition_spec)), + : ManifestEntryAdapter(std::move(partition_spec)), snapshot_id_(snapshot_id), first_row_id_(first_row_id) {} Status Init() override; Status Append(const ManifestEntry& entry) override; protected: - Result> GetWrappedSequenceNumber( - const ManifestEntry& entry) override; + Result> GetSequenceNumber(const ManifestEntry& entry) override; Result> GetWrappedReferenceDataFile( const std::shared_ptr& file) override; Result> GetWrappedFirstRowId( @@ -68,7 +66,7 @@ class ManifestFileAdapterV3 : public ManifestFileAdapter { Status Append(const ManifestFile& file) override; protected: - Result GetWrappedSequenceNumber(const ManifestFile& file) override; + Result GetSequenceNumber(const ManifestFile& file) override; Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; Result> GetWrappedFirstRowId(const ManifestFile& file) override; From 5e6bcef0f3a4c45085cf02ce3d2f12dea1d95cc7 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Fri, 12 Sep 2025 16:22:29 +0800 Subject: [PATCH 09/17] fix lint --- src/iceberg/manifest_entry.h | 5 +++-- src/iceberg/partition_spec.cc | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/iceberg/manifest_entry.h b/src/iceberg/manifest_entry.h index ebfcf300b..2b9987b06 100644 --- a/src/iceberg/manifest_entry.h +++ b/src/iceberg/manifest_entry.h @@ -297,12 +297,13 @@ struct ICEBERG_EXPORT ManifestEntry { SchemaField::MakeRequired(0, "status", iceberg::int32()); inline static const SchemaField kSnapshotId = SchemaField::MakeOptional(1, "snapshot_id", iceberg::int64()); + inline static const int32_t kDataFileFieldId = 2; + inline static const std::string kDataFileField = "data_file"; inline static const SchemaField kSequenceNumber = SchemaField::MakeOptional(3, "sequence_number", iceberg::int64()); inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(4, "file_sequence_number", iceberg::int64()); - inline static const int32_t kDataFileFieldId = 2; - inline static const std::string kDataFileField = "data_file"; + bool operator==(const ManifestEntry& other) const; static std::shared_ptr TypeFromPartitionType( diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 8c084e00b..06726c7a7 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -65,7 +65,7 @@ Result> PartitionSpec::partition_schema() { return nullptr; } { - std::lock_guard lock(mutex_); + std::scoped_lock lock(mutex_); if (partition_schema_ != nullptr) { return partition_schema_; } @@ -96,7 +96,7 @@ Result> PartitionSpec::partition_schema() { ); } - std::lock_guard lock(mutex_); + std::scoped_lock lock(mutex_); if (partition_schema_ == nullptr) { partition_schema_ = std::make_shared(std::move(partition_fields)); } From 71f832a7766ffbddc28322da0d981164ba8be347 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Wed, 17 Sep 2025 10:32:11 +0800 Subject: [PATCH 10/17] fix cpplint --- src/iceberg/manifest_writer.cc | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/iceberg/manifest_writer.cc b/src/iceberg/manifest_writer.cc index 278d9907f..fed159d18 100644 --- a/src/iceberg/manifest_writer.cc +++ b/src/iceberg/manifest_writer.cc @@ -71,9 +71,11 @@ Result> ManifestWriter::MakeV1Writer( std::make_unique(snapshot_id, std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -84,9 +86,11 @@ Result> ManifestWriter::MakeV2Writer( std::make_unique(snapshot_id, std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -98,9 +102,11 @@ Result> ManifestWriter::MakeV3Writer( std::move(partition_spec)); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -134,9 +140,11 @@ Result> ManifestListWriter::MakeV1Writer( auto adapter = std::make_unique(snapshot_id, parent_snapshot_id); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_list_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -148,9 +156,11 @@ Result> ManifestListWriter::MakeV2Writer( sequence_number); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_list_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } @@ -163,9 +173,11 @@ Result> ManifestListWriter::MakeV3Writer( sequence_number, first_row_id); ICEBERG_RETURN_UNEXPECTED(adapter->Init()); ICEBERG_RETURN_UNEXPECTED(adapter->StartAppending()); + + auto schema = adapter->schema(); ICEBERG_ASSIGN_OR_RAISE( auto writer, - OpenFileWriter(manifest_list_location, adapter->schema(), std::move(file_io))); + OpenFileWriter(manifest_list_location, std::move(schema), std::move(file_io))); return std::make_unique(std::move(writer), std::move(adapter)); } From 968705023268b3d6c0e4013f8be39f531ad4eb74 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Wed, 17 Sep 2025 11:59:21 +0800 Subject: [PATCH 11/17] remove useless virtual define --- src/iceberg/manifest_adapter.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 4f3a758f9..5307488ae 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -46,7 +46,6 @@ class ICEBERG_EXPORT ManifestAdapter { Status StartAppending(); Result> FinishAppending(); int64_t size() const { return size_; } - virtual const std::shared_ptr& schema() const = 0; protected: static Status AppendField(ArrowArray* arrowArray, int64_t value); @@ -72,7 +71,7 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { virtual Status Append(const ManifestEntry& entry) = 0; - const std::shared_ptr& schema() const override { return manifest_schema_; } + const std::shared_ptr& schema() const { return manifest_schema_; } protected: virtual Result> GetManifestEntryStructType(); @@ -123,7 +122,7 @@ class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { virtual Status Append(const ManifestFile& file) = 0; - const std::shared_ptr& schema() const override { return manifest_list_schema_; } + const std::shared_ptr& schema() const { return manifest_list_schema_; } protected: /// \brief Init version-specific schema for each version. From bba92db355a7ee0a75e682260f8333c474685d0f Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Thu, 9 Oct 2025 15:00:58 +0800 Subject: [PATCH 12/17] merge conflict --- src/iceberg/util/macros.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/iceberg/util/macros.h b/src/iceberg/util/macros.h index af7da7bc4..278035d3f 100644 --- a/src/iceberg/util/macros.h +++ b/src/iceberg/util/macros.h @@ -38,3 +38,5 @@ #define ICEBERG_ASSIGN_OR_RAISE(lhs, rexpr) \ ICEBERG_ASSIGN_OR_RAISE_IMPL(ICEBERG_ASSIGN_OR_RAISE_NAME(result_, __COUNTER__), lhs, \ rexpr) + +#define ICEBERG_DCHECK(expr, message) assert((expr) && (message)) From d06f1403ad751f47a50474fa218c66bb5e5901ef Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Thu, 9 Oct 2025 16:23:22 +0800 Subject: [PATCH 13/17] fix comment and conflict --- .../nanoarrow_error_transform_internal.h | 6 +-- src/iceberg/avro/avro_writer.cc | 6 +-- src/iceberg/avro/avro_writer.h | 2 +- src/iceberg/file_writer.h | 2 +- src/iceberg/manifest_adapter.cc | 45 ++++++++++--------- src/iceberg/manifest_adapter.h | 4 +- src/iceberg/manifest_writer.cc | 8 ++-- src/iceberg/parquet/parquet_writer.cc | 6 +-- src/iceberg/parquet/parquet_writer.h | 2 +- src/iceberg/partition_spec.cc | 5 +-- src/iceberg/partition_spec.h | 2 +- src/iceberg/test/avro_test.cc | 2 +- .../test/manifest_reader_writer_test.cc | 6 +++ src/iceberg/test/parquet_test.cc | 2 +- src/iceberg/v1_metadata.cc | 2 +- src/iceberg/v2_metadata.cc | 2 +- 16 files changed, 55 insertions(+), 47 deletions(-) diff --git a/src/iceberg/arrow/nanoarrow_error_transform_internal.h b/src/iceberg/arrow/nanoarrow_error_transform_internal.h index 2b32fea0c..1bc0bf65a 100644 --- a/src/iceberg/arrow/nanoarrow_error_transform_internal.h +++ b/src/iceberg/arrow/nanoarrow_error_transform_internal.h @@ -19,7 +19,7 @@ #pragma once -#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error msg: {}", error.message); \ +#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("nanoarrow error: {}", error.message); \ } diff --git a/src/iceberg/avro/avro_writer.cc b/src/iceberg/avro/avro_writer.cc index bfa82c762..1db0614e7 100644 --- a/src/iceberg/avro/avro_writer.cc +++ b/src/iceberg/avro/avro_writer.cc @@ -75,9 +75,9 @@ class AvroWriter::Impl { return {}; } - Status Write(ArrowArray& data) { + Status Write(ArrowArray* data) { ICEBERG_ARROW_ASSIGN_OR_RETURN(auto result, - ::arrow::ImportArray(&data, &arrow_schema_)); + ::arrow::ImportArray(data, &arrow_schema_)); for (int64_t i = 0; i < result->length(); i++) { ICEBERG_RETURN_UNEXPECTED(ExtractDatumFromArray(*result, i, datum_.get())); @@ -119,7 +119,7 @@ class AvroWriter::Impl { AvroWriter::~AvroWriter() = default; -Status AvroWriter::Write(ArrowArray& data) { return impl_->Write(data); } +Status AvroWriter::Write(ArrowArray* data) { return impl_->Write(data); } Status AvroWriter::Open(const WriterOptions& options) { impl_ = std::make_unique(); diff --git a/src/iceberg/avro/avro_writer.h b/src/iceberg/avro/avro_writer.h index e539a7588..0a5dd0b43 100644 --- a/src/iceberg/avro/avro_writer.h +++ b/src/iceberg/avro/avro_writer.h @@ -35,7 +35,7 @@ class ICEBERG_BUNDLE_EXPORT AvroWriter : public Writer { Status Close() final; - Status Write(ArrowArray& data) final; + Status Write(ArrowArray* data) final; std::optional metrics() final; diff --git a/src/iceberg/file_writer.h b/src/iceberg/file_writer.h index 874c983fa..9ecfb8b5d 100644 --- a/src/iceberg/file_writer.h +++ b/src/iceberg/file_writer.h @@ -65,7 +65,7 @@ class ICEBERG_EXPORT Writer { /// \brief Write arrow data to the file. /// /// \return Status of write results. - virtual Status Write(ArrowArray& data) = 0; + virtual Status Write(ArrowArray* data) = 0; /// \brief Get the file statistics. /// Only valid after the file is closed. diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index 71296f8c0..ad8dfc237 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -19,6 +19,8 @@ #include "iceberg/manifest_adapter.h" +#include + #include "iceberg/arrow/nanoarrow_error_transform_internal.h" #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" @@ -26,11 +28,10 @@ #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" -#include "nanoarrow/nanoarrow.h" -#define NANOARROW_RETURN_IF_FAILED(status) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("Nanoarrow error code: {}", status); \ +#define NANOARROW_RETURN_IF_FAILED(status) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return InvalidArrowData("nanoarrow error: {}", status); \ } namespace { @@ -43,20 +44,20 @@ Status ManifestAdapter::StartAppending() { if (size_ > 0) { return InvalidArgument("Adapter buffer not empty, cannot start appending."); } - array_ = std::make_shared(); + array_ = {}; size_ = 0; ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK( - ArrowArrayInitFromSchema(array_.get(), &schema_, &error), error); - NANOARROW_RETURN_IF_FAILED(ArrowArrayStartAppending(array_.get())); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayInitFromSchema(&array_, &schema_, &error), + error); + NANOARROW_RETURN_IF_FAILED(ArrowArrayStartAppending(&array_)); return {}; } -Result> ManifestAdapter::FinishAppending() { +Result ManifestAdapter::FinishAppending() { ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK( - ArrowArrayFinishBuildingDefault(array_.get(), &error), error); - return array_; + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishBuildingDefault(&array_, &error), + error); + return &array_; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, int64_t value) { @@ -90,8 +91,8 @@ Status ManifestAdapter::AppendField(ArrowArray* arrowArray, } ManifestEntryAdapter::~ManifestEntryAdapter() { - if (array_ != nullptr && array_->release != nullptr) { - ArrowArrayRelease(array_.get()); + if (array_.release != nullptr) { + ArrowArrayRelease(&array_); } if (schema_.release != nullptr) { ArrowSchemaRelease(&schema_); @@ -102,7 +103,7 @@ Result> ManifestEntryAdapter::GetManifestEntryStruct if (partition_spec_ == nullptr) { return ManifestEntry::TypeFromPartitionType(nullptr); } - ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->partition_schema()); + ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->GetPartitionSchema()); return ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); } @@ -215,6 +216,7 @@ Status ManifestEntryAdapter::AppendMap(ArrowArray* arrow_array, auto value_array = map_array->children[1]; ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(map_array)); } NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; @@ -231,6 +233,7 @@ Status ManifestEntryAdapter::AppendMap( auto value_array = map_array->children[1]; ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(map_array)); } NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); return {}; @@ -380,7 +383,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { const auto& fields = manifest_schema_->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - auto array = array_->children[i]; + auto array = array_.children[i]; switch (field.field_id()) { case 0: // status (required int32) @@ -427,7 +430,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array_.get())); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(&array_)); size_++; return {}; } @@ -464,8 +467,8 @@ Status ManifestEntryAdapter::InitSchema(const std::unordered_set& field } ManifestFileAdapter::~ManifestFileAdapter() { - if (array_ != nullptr && array_->release != nullptr) { - ArrowArrayRelease(array_.get()); + if (array_.release != nullptr) { + ArrowArrayRelease(&array_); } if (schema_.release != nullptr) { ArrowSchemaRelease(&schema_); @@ -553,7 +556,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { const auto& fields = manifest_list_schema_->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - auto array = array_->children[i]; + auto array = array_.children[i]; switch (field.field_id()) { case 500: // manifest_path ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.manifest_path)); @@ -662,7 +665,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { return InvalidManifestList("Unknown field id: {}", field.field_id()); } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array_.get())); + NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(&array_)); size_++; return {}; } diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 5307488ae..234623c4e 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -44,7 +44,7 @@ class ICEBERG_EXPORT ManifestAdapter { virtual Status Init() = 0; Status StartAppending(); - Result> FinishAppending(); + Result FinishAppending(); int64_t size() const { return size_; } protected: @@ -56,7 +56,7 @@ class ICEBERG_EXPORT ManifestAdapter { const std::span& value); protected: - std::shared_ptr array_; + ArrowArray array_; ArrowSchema schema_; // converted from manifest_schema_ or manifest_list_schema_ int64_t size_ = 0; }; diff --git a/src/iceberg/manifest_writer.cc b/src/iceberg/manifest_writer.cc index fed159d18..7bdfee7b6 100644 --- a/src/iceberg/manifest_writer.cc +++ b/src/iceberg/manifest_writer.cc @@ -32,7 +32,7 @@ namespace iceberg { Status ManifestWriter::Add(const ManifestEntry& entry) { if (adapter_->size() >= kBatchSize) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); ICEBERG_RETURN_UNEXPECTED(adapter_->StartAppending()); } return adapter_->Append(entry); @@ -48,7 +48,7 @@ Status ManifestWriter::AddAll(const std::vector& entries) { Status ManifestWriter::Close() { if (adapter_->size() > 0) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); } return writer_->Close(); } @@ -113,7 +113,7 @@ Result> ManifestWriter::MakeV3Writer( Status ManifestListWriter::Add(const ManifestFile& file) { if (adapter_->size() >= kBatchSize) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); ICEBERG_RETURN_UNEXPECTED(adapter_->StartAppending()); } return adapter_->Append(file); @@ -129,7 +129,7 @@ Status ManifestListWriter::AddAll(const std::vector& files) { Status ManifestListWriter::Close() { if (adapter_->size() > 0) { ICEBERG_ASSIGN_OR_RAISE(auto array, adapter_->FinishAppending()); - ICEBERG_RETURN_UNEXPECTED(writer_->Write(*array)); + ICEBERG_RETURN_UNEXPECTED(writer_->Write(array)); } return writer_->Close(); } diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc index 2f34b142c..ed0bb8fd7 100644 --- a/src/iceberg/parquet/parquet_writer.cc +++ b/src/iceberg/parquet/parquet_writer.cc @@ -76,9 +76,9 @@ class ParquetWriter::Impl { return {}; } - Status Write(ArrowArray& array) { + Status Write(ArrowArray* array) { ICEBERG_ARROW_ASSIGN_OR_RETURN(auto batch, - ::arrow::ImportRecordBatch(&array, arrow_schema_)); + ::arrow::ImportRecordBatch(array, arrow_schema_)); ICEBERG_ARROW_RETURN_NOT_OK(writer_->WriteRecordBatch(*batch)); @@ -132,7 +132,7 @@ Status ParquetWriter::Open(const WriterOptions& options) { return impl_->Open(options); } -Status ParquetWriter::Write(ArrowArray& array) { return impl_->Write(array); } +Status ParquetWriter::Write(ArrowArray* array) { return impl_->Write(array); } Status ParquetWriter::Close() { return impl_->Close(); } diff --git a/src/iceberg/parquet/parquet_writer.h b/src/iceberg/parquet/parquet_writer.h index 630f60b68..9be0a430e 100644 --- a/src/iceberg/parquet/parquet_writer.h +++ b/src/iceberg/parquet/parquet_writer.h @@ -35,7 +35,7 @@ class ICEBERG_BUNDLE_EXPORT ParquetWriter : public Writer { Status Close() final; - Status Write(ArrowArray& array) final; + Status Write(ArrowArray* array) final; std::optional metrics() final; diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 06726c7a7..c745e218d 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -60,7 +60,7 @@ int32_t PartitionSpec::spec_id() const { return spec_id_; } std::span PartitionSpec::fields() const { return fields_; } -Result> PartitionSpec::partition_schema() { +Result> PartitionSpec::GetPartitionSchema() { if (fields_.empty()) { return nullptr; } @@ -92,8 +92,7 @@ Result> PartitionSpec::partition_schema() { partition_fields.emplace_back(partition_field.field_id(), std::string(partition_field.name()), std::move(result_type), - true // optional - ); + /*optional=*/true); } std::scoped_lock lock(mutex_); diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index bfda12ca3..dafa71e7a 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -69,7 +69,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Get a view of the partition fields. std::span fields() const; - Result> partition_schema(); + Result> GetPartitionSchema(); std::string ToString() const override; diff --git a/src/iceberg/test/avro_test.cc b/src/iceberg/test/avro_test.cc index 4bc773cba..2bd09f925 100644 --- a/src/iceberg/test/avro_test.cc +++ b/src/iceberg/test/avro_test.cc @@ -128,7 +128,7 @@ class AvroReaderTest : public TempFileTestBase { {.path = temp_avro_file_, .schema = schema, .io = file_io_}); ASSERT_TRUE(writer_result.has_value()); auto writer = std::move(writer_result.value()); - ASSERT_THAT(writer->Write(arrow_array), IsOk()); + ASSERT_THAT(writer->Write(&arrow_array), IsOk()); ASSERT_THAT(writer->Close(), IsOk()); auto file_info_result = local_fs_->GetFileInfo(temp_avro_file_); diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc index 7b1c71fbe..f9213b2d5 100644 --- a/src/iceberg/test/manifest_reader_writer_test.cc +++ b/src/iceberg/test/manifest_reader_writer_test.cc @@ -187,6 +187,7 @@ TEST_F(ManifestReaderV1Test, WritePartitionedTest) { auto expected_entries = PreparePartitionedTestData(); auto write_manifest_path = CreateNewTempFilePath(); TestWriteManifest(write_manifest_path, partition_spec, expected_entries); + TestManifestReadingByPath(write_manifest_path, expected_entries, partition_schema); } class ManifestReaderV2Test : public ManifestReaderTestBase { @@ -293,12 +294,17 @@ TEST_F(ManifestReaderV2Test, WriteNonPartitionedTest) { auto expected_entries = PrepareNonPartitionedTestData(); auto write_manifest_path = CreateNewTempFilePath(); TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, expected_entries); + TestManifestReadingByPath(write_manifest_path, expected_entries); } TEST_F(ManifestReaderV2Test, WriteInheritancePartitionedTest) { auto expected_entries = PrepareMetadataInheritanceTestData(); auto write_manifest_path = CreateNewTempFilePath(); TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, expected_entries); + for (auto& entry : expected_entries) { + entry.data_file->partition_spec_id = PartitionSpec::kInitialSpecId; + } + TestManifestReadingByPath(write_manifest_path, expected_entries); } } // namespace iceberg diff --git a/src/iceberg/test/parquet_test.cc b/src/iceberg/test/parquet_test.cc index 0c42b8463..755817567 100644 --- a/src/iceberg/test/parquet_test.cc +++ b/src/iceberg/test/parquet_test.cc @@ -51,7 +51,7 @@ namespace { Status WriteArray(std::shared_ptr<::arrow::Array> data, Writer& writer) { ArrowArray arr; ICEBERG_ARROW_RETURN_NOT_OK(::arrow::ExportArray(*data, &arr)); - ICEBERG_RETURN_UNEXPECTED(writer.Write(arr)); + ICEBERG_RETURN_UNEXPECTED(writer.Write(&arr)); return writer.Close(); } diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc index f391e1535..efb556b23 100644 --- a/src/iceberg/v1_metadata.cc +++ b/src/iceberg/v1_metadata.cc @@ -68,7 +68,7 @@ Result> ManifestEntryAdapterV1::GetManifestEntryStru // Deprecated. Always write a default in v1. Do not write in v2 or v3. static const SchemaField kBlockSizeInBytes = SchemaField::MakeRequired( 105, "block_size_in_bytes", int64(), "Block size in bytes"); - ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->partition_schema()); + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->GetPartitionSchema()); if (!partition_type) { partition_type = PartitionSpec::Unpartitioned()->schema(); } diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc index 0671894b9..1b3b3b1b5 100644 --- a/src/iceberg/v2_metadata.cc +++ b/src/iceberg/v2_metadata.cc @@ -29,9 +29,9 @@ Status ManifestEntryAdapterV2::Init() { static std::unordered_set kManifestEntryFieldIds{ ManifestEntry::kStatus.field_id(), ManifestEntry::kSnapshotId.field_id(), - ManifestEntry::kDataFileFieldId, ManifestEntry::kSequenceNumber.field_id(), ManifestEntry::kFileSequenceNumber.field_id(), + ManifestEntry::kDataFileFieldId, DataFile::kContent.field_id(), DataFile::kFilePath.field_id(), DataFile::kFileFormat.field_id(), From a27d8512005350670c8d71ca3bf2246a9de538b8 Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Sat, 11 Oct 2025 11:17:24 +0800 Subject: [PATCH 14/17] fix comments --- .../nanoarrow_error_transform_internal.h | 12 +- src/iceberg/manifest_adapter.cc | 167 +++++++++--------- src/iceberg/manifest_adapter.h | 24 ++- src/iceberg/manifest_reader_internal.cc | 12 +- src/iceberg/partition_spec.cc | 14 +- src/iceberg/partition_spec.h | 4 +- .../test/manifest_reader_writer_test.cc | 22 ++- src/iceberg/test/partition_spec_test.cc | 19 ++ src/iceberg/v1_metadata.cc | 2 +- src/iceberg/v2_metadata.cc | 5 +- src/iceberg/v2_metadata.h | 4 +- src/iceberg/v3_metadata.cc | 13 +- src/iceberg/v3_metadata.h | 12 +- 13 files changed, 166 insertions(+), 144 deletions(-) diff --git a/src/iceberg/arrow/nanoarrow_error_transform_internal.h b/src/iceberg/arrow/nanoarrow_error_transform_internal.h index 1bc0bf65a..1a8f401d3 100644 --- a/src/iceberg/arrow/nanoarrow_error_transform_internal.h +++ b/src/iceberg/arrow/nanoarrow_error_transform_internal.h @@ -19,7 +19,13 @@ #pragma once -#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("nanoarrow error: {}", error.message); \ +#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return iceberg::InvalidArrowData("nanoarrow error: {}", status); \ + } + +#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return iceberg::InvalidArrowData("nanoarrow error: {} msg:{}", status, \ + error.message); \ } diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index ad8dfc237..59237749b 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -29,11 +29,6 @@ #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" -#define NANOARROW_RETURN_IF_FAILED(status) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return InvalidArrowData("nanoarrow error: {}", status); \ - } - namespace { static constexpr int64_t kBlockSizeInBytesV1 = 64 * 1024 * 1024L; } @@ -47,46 +42,46 @@ Status ManifestAdapter::StartAppending() { array_ = {}; size_ = 0; ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayInitFromSchema(&array_, &schema_, &error), - error); - NANOARROW_RETURN_IF_FAILED(ArrowArrayStartAppending(&array_)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR( + ArrowArrayInitFromSchema(&array_, &schema_, &error), error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayStartAppending(&array_)); return {}; } Result ManifestAdapter::FinishAppending() { ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishBuildingDefault(&array_, &error), - error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR( + ArrowArrayFinishBuildingDefault(&array_, &error), error); return &array_; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, int64_t value) { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendInt(arrowArray, value)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendInt(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, uint64_t value) { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendUInt(arrowArray, value)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendUInt(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, double value) { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendDouble(arrowArray, value)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendDouble(arrowArray, value)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, std::string_view value) { ArrowStringView view(value.data(), value.size()); - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendString(arrowArray, view)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendString(arrowArray, view)); return {}; } Status ManifestAdapter::AppendField(ArrowArray* arrowArray, - const std::span& value) { + std::span value) { ArrowBufferViewData data; data.as_char = reinterpret_cast(value.data()); ArrowBufferView view(data, value.size()); - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendBytes(arrowArray, view)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendBytes(arrowArray, view)); return {}; } @@ -100,10 +95,11 @@ ManifestEntryAdapter::~ManifestEntryAdapter() { } Result> ManifestEntryAdapter::GetManifestEntryStructType() { - if (partition_spec_ == nullptr) { + if (partition_spec_ == nullptr) [[unlikely]] { + // NOTICE: never reach here by design, partition_spec_ should always not null. return ManifestEntry::TypeFromPartitionType(nullptr); } - ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->GetPartitionSchema()); + ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->PartitionType()); return ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); } @@ -123,7 +119,7 @@ Status ManifestEntryAdapter::AppendPartition( const auto& field = fields[i]; auto array = arrow_array->children[i]; if (partition.IsNull()) { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); continue; } switch (field.type()->type_id()) { @@ -180,7 +176,7 @@ Status ManifestEntryAdapter::AppendPartition( return InvalidManifest("Unsupported partition type: {}", field.ToString()); } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -188,10 +184,10 @@ Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, const std::vector& list_value) { auto list_array = arrow_array->children[0]; for (const auto& value : list_value) { - NANOARROW_RETURN_IF_FAILED( + ICEBERG_NANOARROW_RETURN_IF_NOT_OK( ArrowArrayAppendInt(list_array, static_cast(value))); } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -199,9 +195,9 @@ Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, const std::vector& list_value) { auto list_array = arrow_array->children[0]; for (const auto& value : list_value) { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendInt(list_array, value)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendInt(list_array, value)); } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -216,9 +212,9 @@ Status ManifestEntryAdapter::AppendMap(ArrowArray* arrow_array, auto value_array = map_array->children[1]; ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(map_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(map_array)); } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -233,9 +229,9 @@ Status ManifestEntryAdapter::AppendMap( auto value_array = map_array->children[1]; ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(map_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(map_array)); } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -296,7 +292,7 @@ Status ManifestEntryAdapter::AppendDataFile( if (!file->key_metadata.empty()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->key_metadata)); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 132: // split_offsets (optional list) @@ -310,24 +306,23 @@ Status ManifestEntryAdapter::AppendDataFile( ICEBERG_RETURN_UNEXPECTED( AppendField(array, static_cast(file->sort_order_id.value()))); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 142: // first_row_id (optional int64) if (file->first_row_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->first_row_id.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 143: // referenced_data_file (optional string) { - ICEBERG_ASSIGN_OR_RAISE(auto referenced_data_file, - GetWrappedReferenceDataFile(file)); + ICEBERG_ASSIGN_OR_RAISE(auto referenced_data_file, GetReferenceDataFile(file)); if (referenced_data_file.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, referenced_data_file.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; } @@ -335,7 +330,7 @@ Status ManifestEntryAdapter::AppendDataFile( if (file->content_offset.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->content_offset.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 145: // content_size_in_bytes (optional int64) @@ -343,14 +338,14 @@ Status ManifestEntryAdapter::AppendDataFile( ICEBERG_RETURN_UNEXPECTED( AppendField(array, file->content_size_in_bytes.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; default: return InvalidManifest("Unknown data file field id: {} ", field.field_id()); } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -359,22 +354,22 @@ Result> ManifestEntryAdapter::GetSequenceNumber( return entry.sequence_number; } -Result> ManifestEntryAdapter::GetWrappedReferenceDataFile( +Result> ManifestEntryAdapter::GetReferenceDataFile( const std::shared_ptr& file) { return file->referenced_data_file; } -Result> ManifestEntryAdapter::GetWrappedFirstRowId( +Result> ManifestEntryAdapter::GetFirstRowId( const std::shared_ptr& file) { return file->first_row_id; } -Result> ManifestEntryAdapter::GetWrappedContentOffset( +Result> ManifestEntryAdapter::GetContentOffset( const std::shared_ptr& file) { return file->content_offset; } -Result> ManifestEntryAdapter::GetWrappedContentSizeInBytes( +Result> ManifestEntryAdapter::GetContentSizeInBytes( const std::shared_ptr& file) { return file->content_size_in_bytes; } @@ -394,7 +389,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { if (entry.snapshot_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, entry.snapshot_id.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 2: // data_file (required struct) @@ -404,7 +399,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { ICEBERG_RETURN_UNEXPECTED( AppendDataFile(array, data_file_type, entry.data_file)); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + return InvalidManifest("Missing required data file field."); } break; case 3: // sequence_number (optional int64) @@ -413,7 +408,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { if (sequence_num.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; } @@ -422,7 +417,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { ICEBERG_RETURN_UNEXPECTED( AppendField(array, entry.file_sequence_number.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; default: @@ -430,7 +425,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(&array_)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(&array_)); size_++; return {}; } @@ -475,66 +470,65 @@ ManifestFileAdapter::~ManifestFileAdapter() { } } -Status ManifestFileAdapter::AppendPartitions( - ArrowArray* arrow_array, const std::shared_ptr& partition_type, - const std::vector& partitions) { +Status ManifestFileAdapter::AppendPartitionSummary( + ArrowArray* arrow_array, const std::shared_ptr& summary_type, + const std::vector& summaries) { auto& array = arrow_array->children[0]; if (array->n_children != 4) { return InvalidManifestList("Invalid partition array."); } - auto partition_struct = - internal::checked_pointer_cast(partition_type->fields()[0].type()); - auto fields = partition_struct->fields(); - for (const auto& partition : partitions) { - for (const auto& field : fields) { - switch (field.field_id()) { + auto summary_struct = + internal::checked_pointer_cast(summary_type->fields()[0].type()); + auto summary_fields = summary_struct->fields(); + for (const auto& summary : summaries) { + for (const auto& summary_field : summary_fields) { + switch (summary_field.field_id()) { case 509: // contains_null (required bool) - ICEBERG_RETURN_UNEXPECTED( - AppendField(array->children[0], - static_cast(partition.contains_null ? 1 : 0))); + ICEBERG_RETURN_UNEXPECTED(AppendField( + array->children[0], static_cast(summary.contains_null ? 1 : 0))); break; case 518: // contains_nan (optional bool) { auto field_array = array->children[1]; - if (partition.contains_nan.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField( - field_array, - static_cast(partition.contains_nan.value() ? 1 : 0))); + if (summary.contains_nan.has_value()) { + ICEBERG_RETURN_UNEXPECTED( + AppendField(field_array, + static_cast(summary.contains_nan.value() ? 1 : 0))); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); } break; } case 510: // lower_bound (optional binary) { auto field_array = array->children[2]; - if (partition.lower_bound.has_value() && !partition.lower_bound->empty()) { + if (summary.lower_bound.has_value() && !summary.lower_bound->empty()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(field_array, partition.lower_bound.value())); + AppendField(field_array, summary.lower_bound.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); } break; } case 511: // upper_bound (optional binary) { auto field_array = array->children[3]; - if (partition.upper_bound.has_value() && !partition.upper_bound->empty()) { + if (summary.upper_bound.has_value() && !summary.upper_bound->empty()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(field_array, partition.upper_bound.value())); + AppendField(field_array, summary.upper_bound.value())); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); } break; } default: - return InvalidManifestList("Unknown field id: {}", field.field_id()); + return InvalidManifestList("Unknown field id: {}", summary_field.field_id()); } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(array)); } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); return {}; } @@ -542,12 +536,11 @@ Result ManifestFileAdapter::GetSequenceNumber(const ManifestFile& file) return file.sequence_number; } -Result ManifestFileAdapter::GetWrappedMinSequenceNumber( - const ManifestFile& file) { +Result ManifestFileAdapter::GetMinSequenceNumber(const ManifestFile& file) { return file.min_sequence_number; } -Result> ManifestFileAdapter::GetWrappedFirstRowId( +Result> ManifestFileAdapter::GetFirstRowId( const ManifestFile& file) { return file.first_row_id; } @@ -580,7 +573,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { } case 516: // min_sequence_number { - ICEBERG_ASSIGN_OR_RAISE(auto min_sequence_num, GetWrappedMinSequenceNumber(file)); + ICEBERG_ASSIGN_OR_RAISE(auto min_sequence_num, GetMinSequenceNumber(file)); ICEBERG_RETURN_UNEXPECTED(AppendField(array, min_sequence_num)); break; } @@ -593,7 +586,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { AppendField(array, static_cast(file.added_files_count.value()))); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 505: // existing_files_count @@ -602,7 +595,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { array, static_cast(file.existing_files_count.value()))); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 506: // deleted_files_count @@ -611,7 +604,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { AppendField(array, static_cast(file.deleted_files_count.value()))); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 512: // added_rows_count @@ -619,7 +612,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.added_rows_count.value())); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 513: // existing_rows_count @@ -627,7 +620,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.existing_rows_count.value())); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 514: // deleted_rows_count @@ -635,11 +628,11 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.deleted_rows_count.value())); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 507: // partitions - ICEBERG_RETURN_UNEXPECTED(AppendPartitions( + ICEBERG_RETURN_UNEXPECTED(AppendPartitionSummary( array, internal::checked_pointer_cast(field.type()), file.partitions)); break; @@ -647,17 +640,17 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { if (!file.key_metadata.empty()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); } else { - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 520: // first_row_id { - ICEBERG_ASSIGN_OR_RAISE(auto first_row_id, GetWrappedFirstRowId(file)); + ICEBERG_ASSIGN_OR_RAISE(auto first_row_id, GetFirstRowId(file)); if (first_row_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, first_row_id.value())); } else { // Append null for optional field - NANOARROW_RETURN_IF_FAILED(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; } @@ -665,7 +658,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { return InvalidManifestList("Unknown field id: {}", field.field_id()); } } - NANOARROW_RETURN_IF_FAILED(ArrowArrayFinishElement(&array_)); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(&array_)); size_++; return {}; } diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index 234623c4e..b141a95df 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -52,13 +52,13 @@ class ICEBERG_EXPORT ManifestAdapter { static Status AppendField(ArrowArray* arrowArray, uint64_t value); static Status AppendField(ArrowArray* arrowArray, double value); static Status AppendField(ArrowArray* arrowArray, std::string_view value); - static Status AppendField(ArrowArray* arrowArray, - const std::span& value); + static Status AppendField(ArrowArray* arrowArray, std::span value); protected: ArrowArray array_; ArrowSchema schema_; // converted from manifest_schema_ or manifest_list_schema_ int64_t size_ = 0; + std::unordered_map metadata_; }; // \brief Implemented by different versions with different schemas to @@ -98,19 +98,18 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { const std::map>& map_value); virtual Result> GetSequenceNumber(const ManifestEntry& entry); - virtual Result> GetWrappedReferenceDataFile( + virtual Result> GetReferenceDataFile( const std::shared_ptr& file); - virtual Result> GetWrappedFirstRowId( + virtual Result> GetFirstRowId( const std::shared_ptr& file); - virtual Result> GetWrappedContentOffset( + virtual Result> GetContentOffset( const std::shared_ptr& file); - virtual Result> GetWrappedContentSizeInBytes( + virtual Result> GetContentSizeInBytes( const std::shared_ptr& file); protected: std::shared_ptr partition_spec_; std::shared_ptr manifest_schema_; - std::unordered_map metadata_; }; // \brief Implemented by different versions with different schemas to @@ -131,17 +130,16 @@ class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { /// schema based on the fields_ids. Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestFile& file); - static Status AppendPartitions(ArrowArray* arrow_array, - const std::shared_ptr& partition_type, - const std::vector& partitions); + static Status AppendPartitionSummary( + ArrowArray* arrow_array, const std::shared_ptr& summary_type, + const std::vector& summaries); virtual Result GetSequenceNumber(const ManifestFile& file); - virtual Result GetWrappedMinSequenceNumber(const ManifestFile& file); - virtual Result> GetWrappedFirstRowId(const ManifestFile& file); + virtual Result GetMinSequenceNumber(const ManifestFile& file); + virtual Result> GetFirstRowId(const ManifestFile& file); protected: std::shared_ptr manifest_list_schema_; - std::unordered_map metadata_; }; } // namespace iceberg diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 7a2ce0804..4b20af9c3 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -204,12 +204,12 @@ Result> ParseManifestList(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); std::vector manifest_files; manifest_files.resize(array_in->length); @@ -467,12 +467,12 @@ Result> ParseManifestEntry(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status, error); + ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); std::vector manifest_entries; manifest_entries.resize(array_in->length); diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index c745e218d..3fa5d86eb 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -60,14 +60,14 @@ int32_t PartitionSpec::spec_id() const { return spec_id_; } std::span PartitionSpec::fields() const { return fields_; } -Result> PartitionSpec::GetPartitionSchema() { +Result> PartitionSpec::PartitionType() { if (fields_.empty()) { return nullptr; } { std::scoped_lock lock(mutex_); - if (partition_schema_ != nullptr) { - return partition_schema_; + if (partition_type_ != nullptr) { + return partition_type_; } } @@ -77,6 +77,8 @@ Result> PartitionSpec::GetPartitionSchema() { ICEBERG_ASSIGN_OR_RAISE(auto source_field, schema_->FindFieldById(partition_field.source_id())); if (!source_field.has_value()) { + // TODO(xiao.dong) when source field is missing, + // should return an error or just use UNKNOWN type return InvalidSchema("Cannot find source field for partition field:{}", partition_field.field_id()); } @@ -96,10 +98,10 @@ Result> PartitionSpec::GetPartitionSchema() { } std::scoped_lock lock(mutex_); - if (partition_schema_ == nullptr) { - partition_schema_ = std::make_shared(std::move(partition_fields)); + if (partition_type_ == nullptr) { + partition_type_ = std::make_shared(std::move(partition_fields)); } - return partition_schema_; + return partition_type_; } std::string PartitionSpec::ToString() const { diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index dafa71e7a..d4d4ea7f4 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -69,7 +69,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Get a view of the partition fields. std::span fields() const; - Result> GetPartitionSchema(); + Result> PartitionType(); std::string ToString() const override; @@ -88,7 +88,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::vector fields_; int32_t last_assigned_field_id_; std::mutex mutex_; - std::shared_ptr partition_schema_; + std::shared_ptr partition_type_; }; } // namespace iceberg diff --git a/src/iceberg/test/manifest_reader_writer_test.cc b/src/iceberg/test/manifest_reader_writer_test.cc index f9213b2d5..435b9bca2 100644 --- a/src/iceberg/test/manifest_reader_writer_test.cc +++ b/src/iceberg/test/manifest_reader_writer_test.cc @@ -154,7 +154,6 @@ class ManifestReaderV1Test : public ManifestReaderTestBase { void TestWriteManifest(const std::string& manifest_list_path, std::shared_ptr partition_spec, const std::vector& manifest_entries) { - std::cout << "Writing manifest list to " << manifest_list_path << std::endl; auto result = ManifestWriter::MakeV1Writer(1, manifest_list_path, file_io_, std::move(partition_spec)); ASSERT_TRUE(result.has_value()) << result.error().message; @@ -167,6 +166,7 @@ class ManifestReaderV1Test : public ManifestReaderTestBase { }; TEST_F(ManifestReaderV1Test, PartitionedTest) { + // TODO(xiao.dong) we need to add more cases for different partition types iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); auto partition_schema = std::make_shared(std::vector({partition_field})); @@ -176,13 +176,15 @@ TEST_F(ManifestReaderV1Test, PartitionedTest) { } TEST_F(ManifestReaderV1Test, WritePartitionedTest) { + iceberg::SchemaField table_field(1, "order_ts_hour_source", iceberg::int32(), true); iceberg::SchemaField partition_field(1000, "order_ts_hour", iceberg::int32(), true); + auto table_schema = std::make_shared(std::vector({table_field})); auto partition_schema = std::make_shared(std::vector({partition_field})); auto identity_transform = Transform::Identity(); std::vector fields{ - PartitionField(1000, 1000, "order_ts_hour", identity_transform)}; - auto partition_spec = std::make_shared(partition_schema, 1, fields); + PartitionField(1, 1000, "order_ts_hour", identity_transform)}; + auto partition_spec = std::make_shared(table_schema, 1, fields); auto expected_entries = PreparePartitionedTestData(); auto write_manifest_path = CreateNewTempFilePath(); @@ -259,7 +261,6 @@ class ManifestReaderV2Test : public ManifestReaderTestBase { void TestWriteManifest(int64_t snapshot_id, const std::string& manifest_list_path, std::shared_ptr partition_spec, const std::vector& manifest_entries) { - std::cout << "Writing manifest list to " << manifest_list_path << std::endl; auto result = ManifestWriter::MakeV2Writer(snapshot_id, manifest_list_path, file_io_, std::move(partition_spec)); ASSERT_TRUE(result.has_value()) << result.error().message; @@ -301,10 +302,15 @@ TEST_F(ManifestReaderV2Test, WriteInheritancePartitionedTest) { auto expected_entries = PrepareMetadataInheritanceTestData(); auto write_manifest_path = CreateNewTempFilePath(); TestWriteManifest(679879563479918846LL, write_manifest_path, nullptr, expected_entries); - for (auto& entry : expected_entries) { - entry.data_file->partition_spec_id = PartitionSpec::kInitialSpecId; - } - TestManifestReadingByPath(write_manifest_path, expected_entries); + ManifestFile manifest_file{ + .manifest_path = write_manifest_path, + .manifest_length = 100, + .partition_spec_id = 12, + .content = ManifestFile::Content::kData, + .sequence_number = 15, + .added_snapshot_id = 679879563479918846LL, + }; + TestManifestReadingWithManifestFile(manifest_file, expected_entries); } } // namespace iceberg diff --git a/src/iceberg/test/partition_spec_test.cc b/src/iceberg/test/partition_spec_test.cc index 871bb0813..538d89a08 100644 --- a/src/iceberg/test/partition_spec_test.cc +++ b/src/iceberg/test/partition_spec_test.cc @@ -87,4 +87,23 @@ TEST(PartitionSpecTest, Equality) { ASSERT_NE(schema1, schema6); ASSERT_NE(schema6, schema1); } + +TEST(PartitionSpecTest, PartitionSchemaTest) { + SchemaField field1(5, "ts", iceberg::timestamp(), true); + SchemaField field2(7, "bar", iceberg::string(), true); + auto const schema = + std::make_shared(std::vector{field1, field2}, 100); + auto identity_transform = Transform::Identity(); + PartitionField pt_field1(5, 1000, "day", identity_transform); + PartitionField pt_field2(7, 1001, "hour", identity_transform); + PartitionSpec spec(schema, 100, {pt_field1, pt_field2}); + + auto partition_schema = spec.PartitionType(); + ASSERT_TRUE(partition_schema.has_value()); + ASSERT_EQ(2, partition_schema.value()->fields().size()); + EXPECT_EQ(pt_field1.name(), partition_schema.value()->fields()[0].name()); + EXPECT_EQ(pt_field1.field_id(), partition_schema.value()->fields()[0].field_id()); + EXPECT_EQ(pt_field2.name(), partition_schema.value()->fields()[1].name()); + EXPECT_EQ(pt_field2.field_id(), partition_schema.value()->fields()[1].field_id()); +} } // namespace iceberg diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc index efb556b23..f8b022767 100644 --- a/src/iceberg/v1_metadata.cc +++ b/src/iceberg/v1_metadata.cc @@ -68,7 +68,7 @@ Result> ManifestEntryAdapterV1::GetManifestEntryStru // Deprecated. Always write a default in v1. Do not write in v2 or v3. static const SchemaField kBlockSizeInBytes = SchemaField::MakeRequired( 105, "block_size_in_bytes", int64(), "Block size in bytes"); - ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->GetPartitionSchema()); + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->PartitionType()); if (!partition_type) { partition_type = PartitionSpec::Unpartitioned()->schema(); } diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc index 1b3b3b1b5..940c1d90b 100644 --- a/src/iceberg/v2_metadata.cc +++ b/src/iceberg/v2_metadata.cc @@ -90,7 +90,7 @@ Result> ManifestEntryAdapterV2::GetSequenceNumber( return entry.sequence_number; } -Result> ManifestEntryAdapterV2::GetWrappedReferenceDataFile( +Result> ManifestEntryAdapterV2::GetReferenceDataFile( const std::shared_ptr& file) { if (file->content == DataFile::Content::kPositionDeletes) { return file->referenced_data_file; @@ -141,8 +141,7 @@ Result ManifestFileAdapterV2::GetSequenceNumber(const ManifestFile& fil return file.sequence_number; } -Result ManifestFileAdapterV2::GetWrappedMinSequenceNumber( - const ManifestFile& file) { +Result ManifestFileAdapterV2::GetMinSequenceNumber(const ManifestFile& file) { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( diff --git a/src/iceberg/v2_metadata.h b/src/iceberg/v2_metadata.h index 966eb23a4..a1f140ba9 100644 --- a/src/iceberg/v2_metadata.h +++ b/src/iceberg/v2_metadata.h @@ -36,7 +36,7 @@ class ManifestEntryAdapterV2 : public ManifestEntryAdapter { protected: Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetWrappedReferenceDataFile( + Result> GetReferenceDataFile( const std::shared_ptr& file) override; private: @@ -56,7 +56,7 @@ class ManifestFileAdapterV2 : public ManifestFileAdapter { protected: Result GetSequenceNumber(const ManifestFile& file) override; - Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; + Result GetMinSequenceNumber(const ManifestFile& file) override; private: int64_t snapshot_id_; diff --git a/src/iceberg/v3_metadata.cc b/src/iceberg/v3_metadata.cc index 1b5f1e15b..67d5ce670 100644 --- a/src/iceberg/v3_metadata.cc +++ b/src/iceberg/v3_metadata.cc @@ -94,7 +94,7 @@ Result> ManifestEntryAdapterV3::GetSequenceNumber( return entry.sequence_number; } -Result> ManifestEntryAdapterV3::GetWrappedReferenceDataFile( +Result> ManifestEntryAdapterV3::GetReferenceDataFile( const std::shared_ptr& file) { if (file->content == DataFile::Content::kPositionDeletes) { return file->referenced_data_file; @@ -102,7 +102,7 @@ Result> ManifestEntryAdapterV3::GetWrappedReferenceDa return std::nullopt; } -Result> ManifestEntryAdapterV3::GetWrappedFirstRowId( +Result> ManifestEntryAdapterV3::GetFirstRowId( const std::shared_ptr& file) { if (file->content == DataFile::Content::kData) { return file->first_row_id; @@ -110,7 +110,7 @@ Result> ManifestEntryAdapterV3::GetWrappedFirstRowId( return std::nullopt; } -Result> ManifestEntryAdapterV3::GetWrappedContentOffset( +Result> ManifestEntryAdapterV3::GetContentOffset( const std::shared_ptr& file) { if (file->content == DataFile::Content::kPositionDeletes) { return file->content_offset; @@ -118,7 +118,7 @@ Result> ManifestEntryAdapterV3::GetWrappedContentOffset( return std::nullopt; } -Result> ManifestEntryAdapterV3::GetWrappedContentSizeInBytes( +Result> ManifestEntryAdapterV3::GetContentSizeInBytes( const std::shared_ptr& file) { if (file->content == DataFile::Content::kPositionDeletes) { return file->content_size_in_bytes; @@ -178,8 +178,7 @@ Result ManifestFileAdapterV3::GetSequenceNumber(const ManifestFile& fil return file.sequence_number; } -Result ManifestFileAdapterV3::GetWrappedMinSequenceNumber( - const ManifestFile& file) { +Result ManifestFileAdapterV3::GetMinSequenceNumber(const ManifestFile& file) { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -191,7 +190,7 @@ Result ManifestFileAdapterV3::GetWrappedMinSequenceNumber( return file.min_sequence_number; } -Result> ManifestFileAdapterV3::GetWrappedFirstRowId( +Result> ManifestFileAdapterV3::GetFirstRowId( const ManifestFile& file) { if (WrappedFirstRowId(file)) { return next_row_id_; diff --git a/src/iceberg/v3_metadata.h b/src/iceberg/v3_metadata.h index 03f8f95e0..f46bca66c 100644 --- a/src/iceberg/v3_metadata.h +++ b/src/iceberg/v3_metadata.h @@ -39,13 +39,13 @@ class ManifestEntryAdapterV3 : public ManifestEntryAdapter { protected: Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetWrappedReferenceDataFile( + Result> GetReferenceDataFile( const std::shared_ptr& file) override; - Result> GetWrappedFirstRowId( + Result> GetFirstRowId( const std::shared_ptr& file) override; - Result> GetWrappedContentOffset( + Result> GetContentOffset( const std::shared_ptr& file) override; - Result> GetWrappedContentSizeInBytes( + Result> GetContentSizeInBytes( const std::shared_ptr& file) override; private: @@ -67,8 +67,8 @@ class ManifestFileAdapterV3 : public ManifestFileAdapter { protected: Result GetSequenceNumber(const ManifestFile& file) override; - Result GetWrappedMinSequenceNumber(const ManifestFile& file) override; - Result> GetWrappedFirstRowId(const ManifestFile& file) override; + Result GetMinSequenceNumber(const ManifestFile& file) override; + Result> GetFirstRowId(const ManifestFile& file) override; private: bool WrappedFirstRowId(const ManifestFile& file); From 1bae2ca1c1c6382ccee08fde6a4968c3f4e5988a Mon Sep 17 00:00:00 2001 From: "xiao.dong" Date: Sat, 11 Oct 2025 11:27:17 +0800 Subject: [PATCH 15/17] fix comments --- src/iceberg/manifest_adapter.cc | 71 ++++++++++++++++----------------- src/iceberg/manifest_adapter.h | 14 +++---- src/iceberg/v2_metadata.cc | 6 +-- src/iceberg/v2_metadata.h | 3 +- src/iceberg/v3_metadata.cc | 24 +++++------ src/iceberg/v3_metadata.h | 12 ++---- 6 files changed, 59 insertions(+), 71 deletions(-) diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index 59237749b..3fe1c3128 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -237,7 +237,7 @@ Status ManifestEntryAdapter::AppendMap( Status ManifestEntryAdapter::AppendDataFile( ArrowArray* arrow_array, const std::shared_ptr& data_file_type, - const std::shared_ptr& file) { + const DataFile& file) { auto fields = data_file_type->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; @@ -245,73 +245,71 @@ Status ManifestEntryAdapter::AppendDataFile( switch (field.field_id()) { case 134: // content (optional int32) - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, static_cast(file->content))); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, static_cast(file.content))); break; case 100: // file_path (required string) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->file_path)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.file_path)); break; case 101: // file_format (required string) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, ToString(file->file_format))); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, ToString(file.file_format))); break; case 102: // partition (required struct) { auto partition_type = internal::checked_pointer_cast(field.type()); - ICEBERG_RETURN_UNEXPECTED( - AppendPartition(array, partition_type, file->partition)); + ICEBERG_RETURN_UNEXPECTED(AppendPartition(array, partition_type, file.partition)); } break; case 103: // record_count (required int64) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->record_count)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.record_count)); break; case 104: // file_size_in_bytes (required int64) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->file_size_in_bytes)); + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.file_size_in_bytes)); break; case 105: // block_size_in_bytes (compatible in v1) // always 64MB for v1 ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytesV1)); break; case 108: // column_sizes (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->column_sizes)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.column_sizes)); break; case 109: // value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.value_counts)); break; case 110: // null_value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->null_value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.null_value_counts)); break; case 137: // nan_value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->nan_value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.nan_value_counts)); break; case 125: // lower_bounds (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->lower_bounds)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.lower_bounds)); break; case 128: // upper_bounds (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file->upper_bounds)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.upper_bounds)); break; case 131: // key_metadata (optional binary) - if (!file->key_metadata.empty()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->key_metadata)); + if (!file.key_metadata.empty()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); } else { ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 132: // split_offsets (optional list) - ICEBERG_RETURN_UNEXPECTED(AppendList(array, file->split_offsets)); + ICEBERG_RETURN_UNEXPECTED(AppendList(array, file.split_offsets)); break; case 135: // equality_ids (optional list) - ICEBERG_RETURN_UNEXPECTED(AppendList(array, file->equality_ids)); + ICEBERG_RETURN_UNEXPECTED(AppendList(array, file.equality_ids)); break; case 140: // sort_order_id (optional int32) - if (file->sort_order_id.has_value()) { + if (file.sort_order_id.has_value()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(array, static_cast(file->sort_order_id.value()))); + AppendField(array, static_cast(file.sort_order_id.value()))); } else { ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 142: // first_row_id (optional int64) - if (file->first_row_id.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->first_row_id.value())); + if (file.first_row_id.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.first_row_id.value())); } else { ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } @@ -327,16 +325,16 @@ Status ManifestEntryAdapter::AppendDataFile( break; } case 144: // content_offset (optional int64) - if (file->content_offset.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file->content_offset.value())); + if (file.content_offset.has_value()) { + ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.content_offset.value())); } else { ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } break; case 145: // content_size_in_bytes (optional int64) - if (file->content_size_in_bytes.has_value()) { + if (file.content_size_in_bytes.has_value()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(array, file->content_size_in_bytes.value())); + AppendField(array, file.content_size_in_bytes.value())); } else { ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); } @@ -355,23 +353,22 @@ Result> ManifestEntryAdapter::GetSequenceNumber( } Result> ManifestEntryAdapter::GetReferenceDataFile( - const std::shared_ptr& file) { - return file->referenced_data_file; + const DataFile& file) { + return file.referenced_data_file; } -Result> ManifestEntryAdapter::GetFirstRowId( - const std::shared_ptr& file) { - return file->first_row_id; +Result> ManifestEntryAdapter::GetFirstRowId(const DataFile& file) { + return file.first_row_id; } Result> ManifestEntryAdapter::GetContentOffset( - const std::shared_ptr& file) { - return file->content_offset; + const DataFile& file) { + return file.content_offset; } Result> ManifestEntryAdapter::GetContentSizeInBytes( - const std::shared_ptr& file) { - return file->content_size_in_bytes; + const DataFile& file) { + return file.content_size_in_bytes; } Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { @@ -397,7 +394,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { // Get the data file type from the field auto data_file_type = internal::checked_pointer_cast(field.type()); ICEBERG_RETURN_UNEXPECTED( - AppendDataFile(array, data_file_type, entry.data_file)); + AppendDataFile(array, data_file_type, *entry.data_file)); } else { return InvalidManifest("Missing required data file field."); } diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index b141a95df..cec6a0b41 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -84,7 +84,7 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { Status AppendInternal(const ManifestEntry& entry); Status AppendDataFile(ArrowArray* arrow_array, const std::shared_ptr& data_file_type, - const std::shared_ptr& file); + const DataFile& file); static Status AppendPartition(ArrowArray* arrow_array, const std::shared_ptr& partition_type, const std::vector& partitions); @@ -98,14 +98,10 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { const std::map>& map_value); virtual Result> GetSequenceNumber(const ManifestEntry& entry); - virtual Result> GetReferenceDataFile( - const std::shared_ptr& file); - virtual Result> GetFirstRowId( - const std::shared_ptr& file); - virtual Result> GetContentOffset( - const std::shared_ptr& file); - virtual Result> GetContentSizeInBytes( - const std::shared_ptr& file); + virtual Result> GetReferenceDataFile(const DataFile& file); + virtual Result> GetFirstRowId(const DataFile& file); + virtual Result> GetContentOffset(const DataFile& file); + virtual Result> GetContentSizeInBytes(const DataFile& file); protected: std::shared_ptr partition_spec_; diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc index 940c1d90b..c0de24ef1 100644 --- a/src/iceberg/v2_metadata.cc +++ b/src/iceberg/v2_metadata.cc @@ -91,9 +91,9 @@ Result> ManifestEntryAdapterV2::GetSequenceNumber( } Result> ManifestEntryAdapterV2::GetReferenceDataFile( - const std::shared_ptr& file) { - if (file->content == DataFile::Content::kPositionDeletes) { - return file->referenced_data_file; + const DataFile& file) { + if (file.content == DataFile::Content::kPositionDeletes) { + return file.referenced_data_file; } return std::nullopt; } diff --git a/src/iceberg/v2_metadata.h b/src/iceberg/v2_metadata.h index a1f140ba9..caee8c428 100644 --- a/src/iceberg/v2_metadata.h +++ b/src/iceberg/v2_metadata.h @@ -36,8 +36,7 @@ class ManifestEntryAdapterV2 : public ManifestEntryAdapter { protected: Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetReferenceDataFile( - const std::shared_ptr& file) override; + Result> GetReferenceDataFile(const DataFile& file) override; private: std::optional snapshot_id_; diff --git a/src/iceberg/v3_metadata.cc b/src/iceberg/v3_metadata.cc index 67d5ce670..25399e02a 100644 --- a/src/iceberg/v3_metadata.cc +++ b/src/iceberg/v3_metadata.cc @@ -95,33 +95,33 @@ Result> ManifestEntryAdapterV3::GetSequenceNumber( } Result> ManifestEntryAdapterV3::GetReferenceDataFile( - const std::shared_ptr& file) { - if (file->content == DataFile::Content::kPositionDeletes) { - return file->referenced_data_file; + const DataFile& file) { + if (file.content == DataFile::Content::kPositionDeletes) { + return file.referenced_data_file; } return std::nullopt; } Result> ManifestEntryAdapterV3::GetFirstRowId( - const std::shared_ptr& file) { - if (file->content == DataFile::Content::kData) { - return file->first_row_id; + const DataFile& file) { + if (file.content == DataFile::Content::kData) { + return file.first_row_id; } return std::nullopt; } Result> ManifestEntryAdapterV3::GetContentOffset( - const std::shared_ptr& file) { - if (file->content == DataFile::Content::kPositionDeletes) { - return file->content_offset; + const DataFile& file) { + if (file.content == DataFile::Content::kPositionDeletes) { + return file.content_offset; } return std::nullopt; } Result> ManifestEntryAdapterV3::GetContentSizeInBytes( - const std::shared_ptr& file) { - if (file->content == DataFile::Content::kPositionDeletes) { - return file->content_size_in_bytes; + const DataFile& file) { + if (file.content == DataFile::Content::kPositionDeletes) { + return file.content_size_in_bytes; } return std::nullopt; } diff --git a/src/iceberg/v3_metadata.h b/src/iceberg/v3_metadata.h index f46bca66c..784345f1d 100644 --- a/src/iceberg/v3_metadata.h +++ b/src/iceberg/v3_metadata.h @@ -39,14 +39,10 @@ class ManifestEntryAdapterV3 : public ManifestEntryAdapter { protected: Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetReferenceDataFile( - const std::shared_ptr& file) override; - Result> GetFirstRowId( - const std::shared_ptr& file) override; - Result> GetContentOffset( - const std::shared_ptr& file) override; - Result> GetContentSizeInBytes( - const std::shared_ptr& file) override; + Result> GetReferenceDataFile(const DataFile& file) override; + Result> GetFirstRowId(const DataFile& file) override; + Result> GetContentOffset(const DataFile& file) override; + Result> GetContentSizeInBytes(const DataFile& file) override; private: std::optional snapshot_id_; From 229f61d0d9e6f3b5c7c490d95b16616bf0e7efa4 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 16 Oct 2025 22:52:33 +0800 Subject: [PATCH 16/17] resolve conflicts and refine some parts --- src/iceberg/arrow/arrow_fs_file_io.cc | 2 +- ...orm_internal.h => arrow_status_internal.h} | 0 ...internal.h => nanoarrow_status_internal.h} | 10 +- src/iceberg/avro/avro_data_util.cc | 2 +- src/iceberg/avro/avro_reader.cc | 2 +- src/iceberg/avro/avro_writer.cc | 2 +- src/iceberg/file_writer.h | 1 + src/iceberg/manifest_adapter.cc | 428 +++++++++--------- src/iceberg/manifest_adapter.h | 80 ++-- src/iceberg/manifest_reader_internal.cc | 14 +- src/iceberg/parquet/parquet_data_util.cc | 2 +- src/iceberg/parquet/parquet_reader.cc | 2 +- src/iceberg/parquet/parquet_writer.cc | 2 +- src/iceberg/partition_spec.h | 9 +- src/iceberg/test/parquet_test.cc | 2 +- src/iceberg/v1_metadata.cc | 2 +- src/iceberg/v1_metadata.h | 2 +- src/iceberg/v2_metadata.cc | 9 +- src/iceberg/v2_metadata.h | 10 +- src/iceberg/v3_metadata.cc | 19 +- src/iceberg/v3_metadata.h | 21 +- 21 files changed, 315 insertions(+), 306 deletions(-) rename src/iceberg/arrow/{arrow_error_transform_internal.h => arrow_status_internal.h} (100%) rename src/iceberg/arrow/{nanoarrow_error_transform_internal.h => nanoarrow_status_internal.h} (81%) diff --git a/src/iceberg/arrow/arrow_fs_file_io.cc b/src/iceberg/arrow/arrow_fs_file_io.cc index 35e0d4eee..be62b79af 100644 --- a/src/iceberg/arrow/arrow_fs_file_io.cc +++ b/src/iceberg/arrow/arrow_fs_file_io.cc @@ -22,9 +22,9 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_file_io.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" namespace iceberg::arrow { diff --git a/src/iceberg/arrow/arrow_error_transform_internal.h b/src/iceberg/arrow/arrow_status_internal.h similarity index 100% rename from src/iceberg/arrow/arrow_error_transform_internal.h rename to src/iceberg/arrow/arrow_status_internal.h diff --git a/src/iceberg/arrow/nanoarrow_error_transform_internal.h b/src/iceberg/arrow/nanoarrow_status_internal.h similarity index 81% rename from src/iceberg/arrow/nanoarrow_error_transform_internal.h rename to src/iceberg/arrow/nanoarrow_status_internal.h index 1a8f401d3..86f396a80 100644 --- a/src/iceberg/arrow/nanoarrow_error_transform_internal.h +++ b/src/iceberg/arrow/nanoarrow_status_internal.h @@ -19,13 +19,13 @@ #pragma once -#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK(status) \ +#define ICEBERG_NANOARROW_RETURN_UNEXPECTED(status) \ if (status != NANOARROW_OK) [[unlikely]] { \ return iceberg::InvalidArrowData("nanoarrow error: {}", status); \ } -#define ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error) \ - if (status != NANOARROW_OK) [[unlikely]] { \ - return iceberg::InvalidArrowData("nanoarrow error: {} msg:{}", status, \ - error.message); \ +#define ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error) \ + if (status != NANOARROW_OK) [[unlikely]] { \ + return iceberg::InvalidArrowData("nanoarrow error: {} msg: {}", status, \ + error.message); \ } diff --git a/src/iceberg/avro/avro_data_util.cc b/src/iceberg/avro/avro_data_util.cc index 8853f747d..92a3e7a9b 100644 --- a/src/iceberg/avro/avro_data_util.cc +++ b/src/iceberg/avro/avro_data_util.cc @@ -32,7 +32,7 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_schema_util_internal.h" #include "iceberg/schema.h" diff --git a/src/iceberg/avro/avro_reader.cc b/src/iceberg/avro/avro_reader.cc index 64526123f..80087c8d5 100644 --- a/src/iceberg/avro/avro_reader.cc +++ b/src/iceberg/avro/avro_reader.cc @@ -31,8 +31,8 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_register.h" #include "iceberg/avro/avro_schema_util_internal.h" diff --git a/src/iceberg/avro/avro_writer.cc b/src/iceberg/avro/avro_writer.cc index 1db0614e7..ded0d4810 100644 --- a/src/iceberg/avro/avro_writer.cc +++ b/src/iceberg/avro/avro_writer.cc @@ -29,8 +29,8 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/avro/avro_data_util_internal.h" #include "iceberg/avro/avro_register.h" #include "iceberg/avro/avro_schema_util_internal.h" diff --git a/src/iceberg/file_writer.h b/src/iceberg/file_writer.h index 9ecfb8b5d..fba97d3a2 100644 --- a/src/iceberg/file_writer.h +++ b/src/iceberg/file_writer.h @@ -65,6 +65,7 @@ class ICEBERG_EXPORT Writer { /// \brief Write arrow data to the file. /// /// \return Status of write results. + /// \note Ownership of the data is transferred to the writer. virtual Status Write(ArrowArray* data) = 0; /// \brief Get the file statistics. diff --git a/src/iceberg/manifest_adapter.cc b/src/iceberg/manifest_adapter.cc index 3fe1c3128..bc0f834e9 100644 --- a/src/iceberg/manifest_adapter.cc +++ b/src/iceberg/manifest_adapter.cc @@ -21,70 +21,124 @@ #include -#include "iceberg/arrow/nanoarrow_error_transform_internal.h" +#include "iceberg/arrow/nanoarrow_status_internal.h" #include "iceberg/manifest_entry.h" #include "iceberg/manifest_list.h" +#include "iceberg/result.h" #include "iceberg/schema.h" #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" +namespace iceberg { + namespace { -static constexpr int64_t kBlockSizeInBytesV1 = 64 * 1024 * 1024L; + +constexpr int64_t kBlockSizeInBytesV1 = 64 * 1024 * 1024L; + +Status AppendField(ArrowArray* array, int64_t value) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendInt(array, value)); + return {}; } -namespace iceberg { +Status AppendField(ArrowArray* array, uint64_t value) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendUInt(array, value)); + return {}; +} -Status ManifestAdapter::StartAppending() { - if (size_ > 0) { - return InvalidArgument("Adapter buffer not empty, cannot start appending."); - } - array_ = {}; - size_ = 0; - ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR( - ArrowArrayInitFromSchema(&array_, &schema_, &error), error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayStartAppending(&array_)); +Status AppendField(ArrowArray* array, double value) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendDouble(array, value)); return {}; } -Result ManifestAdapter::FinishAppending() { - ArrowError error; - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR( - ArrowArrayFinishBuildingDefault(&array_, &error), error); - return &array_; +Status AppendField(ArrowArray* array, std::string_view value) { + ArrowStringView view(value.data(), value.size()); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendString(array, view)); + return {}; } -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, int64_t value) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendInt(arrowArray, value)); +Status AppendField(ArrowArray* array, std::span value) { + ArrowBufferViewData data; + data.as_char = reinterpret_cast(value.data()); + ArrowBufferView view(data, value.size()); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendBytes(array, view)); return {}; } -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, uint64_t value) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendUInt(arrowArray, value)); +Status AppendList(ArrowArray* array, const std::vector& list_value) { + auto list_array = array->children[0]; + for (const auto& value : list_value) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED( + ArrowArrayAppendInt(list_array, static_cast(value))); + } + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, double value) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendDouble(arrowArray, value)); +Status AppendList(ArrowArray* array, const std::vector& list_value) { + auto list_array = array->children[0]; + for (const auto& value : list_value) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendInt(list_array, value)); + } + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, std::string_view value) { - ArrowStringView view(value.data(), value.size()); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendString(arrowArray, view)); +Status AppendMap(ArrowArray* array, const std::map& map_value) { + auto map_array = array->children[0]; + if (map_array->n_children != 2) { + return InvalidArrowData("Map array must have exactly 2 children."); + } + for (const auto& [key, value] : map_value) { + auto key_array = map_array->children[0]; + auto value_array = map_array->children[1]; + ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); + ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(map_array)); + } + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } -Status ManifestAdapter::AppendField(ArrowArray* arrowArray, - std::span value) { - ArrowBufferViewData data; - data.as_char = reinterpret_cast(value.data()); - ArrowBufferView view(data, value.size()); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendBytes(arrowArray, view)); +Status AppendMap(ArrowArray* array, + const std::map>& map_value) { + auto map_array = array->children[0]; + if (map_array->n_children != 2) { + return InvalidArrowData("Map array must have exactly 2 children."); + } + for (const auto& [key, value] : map_value) { + auto key_array = map_array->children[0]; + auto value_array = map_array->children[1]; + ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); + ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(map_array)); + } + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); + return {}; +} + +} // namespace + +Status ManifestAdapter::StartAppending() { + if (size_ > 0) { + return InvalidArgument("Adapter buffer not empty, cannot start appending."); + } + array_ = {}; + size_ = 0; + ArrowError error; + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR( + ArrowArrayInitFromSchema(&array_, &schema_, &error), error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayStartAppending(&array_)); return {}; } +Result ManifestAdapter::FinishAppending() { + ArrowError error; + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR( + ArrowArrayFinishBuildingDefault(&array_, &error), error); + return &array_; +} + ManifestEntryAdapter::~ManifestEntryAdapter() { if (array_.release != nullptr) { ArrowArrayRelease(&array_); @@ -94,280 +148,232 @@ ManifestEntryAdapter::~ManifestEntryAdapter() { } } -Result> ManifestEntryAdapter::GetManifestEntryStructType() { +Result> ManifestEntryAdapter::GetManifestEntryType() { if (partition_spec_ == nullptr) [[unlikely]] { - // NOTICE: never reach here by design, partition_spec_ should always not null. return ManifestEntry::TypeFromPartitionType(nullptr); } - ICEBERG_ASSIGN_OR_RAISE(auto partition_schema, partition_spec_->PartitionType()); - return ManifestEntry::TypeFromPartitionType(std::move(partition_schema)); + ICEBERG_ASSIGN_OR_RAISE(auto partition_type, partition_spec_->PartitionType()); + return ManifestEntry::TypeFromPartitionType(std::move(partition_type)); } -Status ManifestEntryAdapter::AppendPartition( - ArrowArray* arrow_array, const std::shared_ptr& partition_type, - const std::vector& partitions) { - if (arrow_array->n_children != partition_type->fields().size()) [[unlikely]] { - return InvalidManifest("Arrow array of partition does not match partition type."); +Status ManifestEntryAdapter::AppendPartitionValues( + ArrowArray* array, const std::shared_ptr& partition_type, + const std::vector& partition_values) { + if (array->n_children != partition_type->fields().size()) [[unlikely]] { + return InvalidArrowData("Arrow array of partition does not match partition type."); } - if (partitions.size() != partition_type->fields().size()) [[unlikely]] { - return InvalidManifest("Literal list of partition does not match partition type."); + if (partition_values.size() != partition_type->fields().size()) [[unlikely]] { + return InvalidArrowData("Literal list of partition does not match partition type."); } auto fields = partition_type->fields(); for (int32_t i = 0; i < fields.size(); i++) { - const auto& partition = partitions[i]; - const auto& field = fields[i]; - auto array = arrow_array->children[i]; - if (partition.IsNull()) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + const auto& partition_value = partition_values[i]; + const auto& partition_field = fields[i]; + auto child_array = array->children[i]; + if (partition_value.IsNull()) { + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); continue; } - switch (field.type()->type_id()) { + switch (partition_field.type()->type_id()) { case TypeId::kBoolean: ICEBERG_RETURN_UNEXPECTED(AppendField( - array, - static_cast(std::get(partition.value()) == true ? 1L : 0L))); + child_array, static_cast( + std::get(partition_value.value()) == true ? 1L : 0L))); break; case TypeId::kInt: ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast(std::get(partition.value())))); + child_array, + static_cast(std::get(partition_value.value())))); break; case TypeId::kLong: ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); + AppendField(child_array, std::get(partition_value.value()))); break; case TypeId::kFloat: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, static_cast(std::get(partition.value())))); + ICEBERG_RETURN_UNEXPECTED(AppendField( + child_array, static_cast(std::get(partition_value.value())))); break; case TypeId::kDouble: ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); + AppendField(child_array, std::get(partition_value.value()))); break; case TypeId::kString: ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); + AppendField(child_array, std::get(partition_value.value()))); break; case TypeId::kFixed: case TypeId::kBinary: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get>(partition.value()))); + ICEBERG_RETURN_UNEXPECTED(AppendField( + child_array, std::get>(partition_value.value()))); break; case TypeId::kDate: ICEBERG_RETURN_UNEXPECTED(AppendField( - array, static_cast(std::get(partition.value())))); + child_array, + static_cast(std::get(partition_value.value())))); break; case TypeId::kTime: case TypeId::kTimestamp: case TypeId::kTimestampTz: ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get(partition.value()))); + AppendField(child_array, std::get(partition_value.value()))); break; case TypeId::kDecimal: - ICEBERG_RETURN_UNEXPECTED( - AppendField(array, std::get>(partition.value()))); + ICEBERG_RETURN_UNEXPECTED(AppendField( + child_array, std::get>(partition_value.value()))); break; case TypeId::kUuid: case TypeId::kStruct: case TypeId::kList: case TypeId::kMap: - // TODO(xiao.dong) currently literal does not support those types + // TODO(xiao.dong) Literals do not currently support these types default: - return InvalidManifest("Unsupported partition type: {}", field.ToString()); + return InvalidManifest("Unsupported partition type: {}", + partition_field.ToString()); } } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); - return {}; -} - -Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, - const std::vector& list_value) { - auto list_array = arrow_array->children[0]; - for (const auto& value : list_value) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK( - ArrowArrayAppendInt(list_array, static_cast(value))); - } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); - return {}; -} - -Status ManifestEntryAdapter::AppendList(ArrowArray* arrow_array, - const std::vector& list_value) { - auto list_array = arrow_array->children[0]; - for (const auto& value : list_value) { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendInt(list_array, value)); - } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); - return {}; -} - -Status ManifestEntryAdapter::AppendMap(ArrowArray* arrow_array, - const std::map& map_value) { - auto map_array = arrow_array->children[0]; - if (map_array->n_children != 2) { - return InvalidManifest("Invalid map array."); - } - for (const auto& [key, value] : map_value) { - auto key_array = map_array->children[0]; - auto value_array = map_array->children[1]; - ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); - ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(map_array)); - } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); - return {}; -} - -Status ManifestEntryAdapter::AppendMap( - ArrowArray* arrow_array, const std::map>& map_value) { - auto map_array = arrow_array->children[0]; - if (map_array->n_children != 2) { - return InvalidManifest("Invalid map array."); - } - for (const auto& [key, value] : map_value) { - auto key_array = map_array->children[0]; - auto value_array = map_array->children[1]; - ICEBERG_RETURN_UNEXPECTED(AppendField(key_array, static_cast(key))); - ICEBERG_RETURN_UNEXPECTED(AppendField(value_array, value)); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(map_array)); - } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } Status ManifestEntryAdapter::AppendDataFile( - ArrowArray* arrow_array, const std::shared_ptr& data_file_type, + ArrowArray* array, const std::shared_ptr& data_file_type, const DataFile& file) { auto fields = data_file_type->fields(); for (int32_t i = 0; i < fields.size(); i++) { const auto& field = fields[i]; - auto array = arrow_array->children[i]; + auto child_array = array->children[i]; switch (field.field_id()) { case 134: // content (optional int32) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, static_cast(file.content))); + ICEBERG_RETURN_UNEXPECTED( + AppendField(child_array, static_cast(file.content))); break; case 100: // file_path (required string) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.file_path)); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, file.file_path)); break; case 101: // file_format (required string) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, ToString(file.file_format))); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, ToString(file.file_format))); break; - case 102: // partition (required struct) - { + case 102: { + // partition (required struct) auto partition_type = internal::checked_pointer_cast(field.type()); - ICEBERG_RETURN_UNEXPECTED(AppendPartition(array, partition_type, file.partition)); + ICEBERG_RETURN_UNEXPECTED( + AppendPartitionValues(child_array, partition_type, file.partition)); } break; case 103: // record_count (required int64) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.record_count)); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, file.record_count)); break; case 104: // file_size_in_bytes (required int64) - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.file_size_in_bytes)); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, file.file_size_in_bytes)); break; case 105: // block_size_in_bytes (compatible in v1) // always 64MB for v1 - ICEBERG_RETURN_UNEXPECTED(AppendField(array, kBlockSizeInBytesV1)); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, kBlockSizeInBytesV1)); break; case 108: // column_sizes (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.column_sizes)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.column_sizes)); break; case 109: // value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.value_counts)); break; case 110: // null_value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.null_value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.null_value_counts)); break; case 137: // nan_value_counts (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.nan_value_counts)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.nan_value_counts)); break; case 125: // lower_bounds (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.lower_bounds)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.lower_bounds)); break; case 128: // upper_bounds (optional map) - ICEBERG_RETURN_UNEXPECTED(AppendMap(array, file.upper_bounds)); + ICEBERG_RETURN_UNEXPECTED(AppendMap(child_array, file.upper_bounds)); break; case 131: // key_metadata (optional binary) if (!file.key_metadata.empty()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, file.key_metadata)); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; case 132: // split_offsets (optional list) - ICEBERG_RETURN_UNEXPECTED(AppendList(array, file.split_offsets)); + ICEBERG_RETURN_UNEXPECTED(AppendList(child_array, file.split_offsets)); break; case 135: // equality_ids (optional list) - ICEBERG_RETURN_UNEXPECTED(AppendList(array, file.equality_ids)); + ICEBERG_RETURN_UNEXPECTED(AppendList(child_array, file.equality_ids)); break; case 140: // sort_order_id (optional int32) if (file.sort_order_id.has_value()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(array, static_cast(file.sort_order_id.value()))); + AppendField(child_array, static_cast(file.sort_order_id.value()))); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; case 142: // first_row_id (optional int64) if (file.first_row_id.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.first_row_id.value())); + ICEBERG_RETURN_UNEXPECTED(AppendField(child_array, file.first_row_id.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; - case 143: // referenced_data_file (optional string) - { + case 143: { + // referenced_data_file (optional string) ICEBERG_ASSIGN_OR_RAISE(auto referenced_data_file, GetReferenceDataFile(file)); if (referenced_data_file.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, referenced_data_file.value())); + ICEBERG_RETURN_UNEXPECTED( + AppendField(child_array, referenced_data_file.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; } case 144: // content_offset (optional int64) if (file.content_offset.has_value()) { - ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.content_offset.value())); + ICEBERG_RETURN_UNEXPECTED( + AppendField(child_array, file.content_offset.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; case 145: // content_size_in_bytes (optional int64) if (file.content_size_in_bytes.has_value()) { ICEBERG_RETURN_UNEXPECTED( - AppendField(array, file.content_size_in_bytes.value())); + AppendField(child_array, file.content_size_in_bytes.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(child_array, 1)); } break; default: return InvalidManifest("Unknown data file field id: {} ", field.field_id()); } } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } Result> ManifestEntryAdapter::GetSequenceNumber( - const ManifestEntry& entry) { + const ManifestEntry& entry) const { return entry.sequence_number; } Result> ManifestEntryAdapter::GetReferenceDataFile( - const DataFile& file) { + const DataFile& file) const { return file.referenced_data_file; } -Result> ManifestEntryAdapter::GetFirstRowId(const DataFile& file) { +Result> ManifestEntryAdapter::GetFirstRowId( + const DataFile& file) const { return file.first_row_id; } Result> ManifestEntryAdapter::GetContentOffset( - const DataFile& file) { + const DataFile& file) const { return file.content_offset; } Result> ManifestEntryAdapter::GetContentSizeInBytes( - const DataFile& file) { + const DataFile& file) const { return file.content_size_in_bytes; } @@ -386,7 +392,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { if (entry.snapshot_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, entry.snapshot_id.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 2: // data_file (required struct) @@ -396,16 +402,16 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { ICEBERG_RETURN_UNEXPECTED( AppendDataFile(array, data_file_type, *entry.data_file)); } else { - return InvalidManifest("Missing required data file field."); + return InvalidManifest("Missing required data_file field from manifest entry."); } break; - case 3: // sequence_number (optional int64) - { + case 3: { + // sequence_number (optional int64) ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetSequenceNumber(entry)); if (sequence_num.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; } @@ -414,7 +420,7 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { ICEBERG_RETURN_UNEXPECTED( AppendField(array, entry.file_sequence_number.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; default: @@ -422,17 +428,17 @@ Status ManifestEntryAdapter::AppendInternal(const ManifestEntry& entry) { } } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(&array_)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(&array_)); size_++; return {}; } Status ManifestEntryAdapter::InitSchema(const std::unordered_set& fields_ids) { - ICEBERG_ASSIGN_OR_RAISE(auto manifest_entry_schema, GetManifestEntryStructType()) - auto fields_span = manifest_entry_schema->fields(); + ICEBERG_ASSIGN_OR_RAISE(auto manifest_entry_type, GetManifestEntryType()) + auto fields_span = manifest_entry_type->fields(); std::vector fields; - // TODO(xiao.dong) make this a common function to recursive handle - // all nested fields in schema + // TODO(xiao.dong) Make this a common function to recursively handle + // all nested fields in the schema for (const auto& field : fields_span) { if (field.field_id() == 2) { // handle data_file field @@ -468,11 +474,11 @@ ManifestFileAdapter::~ManifestFileAdapter() { } Status ManifestFileAdapter::AppendPartitionSummary( - ArrowArray* arrow_array, const std::shared_ptr& summary_type, + ArrowArray* array, const std::shared_ptr& summary_type, const std::vector& summaries) { - auto& array = arrow_array->children[0]; - if (array->n_children != 4) { - return InvalidManifestList("Invalid partition array."); + auto& summary_array = array->children[0]; + if (summary_array->n_children != 4) { + return InvalidManifestList("Invalid partition summary array."); } auto summary_struct = internal::checked_pointer_cast(summary_type->fields()[0].type()); @@ -481,40 +487,41 @@ Status ManifestFileAdapter::AppendPartitionSummary( for (const auto& summary_field : summary_fields) { switch (summary_field.field_id()) { case 509: // contains_null (required bool) - ICEBERG_RETURN_UNEXPECTED(AppendField( - array->children[0], static_cast(summary.contains_null ? 1 : 0))); + ICEBERG_RETURN_UNEXPECTED( + AppendField(summary_array->children[0], + static_cast(summary.contains_null ? 1 : 0))); break; - case 518: // contains_nan (optional bool) - { - auto field_array = array->children[1]; + case 518: { + // contains_nan (optional bool) + auto field_array = summary_array->children[1]; if (summary.contains_nan.has_value()) { ICEBERG_RETURN_UNEXPECTED( AppendField(field_array, static_cast(summary.contains_nan.value() ? 1 : 0))); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(field_array, 1)); } break; } - case 510: // lower_bound (optional binary) - { - auto field_array = array->children[2]; + case 510: { + // lower_bound (optional binary) + auto field_array = summary_array->children[2]; if (summary.lower_bound.has_value() && !summary.lower_bound->empty()) { ICEBERG_RETURN_UNEXPECTED( AppendField(field_array, summary.lower_bound.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(field_array, 1)); } break; } - case 511: // upper_bound (optional binary) - { - auto field_array = array->children[3]; + case 511: { + // upper_bound (optional binary) + auto field_array = summary_array->children[3]; if (summary.upper_bound.has_value() && !summary.upper_bound->empty()) { ICEBERG_RETURN_UNEXPECTED( AppendField(field_array, summary.upper_bound.value())); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(field_array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(field_array, 1)); } break; } @@ -522,23 +529,24 @@ Status ManifestFileAdapter::AppendPartitionSummary( return InvalidManifestList("Unknown field id: {}", summary_field.field_id()); } } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(array)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(summary_array)); } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(arrow_array)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(array)); return {}; } -Result ManifestFileAdapter::GetSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapter::GetSequenceNumber(const ManifestFile& file) const { return file.sequence_number; } -Result ManifestFileAdapter::GetMinSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapter::GetMinSequenceNumber( + const ManifestFile& file) const { return file.min_sequence_number; } Result> ManifestFileAdapter::GetFirstRowId( - const ManifestFile& file) { + const ManifestFile& file) const { return file.first_row_id; } @@ -562,14 +570,14 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED( AppendField(array, static_cast(static_cast(file.content)))); break; - case 515: // sequence_number - { + case 515: { + // sequence_number ICEBERG_ASSIGN_OR_RAISE(auto sequence_num, GetSequenceNumber(file)); ICEBERG_RETURN_UNEXPECTED(AppendField(array, sequence_num)); break; } - case 516: // min_sequence_number - { + case 516: { + // min_sequence_number ICEBERG_ASSIGN_OR_RAISE(auto min_sequence_num, GetMinSequenceNumber(file)); ICEBERG_RETURN_UNEXPECTED(AppendField(array, min_sequence_num)); break; @@ -583,7 +591,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { AppendField(array, static_cast(file.added_files_count.value()))); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 505: // existing_files_count @@ -592,7 +600,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { array, static_cast(file.existing_files_count.value()))); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 506: // deleted_files_count @@ -601,7 +609,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { AppendField(array, static_cast(file.deleted_files_count.value()))); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 512: // added_rows_count @@ -609,7 +617,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.added_rows_count.value())); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 513: // existing_rows_count @@ -617,7 +625,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.existing_rows_count.value())); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 514: // deleted_rows_count @@ -625,7 +633,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.deleted_rows_count.value())); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; case 507: // partitions @@ -637,17 +645,17 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { if (!file.key_metadata.empty()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, file.key_metadata)); } else { - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; - case 520: // first_row_id - { + case 520: { + // first_row_id ICEBERG_ASSIGN_OR_RAISE(auto first_row_id, GetFirstRowId(file)); if (first_row_id.has_value()) { ICEBERG_RETURN_UNEXPECTED(AppendField(array, first_row_id.value())); } else { // Append null for optional field - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayAppendNull(array, 1)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayAppendNull(array, 1)); } break; } @@ -655,7 +663,7 @@ Status ManifestFileAdapter::AppendInternal(const ManifestFile& file) { return InvalidManifestList("Unknown field id: {}", field.field_id()); } } - ICEBERG_NANOARROW_RETURN_IF_NOT_OK(ArrowArrayFinishElement(&array_)); + ICEBERG_NANOARROW_RETURN_UNEXPECTED(ArrowArrayFinishElement(&array_)); size_++; return {}; } diff --git a/src/iceberg/manifest_adapter.h b/src/iceberg/manifest_adapter.h index cec6a0b41..7c2a8bbec 100644 --- a/src/iceberg/manifest_adapter.h +++ b/src/iceberg/manifest_adapter.h @@ -19,13 +19,11 @@ #pragma once -/// \file iceberg/metadata_adapter.h -/// Base class of adapter for v1v2v3v4 metadata. +/// \file iceberg/manifest_adapter.h +/// Base class for adapters handling v1/v2/v3/v4 manifest metadata. -#include #include #include -#include #include #include #include @@ -36,7 +34,7 @@ namespace iceberg { -// \brief Base class to append manifest metadata to Arrow array. +/// \brief Base class for appending manifest metadata to Arrow arrays. class ICEBERG_EXPORT ManifestAdapter { public: ManifestAdapter() = default; @@ -47,22 +45,17 @@ class ICEBERG_EXPORT ManifestAdapter { Result FinishAppending(); int64_t size() const { return size_; } - protected: - static Status AppendField(ArrowArray* arrowArray, int64_t value); - static Status AppendField(ArrowArray* arrowArray, uint64_t value); - static Status AppendField(ArrowArray* arrowArray, double value); - static Status AppendField(ArrowArray* arrowArray, std::string_view value); - static Status AppendField(ArrowArray* arrowArray, std::span value); - protected: ArrowArray array_; - ArrowSchema schema_; // converted from manifest_schema_ or manifest_list_schema_ + // Arrow schema of manifest or manifest list depending on the subclass + ArrowSchema schema_; + // Number of appended elements in the array int64_t size_ = 0; std::unordered_map metadata_; }; -// \brief Implemented by different versions with different schemas to -// append a list of `ManifestEntry`s to an `ArrowArray`. +/// \brief Adapter for appending a list of `ManifestEntry`s to an `ArrowArray`. +/// Implemented by different versions with version-specific schemas. class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { public: explicit ManifestEntryAdapter(std::shared_ptr partition_spec) @@ -74,42 +67,37 @@ class ICEBERG_EXPORT ManifestEntryAdapter : public ManifestAdapter { const std::shared_ptr& schema() const { return manifest_schema_; } protected: - virtual Result> GetManifestEntryStructType(); + virtual Result> GetManifestEntryType(); - /// \brief Init version-specific schema for each version. + /// \brief Initialize version-specific schema. /// - /// \param fields_ids each version of manifest schema has schema, we will init this - /// schema based on the fields_ids. + /// \param fields_ids Field IDs to include in the manifest schema. The schema will be + /// initialized to include only the fields with these IDs. Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestEntry& entry); - Status AppendDataFile(ArrowArray* arrow_array, + Status AppendDataFile(ArrowArray* array, const std::shared_ptr& data_file_type, const DataFile& file); - static Status AppendPartition(ArrowArray* arrow_array, - const std::shared_ptr& partition_type, - const std::vector& partitions); - static Status AppendList(ArrowArray* arrow_array, - const std::vector& list_value); - static Status AppendList(ArrowArray* arrow_array, - const std::vector& list_value); - static Status AppendMap(ArrowArray* arrow_array, - const std::map& map_value); - static Status AppendMap(ArrowArray* arrow_array, - const std::map>& map_value); - - virtual Result> GetSequenceNumber(const ManifestEntry& entry); - virtual Result> GetReferenceDataFile(const DataFile& file); - virtual Result> GetFirstRowId(const DataFile& file); - virtual Result> GetContentOffset(const DataFile& file); - virtual Result> GetContentSizeInBytes(const DataFile& file); + static Status AppendPartitionValues(ArrowArray* array, + const std::shared_ptr& partition_type, + const std::vector& partition_values); + + virtual Result> GetSequenceNumber( + const ManifestEntry& entry) const; + virtual Result> GetReferenceDataFile( + const DataFile& file) const; + virtual Result> GetFirstRowId(const DataFile& file) const; + virtual Result> GetContentOffset(const DataFile& file) const; + virtual Result> GetContentSizeInBytes( + const DataFile& file) const; protected: std::shared_ptr partition_spec_; std::shared_ptr manifest_schema_; }; -// \brief Implemented by different versions with different schemas to -// append a list of `ManifestFile`s to an `ArrowArray`. +/// \brief Adapter for appending a list of `ManifestFile`s to an `ArrowArray`. +/// Implemented by different versions with version-specific schemas. class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { public: ManifestFileAdapter() = default; @@ -120,19 +108,19 @@ class ICEBERG_EXPORT ManifestFileAdapter : public ManifestAdapter { const std::shared_ptr& schema() const { return manifest_list_schema_; } protected: - /// \brief Init version-specific schema for each version. + /// \brief Initialize version-specific schema. /// - /// \param fields_ids each version of manifest schema has schema, we will init this - /// schema based on the fields_ids. + /// \param fields_ids Field IDs to include in the manifest list schema. The schema will + /// be initialized to include only the fields with these IDs. Status InitSchema(const std::unordered_set& fields_ids); Status AppendInternal(const ManifestFile& file); static Status AppendPartitionSummary( - ArrowArray* arrow_array, const std::shared_ptr& summary_type, + ArrowArray* array, const std::shared_ptr& summary_type, const std::vector& summaries); - virtual Result GetSequenceNumber(const ManifestFile& file); - virtual Result GetMinSequenceNumber(const ManifestFile& file); - virtual Result> GetFirstRowId(const ManifestFile& file); + virtual Result GetSequenceNumber(const ManifestFile& file) const; + virtual Result GetMinSequenceNumber(const ManifestFile& file) const; + virtual Result> GetFirstRowId(const ManifestFile& file) const; protected: std::shared_ptr manifest_list_schema_; diff --git a/src/iceberg/manifest_reader_internal.cc b/src/iceberg/manifest_reader_internal.cc index 4b20af9c3..fece00734 100644 --- a/src/iceberg/manifest_reader_internal.cc +++ b/src/iceberg/manifest_reader_internal.cc @@ -21,7 +21,7 @@ #include -#include "iceberg/arrow/nanoarrow_error_transform_internal.h" +#include "iceberg/arrow/nanoarrow_status_internal.h" #include "iceberg/arrow_c_data_guard_internal.h" #include "iceberg/file_format.h" #include "iceberg/manifest_entry.h" @@ -204,12 +204,12 @@ Result> ParseManifestList(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); std::vector manifest_files; manifest_files.resize(array_in->length); @@ -467,12 +467,12 @@ Result> ParseManifestEntry(ArrowSchema* schema, ArrowError error; ArrowArrayView array_view; auto status = ArrowArrayViewInitFromSchema(&array_view, schema, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); internal::ArrowArrayViewGuard view_guard(&array_view); status = ArrowArrayViewSetArray(&array_view, array_in, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); status = ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error); - ICEBERG_NANOARROW_RETURN_IF_NOT_OK_WITH_ERROR(status, error); + ICEBERG_NANOARROW_RETURN_UNEXPECTED_WITH_ERROR(status, error); std::vector manifest_entries; manifest_entries.resize(array_in->length); diff --git a/src/iceberg/parquet/parquet_data_util.cc b/src/iceberg/parquet/parquet_data_util.cc index c822daa5b..b503b944d 100644 --- a/src/iceberg/parquet/parquet_data_util.cc +++ b/src/iceberg/parquet/parquet_data_util.cc @@ -23,7 +23,7 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/parquet/parquet_data_util_internal.h" #include "iceberg/schema.h" #include "iceberg/schema_util.h" diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index e57b98e87..64373d044 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -32,8 +32,8 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/parquet/parquet_data_util_internal.h" #include "iceberg/parquet/parquet_register.h" #include "iceberg/parquet/parquet_schema_util_internal.h" diff --git a/src/iceberg/parquet/parquet_writer.cc b/src/iceberg/parquet/parquet_writer.cc index ed0bb8fd7..61f46711b 100644 --- a/src/iceberg/parquet/parquet_writer.cc +++ b/src/iceberg/parquet/parquet_writer.cc @@ -29,8 +29,8 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/schema_internal.h" #include "iceberg/util/checked_cast.h" #include "iceberg/util/macros.h" diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index d4d4ea7f4..554692264 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -64,11 +64,14 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { /// \brief Get the table schema const std::shared_ptr& schema() const; + /// \brief Get the spec ID. int32_t spec_id() const; - /// \brief Get a view of the partition fields. + + /// \brief Get a list view of the partition fields. std::span fields() const; + /// \brief Get the partition type. Result> PartitionType(); std::string ToString() const override; @@ -81,12 +84,14 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { private: /// \brief Compare two partition specs for equality. - [[nodiscard]] bool Equals(const PartitionSpec& other) const; + bool Equals(const PartitionSpec& other) const; std::shared_ptr schema_; const int32_t spec_id_; std::vector fields_; int32_t last_assigned_field_id_; + + // FIXME: use similar lazy initialization pattern as in StructType std::mutex mutex_; std::shared_ptr partition_type_; }; diff --git a/src/iceberg/test/parquet_test.cc b/src/iceberg/test/parquet_test.cc index 755817567..c6df838a5 100644 --- a/src/iceberg/test/parquet_test.cc +++ b/src/iceberg/test/parquet_test.cc @@ -30,8 +30,8 @@ #include #include -#include "iceberg/arrow/arrow_error_transform_internal.h" #include "iceberg/arrow/arrow_fs_file_io_internal.h" +#include "iceberg/arrow/arrow_status_internal.h" #include "iceberg/file_reader.h" #include "iceberg/file_writer.h" #include "iceberg/parquet/parquet_register.h" diff --git a/src/iceberg/v1_metadata.cc b/src/iceberg/v1_metadata.cc index f8b022767..ba381f836 100644 --- a/src/iceberg/v1_metadata.cc +++ b/src/iceberg/v1_metadata.cc @@ -62,7 +62,7 @@ Status ManifestEntryAdapterV1::Append(const ManifestEntry& entry) { return AppendInternal(entry); } -Result> ManifestEntryAdapterV1::GetManifestEntryStructType() { +Result> ManifestEntryAdapterV1::GetManifestEntryType() { // 'block_size_in_bytes' (ID 105) is a deprecated field that is REQUIRED // in the v1 data_file schema for backward compatibility. // Deprecated. Always write a default in v1. Do not write in v2 or v3. diff --git a/src/iceberg/v1_metadata.h b/src/iceberg/v1_metadata.h index 7b2dd1d05..3c095a9e5 100644 --- a/src/iceberg/v1_metadata.h +++ b/src/iceberg/v1_metadata.h @@ -34,7 +34,7 @@ class ManifestEntryAdapterV1 : public ManifestEntryAdapter { Status Append(const ManifestEntry& entry) override; protected: - Result> GetManifestEntryStructType() override; + Result> GetManifestEntryType() override; private: std::optional snapshot_id_; diff --git a/src/iceberg/v2_metadata.cc b/src/iceberg/v2_metadata.cc index c0de24ef1..e2bbb91b6 100644 --- a/src/iceberg/v2_metadata.cc +++ b/src/iceberg/v2_metadata.cc @@ -67,7 +67,7 @@ Status ManifestEntryAdapterV2::Append(const ManifestEntry& entry) { } Result> ManifestEntryAdapterV2::GetSequenceNumber( - const ManifestEntry& entry) { + const ManifestEntry& entry) const { if (!entry.sequence_number.has_value()) { // if the entry's data sequence number is null, // then it will inherit the sequence number of the current commit. @@ -91,7 +91,7 @@ Result> ManifestEntryAdapterV2::GetSequenceNumber( } Result> ManifestEntryAdapterV2::GetReferenceDataFile( - const DataFile& file) { + const DataFile& file) const { if (file.content == DataFile::Content::kPositionDeletes) { return file.referenced_data_file; } @@ -129,7 +129,7 @@ Status ManifestFileAdapterV2::Append(const ManifestFile& file) { return AppendInternal(file); } -Result ManifestFileAdapterV2::GetSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapterV2::GetSequenceNumber(const ManifestFile& file) const { if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -141,7 +141,8 @@ Result ManifestFileAdapterV2::GetSequenceNumber(const ManifestFile& fil return file.sequence_number; } -Result ManifestFileAdapterV2::GetMinSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapterV2::GetMinSequenceNumber( + const ManifestFile& file) const { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( diff --git a/src/iceberg/v2_metadata.h b/src/iceberg/v2_metadata.h index caee8c428..164a497a8 100644 --- a/src/iceberg/v2_metadata.h +++ b/src/iceberg/v2_metadata.h @@ -35,8 +35,10 @@ class ManifestEntryAdapterV2 : public ManifestEntryAdapter { Status Append(const ManifestEntry& entry) override; protected: - Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetReferenceDataFile(const DataFile& file) override; + Result> GetSequenceNumber( + const ManifestEntry& entry) const override; + Result> GetReferenceDataFile( + const DataFile& file) const override; private: std::optional snapshot_id_; @@ -54,8 +56,8 @@ class ManifestFileAdapterV2 : public ManifestFileAdapter { Status Append(const ManifestFile& file) override; protected: - Result GetSequenceNumber(const ManifestFile& file) override; - Result GetMinSequenceNumber(const ManifestFile& file) override; + Result GetSequenceNumber(const ManifestFile& file) const override; + Result GetMinSequenceNumber(const ManifestFile& file) const override; private: int64_t snapshot_id_; diff --git a/src/iceberg/v3_metadata.cc b/src/iceberg/v3_metadata.cc index 25399e02a..e4605987b 100644 --- a/src/iceberg/v3_metadata.cc +++ b/src/iceberg/v3_metadata.cc @@ -71,7 +71,7 @@ Status ManifestEntryAdapterV3::Append(const ManifestEntry& entry) { } Result> ManifestEntryAdapterV3::GetSequenceNumber( - const ManifestEntry& entry) { + const ManifestEntry& entry) const { if (!entry.sequence_number.has_value()) { // if the entry's data sequence number is null, // then it will inherit the sequence number of the current commit. @@ -95,7 +95,7 @@ Result> ManifestEntryAdapterV3::GetSequenceNumber( } Result> ManifestEntryAdapterV3::GetReferenceDataFile( - const DataFile& file) { + const DataFile& file) const { if (file.content == DataFile::Content::kPositionDeletes) { return file.referenced_data_file; } @@ -103,7 +103,7 @@ Result> ManifestEntryAdapterV3::GetReferenceDataFile( } Result> ManifestEntryAdapterV3::GetFirstRowId( - const DataFile& file) { + const DataFile& file) const { if (file.content == DataFile::Content::kData) { return file.first_row_id; } @@ -111,7 +111,7 @@ Result> ManifestEntryAdapterV3::GetFirstRowId( } Result> ManifestEntryAdapterV3::GetContentOffset( - const DataFile& file) { + const DataFile& file) const { if (file.content == DataFile::Content::kPositionDeletes) { return file.content_offset; } @@ -119,7 +119,7 @@ Result> ManifestEntryAdapterV3::GetContentOffset( } Result> ManifestEntryAdapterV3::GetContentSizeInBytes( - const DataFile& file) { + const DataFile& file) const { if (file.content == DataFile::Content::kPositionDeletes) { return file.content_size_in_bytes; } @@ -166,7 +166,7 @@ Status ManifestFileAdapterV3::Append(const ManifestFile& file) { return status; } -Result ManifestFileAdapterV3::GetSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapterV3::GetSequenceNumber(const ManifestFile& file) const { if (file.sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -178,7 +178,8 @@ Result ManifestFileAdapterV3::GetSequenceNumber(const ManifestFile& fil return file.sequence_number; } -Result ManifestFileAdapterV3::GetMinSequenceNumber(const ManifestFile& file) { +Result ManifestFileAdapterV3::GetMinSequenceNumber( + const ManifestFile& file) const { if (file.min_sequence_number == TableMetadata::kInvalidSequenceNumber) { if (snapshot_id_ != file.added_snapshot_id) { return InvalidManifestList( @@ -191,7 +192,7 @@ Result ManifestFileAdapterV3::GetMinSequenceNumber(const ManifestFile& } Result> ManifestFileAdapterV3::GetFirstRowId( - const ManifestFile& file) { + const ManifestFile& file) const { if (WrappedFirstRowId(file)) { return next_row_id_; } else if (file.content != ManifestFile::Content::kData) { @@ -205,7 +206,7 @@ Result> ManifestFileAdapterV3::GetFirstRowId( } } -bool ManifestFileAdapterV3::WrappedFirstRowId(const ManifestFile& file) { +bool ManifestFileAdapterV3::WrappedFirstRowId(const ManifestFile& file) const { return file.content == ManifestFile::Content::kData && !file.first_row_id.has_value(); } diff --git a/src/iceberg/v3_metadata.h b/src/iceberg/v3_metadata.h index 784345f1d..a1076105b 100644 --- a/src/iceberg/v3_metadata.h +++ b/src/iceberg/v3_metadata.h @@ -38,11 +38,14 @@ class ManifestEntryAdapterV3 : public ManifestEntryAdapter { Status Append(const ManifestEntry& entry) override; protected: - Result> GetSequenceNumber(const ManifestEntry& entry) override; - Result> GetReferenceDataFile(const DataFile& file) override; - Result> GetFirstRowId(const DataFile& file) override; - Result> GetContentOffset(const DataFile& file) override; - Result> GetContentSizeInBytes(const DataFile& file) override; + Result> GetSequenceNumber( + const ManifestEntry& entry) const override; + Result> GetReferenceDataFile( + const DataFile& file) const override; + Result> GetFirstRowId(const DataFile& file) const override; + Result> GetContentOffset(const DataFile& file) const override; + Result> GetContentSizeInBytes( + const DataFile& file) const override; private: std::optional snapshot_id_; @@ -62,12 +65,12 @@ class ManifestFileAdapterV3 : public ManifestFileAdapter { Status Append(const ManifestFile& file) override; protected: - Result GetSequenceNumber(const ManifestFile& file) override; - Result GetMinSequenceNumber(const ManifestFile& file) override; - Result> GetFirstRowId(const ManifestFile& file) override; + Result GetSequenceNumber(const ManifestFile& file) const override; + Result GetMinSequenceNumber(const ManifestFile& file) const override; + Result> GetFirstRowId(const ManifestFile& file) const override; private: - bool WrappedFirstRowId(const ManifestFile& file); + bool WrappedFirstRowId(const ManifestFile& file) const; private: int64_t snapshot_id_; From 6b31e28a654c7328655bf105fcaea7209f44318f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 16 Oct 2025 22:56:46 +0800 Subject: [PATCH 17/17] update meson --- src/iceberg/meson.build | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build index 318c05523..df64ae023 100644 --- a/src/iceberg/meson.build +++ b/src/iceberg/meson.build @@ -51,6 +51,7 @@ iceberg_sources = files( 'file_writer.cc', 'inheritable_metadata.cc', 'json_internal.cc', + 'manifest_adapter.cc', 'manifest_entry.cc', 'manifest_list.cc', 'manifest_reader.cc', @@ -83,6 +84,9 @@ iceberg_sources = files( 'util/murmurhash3_internal.cc', 'util/timepoint.cc', 'util/uuid.cc', + 'v1_metadata.cc', + 'v2_metadata.cc', + 'v3_metadata.cc', ) # CRoaring does not export symbols, so on Windows it must