From 9c096d6e6bd3bdf21a3e9c67baeee95dd5cedd8a Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 18 Apr 2025 23:19:06 +0800 Subject: [PATCH 1/4] test: add table metadata deserialization test --- .github/.licenserc.yaml | 1 + .pre-commit-config.yaml | 1 + src/iceberg/constants.h | 38 ++++ src/iceberg/json_internal.cc | 92 +++++--- src/iceberg/json_internal.h | 4 +- src/iceberg/partition_spec.cc | 30 ++- src/iceberg/partition_spec.h | 22 +- src/iceberg/schema.cc | 4 +- src/iceberg/schema.h | 7 +- src/iceberg/schema_internal.cc | 7 +- src/iceberg/schema_internal.h | 6 +- src/iceberg/sort_order.cc | 11 +- src/iceberg/sort_order.h | 9 +- src/iceberg/table_metadata.cc | 31 +++ src/iceberg/table_metadata.h | 20 +- test/CMakeLists.txt | 17 +- test/arrow_test.cc | 18 +- test/json_internal_test.cc | 7 +- test/metadata_serde_test.cc | 206 ++++++++++++++++++ test/partition_spec_test.cc | 7 +- ...TableMetadataPartitionStatisticsFiles.json | 61 ++++++ .../TableMetadataStatisticsFiles.json | 70 ++++++ .../TableMetadataUnsupportedVersion.json | 36 +++ .../TableMetadataV1MissingSchemaType.json | 41 ++++ test/resources/TableMetadataV1Valid.json | 42 ++++ .../TableMetadataV2CurrentSchemaNotFound.json | 87 ++++++++ ...TableMetadataV2MissingLastPartitionId.json | 73 +++++++ .../TableMetadataV2MissingPartitionSpecs.json | 67 ++++++ .../TableMetadataV2MissingSchemas.json | 71 ++++++ .../TableMetadataV2MissingSortOrder.json | 54 +++++ test/resources/TableMetadataV2Valid.json | 122 +++++++++++ .../TableMetadataV2ValidMinimal.json | 71 ++++++ .../TableMetadataV3ValidMinimal.json | 73 +++++++ test/schema_test.cc | 14 +- test/test_config.h.in | 22 ++ 35 files changed, 1355 insertions(+), 87 deletions(-) create mode 100644 src/iceberg/constants.h create mode 100644 test/metadata_serde_test.cc create mode 100644 test/resources/TableMetadataPartitionStatisticsFiles.json create mode 100644 test/resources/TableMetadataStatisticsFiles.json create mode 100644 test/resources/TableMetadataUnsupportedVersion.json create mode 100644 test/resources/TableMetadataV1MissingSchemaType.json create mode 100644 test/resources/TableMetadataV1Valid.json create mode 100644 test/resources/TableMetadataV2CurrentSchemaNotFound.json create mode 100644 test/resources/TableMetadataV2MissingLastPartitionId.json create mode 100644 test/resources/TableMetadataV2MissingPartitionSpecs.json create mode 100644 test/resources/TableMetadataV2MissingSchemas.json create mode 100644 test/resources/TableMetadataV2MissingSortOrder.json create mode 100644 test/resources/TableMetadataV2Valid.json create mode 100644 test/resources/TableMetadataV2ValidMinimal.json create mode 100644 test/resources/TableMetadataV3ValidMinimal.json create mode 100644 test/test_config.h.in diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml index 8b2055400..72f329ebb 100644 --- a/.github/.licenserc.yaml +++ b/.github/.licenserc.yaml @@ -13,5 +13,6 @@ header: - 'NOTICE' - 'src/iceberg/expected.h' - 'src/iceberg/util/murmurhash3_internal.*' + - 'test/resources/**' comment: on-failure diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94e2fa268..52c8fe53f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,6 +33,7 @@ repos: rev: v19.1.5 hooks: - id: clang-format + exclude: ^test/resources/.*\.json$ - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.10 diff --git a/src/iceberg/constants.h b/src/iceberg/constants.h new file mode 100644 index 000000000..4534ad84f --- /dev/null +++ b/src/iceberg/constants.h @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +namespace iceberg { + +constexpr int8_t kDefaultTableFormatVersion = 2; +constexpr int8_t kSupportedTableFormatVersion = 3; +constexpr int8_t kMinFormatVersionRowLineage = 3; +constexpr int32_t kInitialSpecId = 0; +constexpr int32_t kInitialSortOrderId = 1; +constexpr int32_t kInitialSchemaId = 0; +constexpr int64_t kInitialRowId = 0; +constexpr int64_t kInitialSequenceNumber = 0; +constexpr int64_t kInvalidSequenceNumber = -1; +constexpr int64_t kInvalidSnapshotId = -1; +constexpr int32_t kInvalidFieldId = -1; + +} // namespace iceberg diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 5fd76c60d..600506db0 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -22,12 +22,15 @@ #include #include #include +#include #include #include #include #include +#include "iceberg/constants.h" +#include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" @@ -248,7 +251,7 @@ Result> FromJsonList( list.emplace_back(std::move(entry)); } } - return {}; + return list; } /// \brief Parse a list of items from a JSON object. @@ -471,7 +474,7 @@ nlohmann::json ToJson(const Type& type) { nlohmann::json ToJson(const Schema& schema) { nlohmann::json json = ToJson(static_cast(schema)); - json[kSchemaId] = schema.schema_id(); + SetOptionalField(json, kSchemaId, schema.schema_id()); // TODO(gangwu): add identifier-field-ids. return json; } @@ -496,7 +499,7 @@ nlohmann::json ToJson(const Snapshot& snapshot) { nlohmann::json json; json[kSnapshotId] = snapshot.snapshot_id; SetOptionalField(json, kParentSnapshotId, snapshot.parent_snapshot_id); - if (snapshot.sequence_number > TableMetadata::kInitialSequenceNumber) { + if (snapshot.sequence_number > kInitialSequenceNumber) { json[kSequenceNumber] = snapshot.sequence_number; } json[kTimestampMs] = snapshot.timestamp_ms; @@ -625,7 +628,7 @@ Result> FieldFromJson(const nlohmann::json& json) { } Result> SchemaFromJson(const nlohmann::json& json) { - ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValue(json, kSchemaId)); + ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValueOptional(json, kSchemaId)); ICEBERG_ASSIGN_OR_RAISE(auto type, TypeFromJson(json)); if (type->type_id() != TypeId::kStruct) [[unlikely]] { @@ -658,9 +661,16 @@ nlohmann::json ToJson(const PartitionSpec& partition_spec) { } Result> PartitionFieldFromJson( - const nlohmann::json& json) { + const nlohmann::json& json, bool allow_field_id_missing) { ICEBERG_ASSIGN_OR_RAISE(auto source_id, GetJsonValue(json, kSourceId)); - ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue(json, kFieldId)); + int32_t field_id; + if (allow_field_id_missing) { + // Partition field id in v1 is not tracked, so we use -1 to indicate that. + ICEBERG_ASSIGN_OR_RAISE( + field_id, GetJsonValueOrDefault(json, kFieldId, kInvalidFieldId)); + } else { + ICEBERG_ASSIGN_OR_RAISE(field_id, GetJsonValue(json, kFieldId)); + } ICEBERG_ASSIGN_OR_RAISE( auto transform, GetJsonValue(json, kTransform).and_then(TransformFromString)); @@ -755,9 +765,8 @@ Result> SnapshotFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValueOptional(json, kSchemaId)); return std::make_unique( - snapshot_id, parent_snapshot_id, - sequence_number.value_or(TableMetadata::kInitialSequenceNumber), timestamp_ms, - manifest_list, std::move(summary), schema_id); + snapshot_id, parent_snapshot_id, sequence_number.value_or(kInitialSequenceNumber), + timestamp_ms, manifest_list, std::move(summary), schema_id); } nlohmann::json ToJson(const BlobMetadata& blob_metadata) { @@ -905,7 +914,7 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { } // write the current schema ID and schema list - json[kCurrentSchemaId] = table_metadata.current_schema_id; + SetOptionalField(json, kCurrentSchemaId, table_metadata.current_schema_id); json[kSchemas] = ToJsonList(table_metadata.schemas); // for older readers, continue writing the default spec as "partition-spec" @@ -963,7 +972,8 @@ namespace { /// /// \return The current schema or parse error. Result> ParseSchemas( - const nlohmann::json& json, int8_t format_version, int32_t& current_schema_id, + const nlohmann::json& json, int8_t format_version, + std::optional& current_schema_id, std::vector>& schemas) { std::shared_ptr current_schema; if (json.contains(kSchemas)) { @@ -986,7 +996,7 @@ Result> ParseSchemas( } if (!current_schema) { return JsonParseError("Cannot find schema with {}={} from {}", kCurrentSchemaId, - current_schema_id, schema_array.dump()); + current_schema_id.value(), schema_array.dump()); } } else { if (format_version != 1) { @@ -1031,13 +1041,30 @@ Status ParsePartitionSpecs(const nlohmann::json& json, int8_t format_version, return JsonParseError("{} must exist in format v{}", kPartitionSpecs, format_version); } - default_spec_id = TableMetadata::kInitialSpecId; - ICEBERG_ASSIGN_OR_RAISE(auto spec, GetJsonValue(json, kPartitionSpec) - .and_then([current_schema](const auto& json) { - return PartitionSpecFromJson(current_schema, - json); - })); + ICEBERG_ASSIGN_OR_RAISE(auto partition_spec_json, + GetJsonValue(json, kPartitionSpec)); + if (!partition_spec_json.is_array()) { + return JsonParseError("Cannot parse v1 partition spec from non-array: {}", + partition_spec_json.dump()); + } + + int32_t next_partition_field_id = PartitionSpec::kLegacyPartitionDataIdStart; + std::vector fields; + for (const auto& entry_json : partition_spec_json) { + ICEBERG_ASSIGN_OR_RAISE(auto field, PartitionFieldFromJson(entry_json)); + int32_t field_id = field->field_id(); + if (field_id == kInvalidFieldId) { + // If the field ID is not set, we need to assign a new one + field_id = next_partition_field_id++; + } + fields.emplace_back(field->source_id(), field_id, std::string(field->name()), + std::move(field->transform())); + } + + auto spec = std::make_unique(current_schema, kInitialSpecId, + std::move(fields)); + default_spec_id = spec->spec_id(); partition_specs.push_back(std::move(spec)); } @@ -1066,7 +1093,9 @@ Status ParseSortOrders(const nlohmann::json& json, int8_t format_version, if (format_version > 1) { return JsonParseError("{} must exist in format v{}", kSortOrders, format_version); } - return NotImplementedError("Assign a default sort order"); + auto sort_order = SortOrder::Unsorted(); + default_sort_order_id = sort_order->order_id(); + sort_orders.push_back(std::move(sort_order)); } return {}; } @@ -1083,7 +1112,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE(table_metadata->format_version, GetJsonValue(json, kFormatVersion)); if (table_metadata->format_version < 1 || - table_metadata->format_version > TableMetadata::kSupportedTableFormatVersion) { + table_metadata->format_version > kSupportedTableFormatVersion) { return JsonParseError("Cannot read unsupported version: {}", table_metadata->format_version); } @@ -1097,7 +1126,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE(table_metadata->last_sequence_number, GetJsonValue(json, kLastSequenceNumber)); } else { - table_metadata->last_sequence_number = TableMetadata::kInitialSequenceNumber; + table_metadata->last_sequence_number = kInitialSequenceNumber; } ICEBERG_ASSIGN_OR_RAISE(table_metadata->last_column_id, GetJsonValue(json, kLastColumnId)); @@ -1119,10 +1148,16 @@ Result> TableMetadataFromJson(const nlohmann::jso return JsonParseError("{} must exist in format v{}", kLastPartitionId, table_metadata->format_version); } - // TODO(gangwu): iterate all partition specs to find the largest partition - // field id or assign a default value for unpartitioned tables. However, - // PartitionSpec::lastAssignedFieldId() is not implemented yet. - return NotImplementedError("Find the largest partition field id"); + + if (table_metadata->partition_specs.empty()) { + table_metadata->last_partition_id = + PartitionSpec::Unpartitioned()->last_assigned_field_id(); + } else { + table_metadata->last_partition_id = + std::ranges::max(table_metadata->partition_specs, {}, [](const auto& spec) { + return spec->last_assigned_field_id(); + })->last_assigned_field_id(); + } } ICEBERG_RETURN_UNEXPECTED(ParseSortOrders(json, table_metadata->format_version, @@ -1136,14 +1171,13 @@ Result> TableMetadataFromJson(const nlohmann::jso // This field is optional, but internally we set this to -1 when not set ICEBERG_ASSIGN_OR_RAISE( table_metadata->current_snapshot_id, - GetJsonValueOrDefault(json, kCurrentSnapshotId, - TableMetadata::kInvalidSnapshotId)); + GetJsonValueOrDefault(json, kCurrentSnapshotId, kInvalidSnapshotId)); if (table_metadata->format_version >= 3) { ICEBERG_ASSIGN_OR_RAISE(table_metadata->next_row_id, GetJsonValue(json, kNextRowId)); } else { - table_metadata->next_row_id = TableMetadata::kInitialRowId; + table_metadata->next_row_id = kInitialRowId; } ICEBERG_ASSIGN_OR_RAISE(auto last_updated_ms, @@ -1155,7 +1189,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE( table_metadata->refs, FromJsonMap>(json, kRefs, SnapshotRefFromJson)); - } else if (table_metadata->current_snapshot_id != TableMetadata::kInvalidSnapshotId) { + } else if (table_metadata->current_snapshot_id != kInvalidSnapshotId) { table_metadata->refs["main"] = std::make_unique(SnapshotRef{ .snapshot_id = table_metadata->current_snapshot_id, .retention = SnapshotRef::Branch{}, diff --git a/src/iceberg/json_internal.h b/src/iceberg/json_internal.h index 8881d5e0c..33f47b3da 100644 --- a/src/iceberg/json_internal.h +++ b/src/iceberg/json_internal.h @@ -126,10 +126,12 @@ nlohmann::json ToJson(const PartitionField& partition_field); /// and name. /// /// \param json The JSON object representing a `PartitionField`. +/// \param allow_field_id_missing Whether the field ID is allowed to be missing. This can +/// happen when deserializing partition fields from V1 metadata files. /// \return An `expected` value containing either a `PartitionField` object or an error. /// If the JSON is malformed or missing expected fields, an error will be returned. Result> PartitionFieldFromJson( - const nlohmann::json& json); + const nlohmann::json& json, bool allow_field_id_missing = false); /// \brief Serializes a `PartitionSpec` object to JSON. /// diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 40a11901b..e8cd1a0dc 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -19,17 +19,38 @@ #include "iceberg/partition_spec.h" +#include #include +#include +#include "iceberg/constants.h" #include "iceberg/schema.h" -#include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { PartitionSpec::PartitionSpec(std::shared_ptr schema, int32_t spec_id, - std::vector fields) - : schema_(std::move(schema)), spec_id_(spec_id), fields_(std::move(fields)) {} + std::vector fields, + std::optional last_assigned_field_id) + : schema_(std::move(schema)), spec_id_(spec_id), fields_(std::move(fields)) { + if (last_assigned_field_id) { + last_assigned_field_id_ = last_assigned_field_id.value(); + } else if (fields_.empty()) { + last_assigned_field_id_ = kLegacyPartitionDataIdStart - 1; + } else { + last_assigned_field_id_ = std::ranges::max(fields_, {}, [](const auto& field) { + return field.field_id(); + }).field_id(); + } +} + +const std::shared_ptr& PartitionSpec::Unpartitioned() { + static const std::shared_ptr unpartitioned = + std::make_shared( + /*schema=*/nullptr, kInitialSpecId, std::vector{}, + kLegacyPartitionDataIdStart - 1); + return unpartitioned; +} const std::shared_ptr& PartitionSpec::schema() const { return schema_; } @@ -47,8 +68,7 @@ std::string PartitionSpec::ToString() const { } bool PartitionSpec::Equals(const PartitionSpec& other) const { - return *schema_ == *other.schema_ && spec_id_ == other.spec_id_ && - fields_ == other.fields_; + return spec_id_ == other.spec_id_ && fields_ == other.fields_; } } // namespace iceberg diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index 9bd36968b..3a40db5db 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -23,6 +23,7 @@ /// Partition specs for Iceberg tables. #include +#include #include #include #include @@ -40,8 +41,24 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT PartitionSpec : public util::Formattable { public: + /// \brief The start ID for partition field. It is only used to generate + /// partition field id for v1 metadata where it is tracked. + constexpr static int32_t kLegacyPartitionDataIdStart = 1000; + + /// \brief Create a new partition spec. + /// + /// \param schema The table schema. + /// \param spec_id The spec ID. + /// \param fields The partition fields. + /// \param last_assigned_field_id The last assigned field ID. If not provided, it will + /// be calculated from the fields. PartitionSpec(std::shared_ptr schema, int32_t spec_id, - std::vector fields); + std::vector fields, + std::optional last_assigned_field_id = std::nullopt); + + /// \brief Get an unsorted partition spec singleton. + static const std::shared_ptr& Unpartitioned(); + /// \brief Get the table schema const std::shared_ptr& schema() const; /// \brief Get the spec ID. @@ -51,6 +68,8 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::string ToString() const override; + int32_t last_assigned_field_id() const { return last_assigned_field_id_; } + friend bool operator==(const PartitionSpec& lhs, const PartitionSpec& rhs) { return lhs.Equals(rhs); } @@ -66,6 +85,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::shared_ptr schema_; const int32_t spec_id_; std::vector fields_; + int32_t last_assigned_field_id_; }; } // namespace iceberg diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 52a57db7a..c31554ead 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -26,10 +26,10 @@ namespace iceberg { -Schema::Schema(int32_t schema_id, std::vector fields) +Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} -int32_t Schema::schema_id() const { return schema_id_; } +std::optional Schema::schema_id() const { return schema_id_; } std::string Schema::ToString() const { std::string repr = "schema<"; diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 0da0c4031..51cafa8e7 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -24,6 +24,7 @@ /// and any utility functions. See iceberg/type.h and iceberg/field.h as well. #include +#include #include #include @@ -40,13 +41,13 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT Schema : public StructType { public: - Schema(int32_t schema_id, std::vector fields); + Schema(std::vector fields, std::optional schema_id); /// \brief Get the schema ID. /// /// A schema is identified by a unique ID for the purposes of schema /// evolution. - [[nodiscard]] int32_t schema_id() const; + [[nodiscard]] std::optional schema_id() const; [[nodiscard]] std::string ToString() const override; @@ -58,7 +59,7 @@ class ICEBERG_EXPORT Schema : public StructType { /// \brief Compare two schemas for equality. [[nodiscard]] bool Equals(const Schema& other) const; - const int32_t schema_id_; + const std::optional schema_id_; }; } // namespace iceberg diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index 69d7b82f5..266d8d2b8 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -331,17 +331,18 @@ Result> FromArrowSchema(const ArrowSchema& schema) { } // namespace -std::unique_ptr FromStructType(StructType&& struct_type, int32_t schema_id) { +std::unique_ptr FromStructType(StructType&& struct_type, + std::optional schema_id) { std::vector fields; fields.reserve(struct_type.fields().size()); for (auto& field : struct_type.fields()) { fields.emplace_back(std::move(field)); } - return std::make_unique(schema_id, std::move(fields)); + return std::make_unique(std::move(fields), schema_id); } Result> FromArrowSchema(const ArrowSchema& schema, - int32_t schema_id) { + std::optional schema_id) { auto type_result = FromArrowSchema(schema); if (!type_result) { return unexpected(type_result.error()); diff --git a/src/iceberg/schema_internal.h b/src/iceberg/schema_internal.h index 0a376cc39..26be1c3f1 100644 --- a/src/iceberg/schema_internal.h +++ b/src/iceberg/schema_internal.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include @@ -46,13 +47,14 @@ Status ToArrowSchema(const Schema& schema, ArrowSchema* out); /// \param[in] schema_id The schema ID of the Iceberg schema. /// \return The Iceberg schema or an error if the conversion fails. Result> FromArrowSchema(const ArrowSchema& schema, - int32_t schema_id); + std::optional schema_id); /// \brief Convert a struct type to an Iceberg schema. /// /// \param[in] struct_type The struct type to convert. /// \param[in] schema_id The schema ID of the Iceberg schema. /// \return The Iceberg schema. -std::unique_ptr FromStructType(StructType&& struct_type, int32_t schema_id); +std::unique_ptr FromStructType(StructType&& struct_type, + std::optional schema_id); } // namespace iceberg diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc index a85dcf1e3..624ac3193 100644 --- a/src/iceberg/sort_order.cc +++ b/src/iceberg/sort_order.cc @@ -21,14 +21,21 @@ #include +#include "iceberg/constants.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { -SortOrder::SortOrder(int64_t order_id, std::vector fields) +SortOrder::SortOrder(int32_t order_id, std::vector fields) : order_id_(order_id), fields_(std::move(fields)) {} -int64_t SortOrder::order_id() const { return order_id_; } +const std::shared_ptr& SortOrder::Unsorted() { + static const std::shared_ptr unsorted = + std::make_shared(kInitialSortOrderId, std::vector{}); + return unsorted; +} + +int32_t SortOrder::order_id() const { return order_id_; } std::span SortOrder::fields() const { return fields_; } diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h index 049e86f72..66bc1d661 100644 --- a/src/iceberg/sort_order.h +++ b/src/iceberg/sort_order.h @@ -36,10 +36,13 @@ namespace iceberg { /// applied to the data. class ICEBERG_EXPORT SortOrder : public util::Formattable { public: - SortOrder(int64_t order_id, std::vector fields); + SortOrder(int32_t order_id, std::vector fields); + + /// \brief Get an unsorted sort order singleton. + static const std::shared_ptr& Unsorted(); /// \brief Get the sort order id. - int64_t order_id() const; + int32_t order_id() const; /// \brief Get the list of sort fields. std::span fields() const; @@ -58,7 +61,7 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable { /// \brief Compare two sort orders for equality. bool Equals(const SortOrder& other) const; - int64_t order_id_; + int32_t order_id_; std::vector fields_; }; diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 171021445..f043bb74d 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -20,8 +20,12 @@ #include "iceberg/table_metadata.h" #include +#include #include +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/sort_order.h" namespace iceberg { std::string ToString(const SnapshotLogEntry& entry) { @@ -34,6 +38,33 @@ std::string ToString(const MetadataLogEntry& entry) { entry.metadata_file); } +const std::shared_ptr& TableMetadata::Schema() const { + static const std::shared_ptr<::iceberg::Schema> empty_schema = nullptr; + + auto iter = std::ranges::find_if(schemas, [this](const auto& schema) { + return schema->schema_id() == current_schema_id; + }); + return iter == schemas.end() ? empty_schema : *iter; +} + +const std::shared_ptr& TableMetadata::PartitionSpec() const { + static const std::shared_ptr empty_spec = nullptr; + + auto iter = std::ranges::find_if(partition_specs, [this](const auto& spec) { + return spec->spec_id() == default_spec_id; + }); + return iter == partition_specs.end() ? empty_spec : *iter; +} + +const std::shared_ptr& TableMetadata::SortOrder() const { + static const std::shared_ptr empty_order = nullptr; + + auto iter = std::ranges::find_if(sort_orders, [this](const auto& order) { + return order->order_id() == default_sort_order_id; + }); + return iter == sort_orders.end() ? empty_order : *iter; +} + Result TimePointMsFromUnixMs(int64_t unix_ms) { return TimePointMs{std::chrono::milliseconds(unix_ms)}; } diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 2a1afcbfc..f38606715 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -83,17 +83,6 @@ struct ICEBERG_EXPORT MetadataLogEntry { /// Schema|PartitionSpec|SortOrder> 2) List 3) Map 4) /// Map struct ICEBERG_EXPORT TableMetadata { - static constexpr int8_t kDefaultTableFormatVersion = 2; - static constexpr int8_t kSupportedTableFormatVersion = 3; - static constexpr int8_t kMinFormatVersionRowLineage = 3; - static constexpr int32_t kInitialSpecId = 0; - static constexpr int32_t kInitialSortOrderId = 1; - static constexpr int32_t kInitialSchemaId = 0; - static constexpr int64_t kInitialRowId = 0; - static constexpr int64_t kInitialSequenceNumber = 0; - static constexpr int64_t kInvalidSequenceNumber = -1; - static constexpr int64_t kInvalidSnapshotId = -1; - /// An integer version number for the format int8_t format_version; /// A UUID that identifies the table @@ -109,7 +98,7 @@ struct ICEBERG_EXPORT TableMetadata { /// A list of schemas std::vector> schemas; /// ID of the table's current schema - int32_t current_schema_id; + std::optional current_schema_id; /// A list of partition specs std::vector> partition_specs; /// ID of the current partition spec that writers should use by default @@ -140,6 +129,13 @@ struct ICEBERG_EXPORT TableMetadata { std::vector> partition_statistics; /// A `long` higher than all assigned row IDs int64_t next_row_id; + + /// \brief Get the current schema + const std::shared_ptr& Schema() const; + /// \brief Get the current partition spec + const std::shared_ptr& PartitionSpec() const; + /// \brief Get the current sort order + const std::shared_ptr& SortOrder() const; }; /// \brief Returns a string representation of a SnapshotLogEntry diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b9dc6c2a7..c31c6e0e8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,12 +23,15 @@ fetchcontent_declare(googletest GTest) fetchcontent_makeavailable(googletest) +set(ICEBERG_TEST_RESOURCES "${CMAKE_SOURCE_DIR}/test/resources") + +configure_file("${CMAKE_SOURCE_DIR}/test/test_config.h.in" + "${CMAKE_BINARY_DIR}/iceberg/test/test_config.h") + add_executable(schema_test) target_sources(schema_test - PRIVATE json_internal_test.cc - schema_test.cc + PRIVATE schema_test.cc schema_field_test.cc - schema_json_test.cc type_test.cc transform_test.cc partition_field_test.cc @@ -50,6 +53,14 @@ target_link_libraries(expression_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME expression_test COMMAND expression_test) +add_executable(json_serde_test) +target_include_directories(json_serde_test PRIVATE "${CMAKE_BINARY_DIR}") +target_sources(json_serde_test PRIVATE json_internal_test.cc metadata_serde_test.cc + schema_json_test.cc) +target_link_libraries(json_serde_test PRIVATE iceberg_static GTest::gtest_main + GTest::gmock) +add_test(NAME json_serde_test COMMAND json_serde_test) + if(ICEBERG_BUILD_BUNDLE) add_executable(avro_test) target_sources(avro_test PRIVATE avro_test.cc) diff --git a/test/arrow_test.cc b/test/arrow_test.cc index 85dcc91f9..52cef049a 100644 --- a/test/arrow_test.cc +++ b/test/arrow_test.cc @@ -81,11 +81,11 @@ TEST_P(ToArrowSchemaTest, PrimitiveType) { constexpr int32_t kFieldId = 1024; const auto& param = GetParam(); Schema schema( - /*schema_id=*/0, {param.optional ? SchemaField::MakeOptional(kFieldId, std::string(kFieldName), param.iceberg_type) : SchemaField::MakeRequired(kFieldId, std::string(kFieldName), - param.iceberg_type)}); + param.iceberg_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -166,9 +166,9 @@ TEST(ToArrowSchemaTest, StructType) { std::make_shared()), SchemaField::MakeOptional(kStrFieldId, std::string(kStrFieldName), std::make_shared())}); - Schema schema( - /*schema_id=*/0, {SchemaField::MakeRequired( - kStructFieldId, std::string(kStructFieldName), struct_type)}); + Schema schema({SchemaField::MakeRequired(kStructFieldId, std::string(kStructFieldName), + struct_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -199,8 +199,8 @@ TEST(ToArrowSchemaTest, ListType) { auto list_type = std::make_shared(SchemaField::MakeOptional( kElemFieldId, std::string(kElemFieldName), std::make_shared())); Schema schema( - /*schema_id=*/0, - {SchemaField::MakeRequired(kListFieldId, std::string(kListFieldName), list_type)}); + {SchemaField::MakeRequired(kListFieldId, std::string(kListFieldName), list_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -234,8 +234,8 @@ TEST(ToArrowSchemaTest, MapType) { std::make_shared())); Schema schema( - /*schema_id=*/0, - {SchemaField::MakeRequired(kFieldId, std::string(kMapFieldName), map_type)}); + {SchemaField::MakeRequired(kFieldId, std::string(kMapFieldName), map_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); diff --git a/test/json_internal_test.cc b/test/json_internal_test.cc index 6b6a22044..0ff158e5e 100644 --- a/test/json_internal_test.cc +++ b/test/json_internal_test.cc @@ -134,9 +134,10 @@ TEST(JsonPartitionTest, PartitionFieldFromJsonMissingField) { TEST(JsonPartitionTest, PartitionSpec) { auto schema = std::make_shared( - 100, std::vector{ - SchemaField(3, "region", std::make_shared(), false), - SchemaField(5, "ts", std::make_shared(), false)}); + std::vector{ + SchemaField(3, "region", std::make_shared(), false), + SchemaField(5, "ts", std::make_shared(), false)}, + /*schema_id=*/100); auto identity_transform = Transform::Identity(); PartitionSpec spec(schema, 1, diff --git a/test/metadata_serde_test.cc b/test/metadata_serde_test.cc new file mode 100644 index 000000000..45384df90 --- /dev/null +++ b/test/metadata_serde_test.cc @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "iceberg/json_internal.h" +#include "iceberg/partition_field.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/snapshot.h" +#include "iceberg/sort_field.h" +#include "iceberg/sort_order.h" +#include "iceberg/table_metadata.h" +#include "iceberg/test/test_config.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" + +namespace iceberg { + +namespace { + +class MetadataSerdeTest : public ::testing::Test { + protected: + void SetUp() override {} + + static std::string GetResourcePath(const std::string& file_name) { + return std::string(ICEBERG_TEST_RESOURCES) + "/" + file_name; + } + + static void ReadJsonFile(const std::string& file_name, std::string* content) { + std::filesystem::path path{GetResourcePath(file_name)}; + ASSERT_TRUE(std::filesystem::exists(path)) + << "File does not exist: " << path.string(); + + std::ifstream file(path); + std::stringstream buffer; + buffer << file.rdbuf(); + *content = buffer.str(); + } + + static void ReadTableMetadata(const std::string& file_name, + std::unique_ptr* metadata) { + std::string json_content; + ReadJsonFile(file_name, &json_content); + + nlohmann::json json = nlohmann::json::parse(json_content); + auto result = TableMetadataFromJson(json); + ASSERT_TRUE(result.has_value()) << "Failed to parse table metadata from " << file_name + << ": " << result.error().message; + *metadata = std::move(result.value()); + } +}; + +} // namespace + +TEST_F(MetadataSerdeTest, DeserializeV1Valid) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV1Valid.json", &metadata)); + + EXPECT_EQ(metadata->format_version, 1); + EXPECT_EQ(metadata->table_uuid, "d20125c8-7284-442c-9aea-15fee620737c"); + EXPECT_EQ(metadata->location, "s3://bucket/test/location"); + EXPECT_EQ(metadata->last_updated_ms.time_since_epoch().count(), 1602638573874); + EXPECT_EQ(metadata->last_column_id, 3); + EXPECT_EQ(metadata->current_snapshot_id, -1); + + // Compare schema + EXPECT_EQ(metadata->current_schema_id, std::nullopt); + std::vector schema_fields; + schema_fields.emplace_back(/*field_id=*/1, "x", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/2, "y", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/3, "z", std::make_shared(), + /*optional=*/false); + auto expected_schema = + std::make_shared(schema_fields, /*schema_id=*/std::nullopt); + auto schema = metadata->Schema(); + ASSERT_NE(schema, nullptr); + EXPECT_EQ(*schema, *expected_schema); + + // Compare partition spec + std::vector partition_fields; + partition_fields.emplace_back(/*source_id=*/1, /*field_id=*/1000, /*name=*/"x", + Transform::Identity()); + auto expected_spec = + std::make_shared(expected_schema, /*spec_id=*/0, partition_fields); + auto partition_spec = metadata->PartitionSpec(); + ASSERT_NE(partition_spec, nullptr); + EXPECT_EQ(*partition_spec, *expected_spec); +} + +TEST_F(MetadataSerdeTest, DeserializeV2Valid) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV2Valid.json", &metadata)); + + EXPECT_EQ(metadata->format_version, 2); + EXPECT_EQ(metadata->table_uuid, "9c12d441-03fe-4693-9a96-a0705ddf69c1"); + EXPECT_EQ(metadata->location, "s3://bucket/test/location"); + EXPECT_EQ(metadata->last_updated_ms.time_since_epoch().count(), 1602638573590); + EXPECT_EQ(metadata->last_column_id, 3); + + // Compare schema + EXPECT_EQ(metadata->current_schema_id, 1); + std::vector schema_fields; + schema_fields.emplace_back(/*field_id=*/1, "x", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/2, "y", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/3, "z", std::make_shared(), + /*optional=*/false); + auto expected_schema = + std::make_shared(std::move(schema_fields), /*schema_id=*/1); + auto schema = metadata->Schema(); + ASSERT_NE(schema, nullptr); + EXPECT_EQ(*schema, *expected_schema); + + // Compare partition spec + EXPECT_EQ(metadata->default_spec_id, 0); + std::vector partition_fields; + partition_fields.emplace_back(/*source_id=*/1, /*field_id=*/1000, /*name=*/"x", + Transform::Identity()); + auto expected_spec = std::make_shared(expected_schema, /*spec_id=*/0, + std::move(partition_fields)); + auto partition_spec = metadata->PartitionSpec(); + ASSERT_NE(partition_spec, nullptr); + EXPECT_EQ(*partition_spec, *expected_spec); + + // Compare sort order + EXPECT_EQ(metadata->default_sort_order_id, 3); + std::vector sort_fields; + sort_fields.emplace_back(/*source_id=*/2, Transform::Identity(), + SortDirection::kAscending, NullOrder::kFirst); + sort_fields.emplace_back(/*source_id=*/3, Transform::Bucket(4), + SortDirection::kDescending, NullOrder::kLast); + auto expected_sort_order = + std::make_shared(/*order_id=*/3, std::move(sort_fields)); + auto sort_order = metadata->SortOrder(); + ASSERT_NE(sort_order, nullptr); + EXPECT_EQ(*sort_order, *expected_sort_order); + + EXPECT_EQ(metadata->current_snapshot_id, 3055729675574597004); + + // Compare snapshots + std::vector expected_snapshots{{ + .snapshot_id = 3051729675574597004, + .sequence_number = 0, + .timestamp_ms = 1515100955770, + .manifest_list = "s3://a/b/1.avro", + .summary = {{"operation", "append"}}, + }, + { + .snapshot_id = 3055729675574597004, + .parent_snapshot_id = 3051729675574597004, + .sequence_number = 1, + .timestamp_ms = 1555100955770, + .manifest_list = "s3://a/b/2.avro", + .summary = {{"operation", "append"}}, + .schema_id = 1, + }}; + EXPECT_EQ(metadata->snapshots.size(), expected_snapshots.size()); + for (size_t i = 0; i < expected_snapshots.size(); ++i) { + EXPECT_EQ(*metadata->snapshots[i], expected_snapshots[i]); + } + + // Compare snapshot logs + std::vector expected_snapshot_log{ + { + .timestamp_ms = TimePointMsFromUnixMs(1515100955770).value(), + .snapshot_id = 3051729675574597004, + }, + { + .timestamp_ms = TimePointMsFromUnixMs(1555100955770).value(), + .snapshot_id = 3055729675574597004, + }}; + EXPECT_EQ(metadata->snapshot_log.size(), 2); + for (size_t i = 0; i < expected_snapshots.size(); ++i) { + EXPECT_EQ(metadata->snapshot_log[i], expected_snapshot_log[i]); + } +} + +} // namespace iceberg diff --git a/test/partition_spec_test.cc b/test/partition_spec_test.cc index 6c2cd590d..c9f4d3acd 100644 --- a/test/partition_spec_test.cc +++ b/test/partition_spec_test.cc @@ -24,6 +24,7 @@ #include #include +#include #include "iceberg/partition_field.h" #include "iceberg/schema.h" @@ -36,7 +37,8 @@ TEST(PartitionSpecTest, Basics) { { SchemaField field1(5, "ts", std::make_shared(), true); SchemaField field2(7, "bar", std::make_shared(), true); - auto const schema = std::make_shared(100, std::vector{field1, field2}); + auto const schema = + std::make_shared(std::vector{field1, field2}, 100); auto identity_transform = Transform::Identity(); PartitionField pt_field1(5, 1000, "day", identity_transform); @@ -60,7 +62,8 @@ TEST(PartitionSpecTest, Basics) { TEST(PartitionSpecTest, Equality) { SchemaField field1(5, "ts", std::make_shared(), true); SchemaField field2(7, "bar", std::make_shared(), true); - auto const schema = std::make_shared(100, std::vector{field1, field2}); + auto const schema = + std::make_shared(std::vector{field1, field2}, 100); auto identity_transform = Transform::Identity(); PartitionField pt_field1(5, 1000, "day", identity_transform); PartitionField pt_field2(7, 1001, "hour", identity_transform); diff --git a/test/resources/TableMetadataPartitionStatisticsFiles.json b/test/resources/TableMetadataPartitionStatisticsFiles.json new file mode 100644 index 000000000..df145e663 --- /dev/null +++ b/test/resources/TableMetadataPartitionStatisticsFiles.json @@ -0,0 +1,61 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 0 + } + ], + "partition-statistics": [ + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://a/b/partition-stats.parquet", + "file-size-in-bytes": 43 + } + ], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataStatisticsFiles.json b/test/resources/TableMetadataStatisticsFiles.json new file mode 100644 index 000000000..55fc67033 --- /dev/null +++ b/test/resources/TableMetadataStatisticsFiles.json @@ -0,0 +1,70 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 0 + } + ], + "statistics": [ + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://a/b/stats.puffin", + "file-size-in-bytes": 413, + "file-footer-size-in-bytes": 42, + "blob-metadata": [ + { + "type": "ndv", + "snapshot-id": 3055729675574597004, + "sequence-number": 1, + "fields": [1] + } + ] + } + ], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataUnsupportedVersion.json b/test/resources/TableMetadataUnsupportedVersion.json new file mode 100644 index 000000000..c40a0c9cd --- /dev/null +++ b/test/resources/TableMetadataUnsupportedVersion.json @@ -0,0 +1,36 @@ +{ + "format-version": 4, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-sequence-number": 0, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV1MissingSchemaType.json b/test/resources/TableMetadataV1MissingSchemaType.json new file mode 100644 index 000000000..dac2bbfb6 --- /dev/null +++ b/test/resources/TableMetadataV1MissingSchemaType.json @@ -0,0 +1,41 @@ +{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV1Valid.json b/test/resources/TableMetadataV1Valid.json new file mode 100644 index 000000000..9f981cc98 --- /dev/null +++ b/test/resources/TableMetadataV1Valid.json @@ -0,0 +1,42 @@ +{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV2CurrentSchemaNotFound.json b/test/resources/TableMetadataV2CurrentSchemaNotFound.json new file mode 100644 index 000000000..ae8858515 --- /dev/null +++ b/test/resources/TableMetadataV2CurrentSchemaNotFound.json @@ -0,0 +1,87 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 2, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingLastPartitionId.json b/test/resources/TableMetadataV2MissingLastPartitionId.json new file mode 100644 index 000000000..65af7da66 --- /dev/null +++ b/test/resources/TableMetadataV2MissingLastPartitionId.json @@ -0,0 +1,73 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingPartitionSpecs.json b/test/resources/TableMetadataV2MissingPartitionSpecs.json new file mode 100644 index 000000000..7e4b82656 --- /dev/null +++ b/test/resources/TableMetadataV2MissingPartitionSpecs.json @@ -0,0 +1,67 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingSchemas.json b/test/resources/TableMetadataV2MissingSchemas.json new file mode 100644 index 000000000..85fd03b5e --- /dev/null +++ b/test/resources/TableMetadataV2MissingSchemas.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingSortOrder.json b/test/resources/TableMetadataV2MissingSortOrder.json new file mode 100644 index 000000000..52b1f8cda --- /dev/null +++ b/test/resources/TableMetadataV2MissingSortOrder.json @@ -0,0 +1,54 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2Valid.json b/test/resources/TableMetadataV2Valid.json new file mode 100644 index 000000000..3393cd7b1 --- /dev/null +++ b/test/resources/TableMetadataV2Valid.json @@ -0,0 +1,122 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2ValidMinimal.json b/test/resources/TableMetadataV2ValidMinimal.json new file mode 100644 index 000000000..3976ba1ca --- /dev/null +++ b/test/resources/TableMetadataV2ValidMinimal.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ] +} diff --git a/test/resources/TableMetadataV3ValidMinimal.json b/test/resources/TableMetadataV3ValidMinimal.json new file mode 100644 index 000000000..b0411141e --- /dev/null +++ b/test/resources/TableMetadataV3ValidMinimal.json @@ -0,0 +1,73 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long", + "initial-default": 1, + "write-default": 1 + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ] +} diff --git a/test/schema_test.cc b/test/schema_test.cc index 88dde5bda..1017f17ad 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -32,7 +32,7 @@ TEST(SchemaTest, Basics) { { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); - iceberg::Schema schema(100, {field1, field2}); + iceberg::Schema schema({field1, field2}, 100); ASSERT_EQ(schema, schema); ASSERT_EQ(100, schema.schema_id()); std::span fields = schema.fields(); @@ -56,7 +56,7 @@ TEST(SchemaTest, Basics) { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(5, "bar", std::make_shared(), true); - iceberg::Schema schema(100, {field1, field2}); + iceberg::Schema schema({field1, field2}, 100); }, ::testing::ThrowsMessage( ::testing::HasSubstr("duplicate field ID 5"))); @@ -66,11 +66,11 @@ TEST(SchemaTest, Equality) { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); iceberg::SchemaField field3(5, "foobar", std::make_shared(), true); - iceberg::Schema schema1(100, {field1, field2}); - iceberg::Schema schema2(101, {field1, field2}); - iceberg::Schema schema3(101, {field1}); - iceberg::Schema schema4(101, {field3, field2}); - iceberg::Schema schema5(100, {field1, field2}); + iceberg::Schema schema1({field1, field2}, 100); + iceberg::Schema schema2({field1, field2}, 101); + iceberg::Schema schema3({field1}, 101); + iceberg::Schema schema4({field3, field2}, 101); + iceberg::Schema schema5({field1, field2}, 100); ASSERT_EQ(schema1, schema1); ASSERT_NE(schema1, schema2); diff --git a/test/test_config.h.in b/test/test_config.h.in new file mode 100644 index 000000000..0f0ec5f66 --- /dev/null +++ b/test/test_config.h.in @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#define ICEBERG_TEST_RESOURCES "@ICEBERG_TEST_RESOURCES@" From 156c90ee267552afc9b40eaf3a55353dc3344128 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 19 Apr 2025 15:31:06 +0800 Subject: [PATCH 2/4] fix unsorted order id --- src/iceberg/sort_order.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc index 624ac3193..002928b53 100644 --- a/src/iceberg/sort_order.cc +++ b/src/iceberg/sort_order.cc @@ -31,7 +31,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector fields) const std::shared_ptr& SortOrder::Unsorted() { static const std::shared_ptr unsorted = - std::make_shared(kInitialSortOrderId, std::vector{}); + std::make_shared(/*order_id=*/0, std::vector{}); return unsorted; } From 37a8c3000016f20de30776fa1ce99d7389b4cb15 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 20 Apr 2025 15:51:33 +0800 Subject: [PATCH 3/4] use result for schema/spec/order --- src/iceberg/constants.h | 38 --------------------------------- src/iceberg/json_internal.cc | 33 +++++++++++++++-------------- src/iceberg/partition_spec.cc | 1 - src/iceberg/partition_spec.h | 3 ++- src/iceberg/result.h | 1 + src/iceberg/schema.h | 2 ++ src/iceberg/schema_field.h | 2 ++ src/iceberg/snapshot.h | 2 ++ src/iceberg/sort_order.cc | 1 - src/iceberg/sort_order.h | 2 ++ src/iceberg/table_metadata.cc | 40 ++++++++++++++++++++++++----------- src/iceberg/table_metadata.h | 20 ++++++++++++------ test/metadata_serde_test.cc | 20 +++++++++--------- 13 files changed, 80 insertions(+), 85 deletions(-) delete mode 100644 src/iceberg/constants.h diff --git a/src/iceberg/constants.h b/src/iceberg/constants.h deleted file mode 100644 index 4534ad84f..000000000 --- a/src/iceberg/constants.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include - -namespace iceberg { - -constexpr int8_t kDefaultTableFormatVersion = 2; -constexpr int8_t kSupportedTableFormatVersion = 3; -constexpr int8_t kMinFormatVersionRowLineage = 3; -constexpr int32_t kInitialSpecId = 0; -constexpr int32_t kInitialSortOrderId = 1; -constexpr int32_t kInitialSchemaId = 0; -constexpr int64_t kInitialRowId = 0; -constexpr int64_t kInitialSequenceNumber = 0; -constexpr int64_t kInvalidSequenceNumber = -1; -constexpr int64_t kInvalidSnapshotId = -1; -constexpr int32_t kInvalidFieldId = -1; - -} // namespace iceberg diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 600506db0..965892ccc 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -27,9 +27,9 @@ #include #include +#include #include -#include "iceberg/constants.h" #include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" @@ -499,7 +499,7 @@ nlohmann::json ToJson(const Snapshot& snapshot) { nlohmann::json json; json[kSnapshotId] = snapshot.snapshot_id; SetOptionalField(json, kParentSnapshotId, snapshot.parent_snapshot_id); - if (snapshot.sequence_number > kInitialSequenceNumber) { + if (snapshot.sequence_number > TableMetadata::kInitialSequenceNumber) { json[kSequenceNumber] = snapshot.sequence_number; } json[kTimestampMs] = snapshot.timestamp_ms; @@ -666,8 +666,8 @@ Result> PartitionFieldFromJson( int32_t field_id; if (allow_field_id_missing) { // Partition field id in v1 is not tracked, so we use -1 to indicate that. - ICEBERG_ASSIGN_OR_RAISE( - field_id, GetJsonValueOrDefault(json, kFieldId, kInvalidFieldId)); + ICEBERG_ASSIGN_OR_RAISE(field_id, GetJsonValueOrDefault( + json, kFieldId, SchemaField::kInvalidFieldId)); } else { ICEBERG_ASSIGN_OR_RAISE(field_id, GetJsonValue(json, kFieldId)); } @@ -765,8 +765,9 @@ Result> SnapshotFromJson(const nlohmann::json& json) { ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValueOptional(json, kSchemaId)); return std::make_unique( - snapshot_id, parent_snapshot_id, sequence_number.value_or(kInitialSequenceNumber), - timestamp_ms, manifest_list, std::move(summary), schema_id); + snapshot_id, parent_snapshot_id, + sequence_number.value_or(TableMetadata::kInitialSequenceNumber), timestamp_ms, + manifest_list, std::move(summary), schema_id); } nlohmann::json ToJson(const BlobMetadata& blob_metadata) { @@ -1054,7 +1055,7 @@ Status ParsePartitionSpecs(const nlohmann::json& json, int8_t format_version, for (const auto& entry_json : partition_spec_json) { ICEBERG_ASSIGN_OR_RAISE(auto field, PartitionFieldFromJson(entry_json)); int32_t field_id = field->field_id(); - if (field_id == kInvalidFieldId) { + if (field_id == SchemaField::kInvalidFieldId) { // If the field ID is not set, we need to assign a new one field_id = next_partition_field_id++; } @@ -1062,8 +1063,8 @@ Status ParsePartitionSpecs(const nlohmann::json& json, int8_t format_version, std::move(field->transform())); } - auto spec = std::make_unique(current_schema, kInitialSpecId, - std::move(fields)); + auto spec = std::make_unique( + current_schema, PartitionSpec::kInitialSpecId, std::move(fields)); default_spec_id = spec->spec_id(); partition_specs.push_back(std::move(spec)); } @@ -1112,7 +1113,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE(table_metadata->format_version, GetJsonValue(json, kFormatVersion)); if (table_metadata->format_version < 1 || - table_metadata->format_version > kSupportedTableFormatVersion) { + table_metadata->format_version > TableMetadata::kSupportedTableFormatVersion) { return JsonParseError("Cannot read unsupported version: {}", table_metadata->format_version); } @@ -1126,7 +1127,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE(table_metadata->last_sequence_number, GetJsonValue(json, kLastSequenceNumber)); } else { - table_metadata->last_sequence_number = kInitialSequenceNumber; + table_metadata->last_sequence_number = TableMetadata::kInitialSequenceNumber; } ICEBERG_ASSIGN_OR_RAISE(table_metadata->last_column_id, GetJsonValue(json, kLastColumnId)); @@ -1169,15 +1170,15 @@ Result> TableMetadataFromJson(const nlohmann::jso } // This field is optional, but internally we set this to -1 when not set - ICEBERG_ASSIGN_OR_RAISE( - table_metadata->current_snapshot_id, - GetJsonValueOrDefault(json, kCurrentSnapshotId, kInvalidSnapshotId)); + ICEBERG_ASSIGN_OR_RAISE(table_metadata->current_snapshot_id, + GetJsonValueOrDefault(json, kCurrentSnapshotId, + Snapshot::kInvalidSnapshotId)); if (table_metadata->format_version >= 3) { ICEBERG_ASSIGN_OR_RAISE(table_metadata->next_row_id, GetJsonValue(json, kNextRowId)); } else { - table_metadata->next_row_id = kInitialRowId; + table_metadata->next_row_id = TableMetadata::kInitialRowId; } ICEBERG_ASSIGN_OR_RAISE(auto last_updated_ms, @@ -1189,7 +1190,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE( table_metadata->refs, FromJsonMap>(json, kRefs, SnapshotRefFromJson)); - } else if (table_metadata->current_snapshot_id != kInvalidSnapshotId) { + } else if (table_metadata->current_snapshot_id != Snapshot::kInvalidSnapshotId) { table_metadata->refs["main"] = std::make_unique(SnapshotRef{ .snapshot_id = table_metadata->current_snapshot_id, .retention = SnapshotRef::Branch{}, diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index e8cd1a0dc..47794e7c6 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -23,7 +23,6 @@ #include #include -#include "iceberg/constants.h" #include "iceberg/schema.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index 3a40db5db..a18ba7b24 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -41,9 +41,10 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT PartitionSpec : public util::Formattable { public: + static constexpr int32_t kInitialSpecId = 0; /// \brief The start ID for partition field. It is only used to generate /// partition field id for v1 metadata where it is tracked. - constexpr static int32_t kLegacyPartitionDataIdStart = 1000; + static constexpr int32_t kLegacyPartitionDataIdStart = 1000; /// \brief Create a new partition spec. /// diff --git a/src/iceberg/result.h b/src/iceberg/result.h index f8b3740b6..bedaccc47 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -42,6 +42,7 @@ enum class ErrorKind { kNotSupported, kInvalidExpression, kJsonParseError, + kNotFound, }; /// \brief Error with a kind and a message. diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 51cafa8e7..a23e3e4be 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -41,6 +41,8 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT Schema : public StructType { public: + static constexpr int32_t kInitialSchemaId = 0; + Schema(std::vector fields, std::optional schema_id); /// \brief Get the schema ID. diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index 3fde248f7..7f33728f2 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -37,6 +37,8 @@ namespace iceberg { /// \brief A type combined with a name. class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { public: + static constexpr int32_t kInvalidFieldId = -1; + /// \brief Construct a field. /// \param[in] field_id The field ID. /// \param[in] name The field name. diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index da9dd95b0..bc68a4949 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -233,6 +233,8 @@ struct ICEBERG_EXPORT DataOperation { /// /// Snapshots are created by table operations. struct ICEBERG_EXPORT Snapshot { + static constexpr int64_t kInvalidSnapshotId = -1; + /// A unqiue long ID. int64_t snapshot_id; /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc index 002928b53..9db51c70d 100644 --- a/src/iceberg/sort_order.cc +++ b/src/iceberg/sort_order.cc @@ -21,7 +21,6 @@ #include -#include "iceberg/constants.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h index 66bc1d661..de4abbae2 100644 --- a/src/iceberg/sort_order.h +++ b/src/iceberg/sort_order.h @@ -36,6 +36,8 @@ namespace iceberg { /// applied to the data. class ICEBERG_EXPORT SortOrder : public util::Formattable { public: + static constexpr int32_t kInitialSortOrderId = 1; + SortOrder(int32_t order_id, std::vector fields); /// \brief Get an unsorted sort order singleton. diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index f043bb74d..ece4d0eee 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -23,6 +23,7 @@ #include #include +#include "iceberg/expected.h" #include "iceberg/partition_spec.h" #include "iceberg/schema.h" #include "iceberg/sort_order.h" @@ -38,31 +39,46 @@ std::string ToString(const MetadataLogEntry& entry) { entry.metadata_file); } -const std::shared_ptr& TableMetadata::Schema() const { - static const std::shared_ptr<::iceberg::Schema> empty_schema = nullptr; - +Result>> TableMetadata::Schema() + const { auto iter = std::ranges::find_if(schemas, [this](const auto& schema) { return schema->schema_id() == current_schema_id; }); - return iter == schemas.end() ? empty_schema : *iter; + if (iter == schemas.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Current schema is not found"), + }); + } + return std::cref(*iter); } -const std::shared_ptr& TableMetadata::PartitionSpec() const { - static const std::shared_ptr empty_spec = nullptr; - +Result>> +TableMetadata::PartitionSpec() const { auto iter = std::ranges::find_if(partition_specs, [this](const auto& spec) { return spec->spec_id() == default_spec_id; }); - return iter == partition_specs.end() ? empty_spec : *iter; + if (iter == partition_specs.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Default partition spec is not found"), + }); + } + return std::cref(*iter); } -const std::shared_ptr& TableMetadata::SortOrder() const { - static const std::shared_ptr empty_order = nullptr; - +Result>> +TableMetadata::SortOrder() const { auto iter = std::ranges::find_if(sort_orders, [this](const auto& order) { return order->order_id() == default_sort_order_id; }); - return iter == sort_orders.end() ? empty_order : *iter; + if (iter == sort_orders.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Default sort order is not found"), + }); + } + return std::cref(*iter); } Result TimePointMsFromUnixMs(int64_t unix_ms) { diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index f38606715..fa86adc9d 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -83,6 +83,13 @@ struct ICEBERG_EXPORT MetadataLogEntry { /// Schema|PartitionSpec|SortOrder> 2) List 3) Map 4) /// Map struct ICEBERG_EXPORT TableMetadata { + static constexpr int8_t kDefaultTableFormatVersion = 2; + static constexpr int8_t kSupportedTableFormatVersion = 3; + static constexpr int8_t kMinFormatVersionRowLineage = 3; + static constexpr int64_t kInitialSequenceNumber = 0; + static constexpr int64_t kInvalidSequenceNumber = -1; + static constexpr int64_t kInitialRowId = 0; + /// An integer version number for the format int8_t format_version; /// A UUID that identifies the table @@ -130,12 +137,13 @@ struct ICEBERG_EXPORT TableMetadata { /// A `long` higher than all assigned row IDs int64_t next_row_id; - /// \brief Get the current schema - const std::shared_ptr& Schema() const; - /// \brief Get the current partition spec - const std::shared_ptr& PartitionSpec() const; - /// \brief Get the current sort order - const std::shared_ptr& SortOrder() const; + /// \brief Get the current schema, return NotFoundError if not found + Result>> Schema() const; + /// \brief Get the current partition spec, return NotFoundError if not found + Result>> PartitionSpec() + const; + /// \brief Get the current sort order, return NotFoundError if not found + Result>> SortOrder() const; }; /// \brief Returns a string representation of a SnapshotLogEntry diff --git a/test/metadata_serde_test.cc b/test/metadata_serde_test.cc index 45384df90..d187bafb1 100644 --- a/test/metadata_serde_test.cc +++ b/test/metadata_serde_test.cc @@ -100,8 +100,8 @@ TEST_F(MetadataSerdeTest, DeserializeV1Valid) { auto expected_schema = std::make_shared(schema_fields, /*schema_id=*/std::nullopt); auto schema = metadata->Schema(); - ASSERT_NE(schema, nullptr); - EXPECT_EQ(*schema, *expected_schema); + ASSERT_TRUE(schema.has_value()); + EXPECT_EQ(*(schema.value().get()), *expected_schema); // Compare partition spec std::vector partition_fields; @@ -110,8 +110,8 @@ TEST_F(MetadataSerdeTest, DeserializeV1Valid) { auto expected_spec = std::make_shared(expected_schema, /*spec_id=*/0, partition_fields); auto partition_spec = metadata->PartitionSpec(); - ASSERT_NE(partition_spec, nullptr); - EXPECT_EQ(*partition_spec, *expected_spec); + ASSERT_TRUE(partition_spec.has_value()); + EXPECT_EQ(*(partition_spec.value().get()), *expected_spec); } TEST_F(MetadataSerdeTest, DeserializeV2Valid) { @@ -136,8 +136,8 @@ TEST_F(MetadataSerdeTest, DeserializeV2Valid) { auto expected_schema = std::make_shared(std::move(schema_fields), /*schema_id=*/1); auto schema = metadata->Schema(); - ASSERT_NE(schema, nullptr); - EXPECT_EQ(*schema, *expected_schema); + ASSERT_TRUE(schema.has_value()); + EXPECT_EQ(*(schema.value().get()), *expected_schema); // Compare partition spec EXPECT_EQ(metadata->default_spec_id, 0); @@ -147,8 +147,8 @@ TEST_F(MetadataSerdeTest, DeserializeV2Valid) { auto expected_spec = std::make_shared(expected_schema, /*spec_id=*/0, std::move(partition_fields)); auto partition_spec = metadata->PartitionSpec(); - ASSERT_NE(partition_spec, nullptr); - EXPECT_EQ(*partition_spec, *expected_spec); + ASSERT_TRUE(partition_spec.has_value()); + EXPECT_EQ(*(partition_spec.value().get()), *expected_spec); // Compare sort order EXPECT_EQ(metadata->default_sort_order_id, 3); @@ -160,8 +160,8 @@ TEST_F(MetadataSerdeTest, DeserializeV2Valid) { auto expected_sort_order = std::make_shared(/*order_id=*/3, std::move(sort_fields)); auto sort_order = metadata->SortOrder(); - ASSERT_NE(sort_order, nullptr); - EXPECT_EQ(*sort_order, *expected_sort_order); + ASSERT_TRUE(sort_order.has_value()); + EXPECT_EQ(*(sort_order.value().get()), *expected_sort_order); EXPECT_EQ(metadata->current_snapshot_id, 3055729675574597004); From 183d0ec0211ad3b5d42df28e142a9a20c4c85c14 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 20 Apr 2025 18:26:32 +0800 Subject: [PATCH 4/4] avoid std::reference_wrapper because expected requires default constructor --- src/iceberg/table_metadata.cc | 15 ++++++--------- src/iceberg/table_metadata.h | 7 +++---- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index ece4d0eee..494db0c9d 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -39,8 +39,7 @@ std::string ToString(const MetadataLogEntry& entry) { entry.metadata_file); } -Result>> TableMetadata::Schema() - const { +Result> TableMetadata::Schema() const { auto iter = std::ranges::find_if(schemas, [this](const auto& schema) { return schema->schema_id() == current_schema_id; }); @@ -50,11 +49,10 @@ Result>> TableMetadata::Sch .message = std::format("Current schema is not found"), }); } - return std::cref(*iter); + return *iter; } -Result>> -TableMetadata::PartitionSpec() const { +Result> TableMetadata::PartitionSpec() const { auto iter = std::ranges::find_if(partition_specs, [this](const auto& spec) { return spec->spec_id() == default_spec_id; }); @@ -64,11 +62,10 @@ TableMetadata::PartitionSpec() const { .message = std::format("Default partition spec is not found"), }); } - return std::cref(*iter); + return *iter; } -Result>> -TableMetadata::SortOrder() const { +Result> TableMetadata::SortOrder() const { auto iter = std::ranges::find_if(sort_orders, [this](const auto& order) { return order->order_id() == default_sort_order_id; }); @@ -78,7 +75,7 @@ TableMetadata::SortOrder() const { .message = std::format("Default sort order is not found"), }); } - return std::cref(*iter); + return *iter; } Result TimePointMsFromUnixMs(int64_t unix_ms) { diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index fa86adc9d..3081d56ae 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -138,12 +138,11 @@ struct ICEBERG_EXPORT TableMetadata { int64_t next_row_id; /// \brief Get the current schema, return NotFoundError if not found - Result>> Schema() const; + Result> Schema() const; /// \brief Get the current partition spec, return NotFoundError if not found - Result>> PartitionSpec() - const; + Result> PartitionSpec() const; /// \brief Get the current sort order, return NotFoundError if not found - Result>> SortOrder() const; + Result> SortOrder() const; }; /// \brief Returns a string representation of a SnapshotLogEntry