diff --git a/.github/.licenserc.yaml b/.github/.licenserc.yaml index 8b2055400..72f329ebb 100644 --- a/.github/.licenserc.yaml +++ b/.github/.licenserc.yaml @@ -13,5 +13,6 @@ header: - 'NOTICE' - 'src/iceberg/expected.h' - 'src/iceberg/util/murmurhash3_internal.*' + - 'test/resources/**' comment: on-failure diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94e2fa268..52c8fe53f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,6 +33,7 @@ repos: rev: v19.1.5 hooks: - id: clang-format + exclude: ^test/resources/.*\.json$ - repo: https://github.com/cheshirekow/cmake-format-precommit rev: v0.6.10 diff --git a/src/iceberg/json_internal.cc b/src/iceberg/json_internal.cc index 5fd76c60d..965892ccc 100644 --- a/src/iceberg/json_internal.cc +++ b/src/iceberg/json_internal.cc @@ -22,12 +22,15 @@ #include #include #include +#include #include #include #include +#include #include +#include "iceberg/partition_field.h" #include "iceberg/partition_spec.h" #include "iceberg/result.h" #include "iceberg/schema.h" @@ -248,7 +251,7 @@ Result> FromJsonList( list.emplace_back(std::move(entry)); } } - return {}; + return list; } /// \brief Parse a list of items from a JSON object. @@ -471,7 +474,7 @@ nlohmann::json ToJson(const Type& type) { nlohmann::json ToJson(const Schema& schema) { nlohmann::json json = ToJson(static_cast(schema)); - json[kSchemaId] = schema.schema_id(); + SetOptionalField(json, kSchemaId, schema.schema_id()); // TODO(gangwu): add identifier-field-ids. return json; } @@ -625,7 +628,7 @@ Result> FieldFromJson(const nlohmann::json& json) { } Result> SchemaFromJson(const nlohmann::json& json) { - ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValue(json, kSchemaId)); + ICEBERG_ASSIGN_OR_RAISE(auto schema_id, GetJsonValueOptional(json, kSchemaId)); ICEBERG_ASSIGN_OR_RAISE(auto type, TypeFromJson(json)); if (type->type_id() != TypeId::kStruct) [[unlikely]] { @@ -658,9 +661,16 @@ nlohmann::json ToJson(const PartitionSpec& partition_spec) { } Result> PartitionFieldFromJson( - const nlohmann::json& json) { + const nlohmann::json& json, bool allow_field_id_missing) { ICEBERG_ASSIGN_OR_RAISE(auto source_id, GetJsonValue(json, kSourceId)); - ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue(json, kFieldId)); + int32_t field_id; + if (allow_field_id_missing) { + // Partition field id in v1 is not tracked, so we use -1 to indicate that. + ICEBERG_ASSIGN_OR_RAISE(field_id, GetJsonValueOrDefault( + json, kFieldId, SchemaField::kInvalidFieldId)); + } else { + ICEBERG_ASSIGN_OR_RAISE(field_id, GetJsonValue(json, kFieldId)); + } ICEBERG_ASSIGN_OR_RAISE( auto transform, GetJsonValue(json, kTransform).and_then(TransformFromString)); @@ -905,7 +915,7 @@ nlohmann::json ToJson(const TableMetadata& table_metadata) { } // write the current schema ID and schema list - json[kCurrentSchemaId] = table_metadata.current_schema_id; + SetOptionalField(json, kCurrentSchemaId, table_metadata.current_schema_id); json[kSchemas] = ToJsonList(table_metadata.schemas); // for older readers, continue writing the default spec as "partition-spec" @@ -963,7 +973,8 @@ namespace { /// /// \return The current schema or parse error. Result> ParseSchemas( - const nlohmann::json& json, int8_t format_version, int32_t& current_schema_id, + const nlohmann::json& json, int8_t format_version, + std::optional& current_schema_id, std::vector>& schemas) { std::shared_ptr current_schema; if (json.contains(kSchemas)) { @@ -986,7 +997,7 @@ Result> ParseSchemas( } if (!current_schema) { return JsonParseError("Cannot find schema with {}={} from {}", kCurrentSchemaId, - current_schema_id, schema_array.dump()); + current_schema_id.value(), schema_array.dump()); } } else { if (format_version != 1) { @@ -1031,13 +1042,30 @@ Status ParsePartitionSpecs(const nlohmann::json& json, int8_t format_version, return JsonParseError("{} must exist in format v{}", kPartitionSpecs, format_version); } - default_spec_id = TableMetadata::kInitialSpecId; - ICEBERG_ASSIGN_OR_RAISE(auto spec, GetJsonValue(json, kPartitionSpec) - .and_then([current_schema](const auto& json) { - return PartitionSpecFromJson(current_schema, - json); - })); + ICEBERG_ASSIGN_OR_RAISE(auto partition_spec_json, + GetJsonValue(json, kPartitionSpec)); + if (!partition_spec_json.is_array()) { + return JsonParseError("Cannot parse v1 partition spec from non-array: {}", + partition_spec_json.dump()); + } + + int32_t next_partition_field_id = PartitionSpec::kLegacyPartitionDataIdStart; + std::vector fields; + for (const auto& entry_json : partition_spec_json) { + ICEBERG_ASSIGN_OR_RAISE(auto field, PartitionFieldFromJson(entry_json)); + int32_t field_id = field->field_id(); + if (field_id == SchemaField::kInvalidFieldId) { + // If the field ID is not set, we need to assign a new one + field_id = next_partition_field_id++; + } + fields.emplace_back(field->source_id(), field_id, std::string(field->name()), + std::move(field->transform())); + } + + auto spec = std::make_unique( + current_schema, PartitionSpec::kInitialSpecId, std::move(fields)); + default_spec_id = spec->spec_id(); partition_specs.push_back(std::move(spec)); } @@ -1066,7 +1094,9 @@ Status ParseSortOrders(const nlohmann::json& json, int8_t format_version, if (format_version > 1) { return JsonParseError("{} must exist in format v{}", kSortOrders, format_version); } - return NotImplementedError("Assign a default sort order"); + auto sort_order = SortOrder::Unsorted(); + default_sort_order_id = sort_order->order_id(); + sort_orders.push_back(std::move(sort_order)); } return {}; } @@ -1119,10 +1149,16 @@ Result> TableMetadataFromJson(const nlohmann::jso return JsonParseError("{} must exist in format v{}", kLastPartitionId, table_metadata->format_version); } - // TODO(gangwu): iterate all partition specs to find the largest partition - // field id or assign a default value for unpartitioned tables. However, - // PartitionSpec::lastAssignedFieldId() is not implemented yet. - return NotImplementedError("Find the largest partition field id"); + + if (table_metadata->partition_specs.empty()) { + table_metadata->last_partition_id = + PartitionSpec::Unpartitioned()->last_assigned_field_id(); + } else { + table_metadata->last_partition_id = + std::ranges::max(table_metadata->partition_specs, {}, [](const auto& spec) { + return spec->last_assigned_field_id(); + })->last_assigned_field_id(); + } } ICEBERG_RETURN_UNEXPECTED(ParseSortOrders(json, table_metadata->format_version, @@ -1134,10 +1170,9 @@ Result> TableMetadataFromJson(const nlohmann::jso } // This field is optional, but internally we set this to -1 when not set - ICEBERG_ASSIGN_OR_RAISE( - table_metadata->current_snapshot_id, - GetJsonValueOrDefault(json, kCurrentSnapshotId, - TableMetadata::kInvalidSnapshotId)); + ICEBERG_ASSIGN_OR_RAISE(table_metadata->current_snapshot_id, + GetJsonValueOrDefault(json, kCurrentSnapshotId, + Snapshot::kInvalidSnapshotId)); if (table_metadata->format_version >= 3) { ICEBERG_ASSIGN_OR_RAISE(table_metadata->next_row_id, @@ -1155,7 +1190,7 @@ Result> TableMetadataFromJson(const nlohmann::jso ICEBERG_ASSIGN_OR_RAISE( table_metadata->refs, FromJsonMap>(json, kRefs, SnapshotRefFromJson)); - } else if (table_metadata->current_snapshot_id != TableMetadata::kInvalidSnapshotId) { + } else if (table_metadata->current_snapshot_id != Snapshot::kInvalidSnapshotId) { table_metadata->refs["main"] = std::make_unique(SnapshotRef{ .snapshot_id = table_metadata->current_snapshot_id, .retention = SnapshotRef::Branch{}, diff --git a/src/iceberg/json_internal.h b/src/iceberg/json_internal.h index 8881d5e0c..33f47b3da 100644 --- a/src/iceberg/json_internal.h +++ b/src/iceberg/json_internal.h @@ -126,10 +126,12 @@ nlohmann::json ToJson(const PartitionField& partition_field); /// and name. /// /// \param json The JSON object representing a `PartitionField`. +/// \param allow_field_id_missing Whether the field ID is allowed to be missing. This can +/// happen when deserializing partition fields from V1 metadata files. /// \return An `expected` value containing either a `PartitionField` object or an error. /// If the JSON is malformed or missing expected fields, an error will be returned. Result> PartitionFieldFromJson( - const nlohmann::json& json); + const nlohmann::json& json, bool allow_field_id_missing = false); /// \brief Serializes a `PartitionSpec` object to JSON. /// diff --git a/src/iceberg/partition_spec.cc b/src/iceberg/partition_spec.cc index 40a11901b..47794e7c6 100644 --- a/src/iceberg/partition_spec.cc +++ b/src/iceberg/partition_spec.cc @@ -19,17 +19,37 @@ #include "iceberg/partition_spec.h" +#include #include +#include #include "iceberg/schema.h" -#include "iceberg/type.h" #include "iceberg/util/formatter.h" // IWYU pragma: keep namespace iceberg { PartitionSpec::PartitionSpec(std::shared_ptr schema, int32_t spec_id, - std::vector fields) - : schema_(std::move(schema)), spec_id_(spec_id), fields_(std::move(fields)) {} + std::vector fields, + std::optional last_assigned_field_id) + : schema_(std::move(schema)), spec_id_(spec_id), fields_(std::move(fields)) { + if (last_assigned_field_id) { + last_assigned_field_id_ = last_assigned_field_id.value(); + } else if (fields_.empty()) { + last_assigned_field_id_ = kLegacyPartitionDataIdStart - 1; + } else { + last_assigned_field_id_ = std::ranges::max(fields_, {}, [](const auto& field) { + return field.field_id(); + }).field_id(); + } +} + +const std::shared_ptr& PartitionSpec::Unpartitioned() { + static const std::shared_ptr unpartitioned = + std::make_shared( + /*schema=*/nullptr, kInitialSpecId, std::vector{}, + kLegacyPartitionDataIdStart - 1); + return unpartitioned; +} const std::shared_ptr& PartitionSpec::schema() const { return schema_; } @@ -47,8 +67,7 @@ std::string PartitionSpec::ToString() const { } bool PartitionSpec::Equals(const PartitionSpec& other) const { - return *schema_ == *other.schema_ && spec_id_ == other.spec_id_ && - fields_ == other.fields_; + return spec_id_ == other.spec_id_ && fields_ == other.fields_; } } // namespace iceberg diff --git a/src/iceberg/partition_spec.h b/src/iceberg/partition_spec.h index 9bd36968b..a18ba7b24 100644 --- a/src/iceberg/partition_spec.h +++ b/src/iceberg/partition_spec.h @@ -23,6 +23,7 @@ /// Partition specs for Iceberg tables. #include +#include #include #include #include @@ -40,8 +41,25 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT PartitionSpec : public util::Formattable { public: + static constexpr int32_t kInitialSpecId = 0; + /// \brief The start ID for partition field. It is only used to generate + /// partition field id for v1 metadata where it is tracked. + static constexpr int32_t kLegacyPartitionDataIdStart = 1000; + + /// \brief Create a new partition spec. + /// + /// \param schema The table schema. + /// \param spec_id The spec ID. + /// \param fields The partition fields. + /// \param last_assigned_field_id The last assigned field ID. If not provided, it will + /// be calculated from the fields. PartitionSpec(std::shared_ptr schema, int32_t spec_id, - std::vector fields); + std::vector fields, + std::optional last_assigned_field_id = std::nullopt); + + /// \brief Get an unsorted partition spec singleton. + static const std::shared_ptr& Unpartitioned(); + /// \brief Get the table schema const std::shared_ptr& schema() const; /// \brief Get the spec ID. @@ -51,6 +69,8 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::string ToString() const override; + int32_t last_assigned_field_id() const { return last_assigned_field_id_; } + friend bool operator==(const PartitionSpec& lhs, const PartitionSpec& rhs) { return lhs.Equals(rhs); } @@ -66,6 +86,7 @@ class ICEBERG_EXPORT PartitionSpec : public util::Formattable { std::shared_ptr schema_; const int32_t spec_id_; std::vector fields_; + int32_t last_assigned_field_id_; }; } // namespace iceberg diff --git a/src/iceberg/result.h b/src/iceberg/result.h index f8b3740b6..bedaccc47 100644 --- a/src/iceberg/result.h +++ b/src/iceberg/result.h @@ -42,6 +42,7 @@ enum class ErrorKind { kNotSupported, kInvalidExpression, kJsonParseError, + kNotFound, }; /// \brief Error with a kind and a message. diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc index 52a57db7a..c31554ead 100644 --- a/src/iceberg/schema.cc +++ b/src/iceberg/schema.cc @@ -26,10 +26,10 @@ namespace iceberg { -Schema::Schema(int32_t schema_id, std::vector fields) +Schema::Schema(std::vector fields, std::optional schema_id) : StructType(std::move(fields)), schema_id_(schema_id) {} -int32_t Schema::schema_id() const { return schema_id_; } +std::optional Schema::schema_id() const { return schema_id_; } std::string Schema::ToString() const { std::string repr = "schema<"; diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h index 0da0c4031..a23e3e4be 100644 --- a/src/iceberg/schema.h +++ b/src/iceberg/schema.h @@ -24,6 +24,7 @@ /// and any utility functions. See iceberg/type.h and iceberg/field.h as well. #include +#include #include #include @@ -40,13 +41,15 @@ namespace iceberg { /// evolution. class ICEBERG_EXPORT Schema : public StructType { public: - Schema(int32_t schema_id, std::vector fields); + static constexpr int32_t kInitialSchemaId = 0; + + Schema(std::vector fields, std::optional schema_id); /// \brief Get the schema ID. /// /// A schema is identified by a unique ID for the purposes of schema /// evolution. - [[nodiscard]] int32_t schema_id() const; + [[nodiscard]] std::optional schema_id() const; [[nodiscard]] std::string ToString() const override; @@ -58,7 +61,7 @@ class ICEBERG_EXPORT Schema : public StructType { /// \brief Compare two schemas for equality. [[nodiscard]] bool Equals(const Schema& other) const; - const int32_t schema_id_; + const std::optional schema_id_; }; } // namespace iceberg diff --git a/src/iceberg/schema_field.h b/src/iceberg/schema_field.h index 3fde248f7..7f33728f2 100644 --- a/src/iceberg/schema_field.h +++ b/src/iceberg/schema_field.h @@ -37,6 +37,8 @@ namespace iceberg { /// \brief A type combined with a name. class ICEBERG_EXPORT SchemaField : public iceberg::util::Formattable { public: + static constexpr int32_t kInvalidFieldId = -1; + /// \brief Construct a field. /// \param[in] field_id The field ID. /// \param[in] name The field name. diff --git a/src/iceberg/schema_internal.cc b/src/iceberg/schema_internal.cc index 69d7b82f5..266d8d2b8 100644 --- a/src/iceberg/schema_internal.cc +++ b/src/iceberg/schema_internal.cc @@ -331,17 +331,18 @@ Result> FromArrowSchema(const ArrowSchema& schema) { } // namespace -std::unique_ptr FromStructType(StructType&& struct_type, int32_t schema_id) { +std::unique_ptr FromStructType(StructType&& struct_type, + std::optional schema_id) { std::vector fields; fields.reserve(struct_type.fields().size()); for (auto& field : struct_type.fields()) { fields.emplace_back(std::move(field)); } - return std::make_unique(schema_id, std::move(fields)); + return std::make_unique(std::move(fields), schema_id); } Result> FromArrowSchema(const ArrowSchema& schema, - int32_t schema_id) { + std::optional schema_id) { auto type_result = FromArrowSchema(schema); if (!type_result) { return unexpected(type_result.error()); diff --git a/src/iceberg/schema_internal.h b/src/iceberg/schema_internal.h index 0a376cc39..26be1c3f1 100644 --- a/src/iceberg/schema_internal.h +++ b/src/iceberg/schema_internal.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include @@ -46,13 +47,14 @@ Status ToArrowSchema(const Schema& schema, ArrowSchema* out); /// \param[in] schema_id The schema ID of the Iceberg schema. /// \return The Iceberg schema or an error if the conversion fails. Result> FromArrowSchema(const ArrowSchema& schema, - int32_t schema_id); + std::optional schema_id); /// \brief Convert a struct type to an Iceberg schema. /// /// \param[in] struct_type The struct type to convert. /// \param[in] schema_id The schema ID of the Iceberg schema. /// \return The Iceberg schema. -std::unique_ptr FromStructType(StructType&& struct_type, int32_t schema_id); +std::unique_ptr FromStructType(StructType&& struct_type, + std::optional schema_id); } // namespace iceberg diff --git a/src/iceberg/snapshot.h b/src/iceberg/snapshot.h index da9dd95b0..bc68a4949 100644 --- a/src/iceberg/snapshot.h +++ b/src/iceberg/snapshot.h @@ -233,6 +233,8 @@ struct ICEBERG_EXPORT DataOperation { /// /// Snapshots are created by table operations. struct ICEBERG_EXPORT Snapshot { + static constexpr int64_t kInvalidSnapshotId = -1; + /// A unqiue long ID. int64_t snapshot_id; /// The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent. diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc index a85dcf1e3..9db51c70d 100644 --- a/src/iceberg/sort_order.cc +++ b/src/iceberg/sort_order.cc @@ -25,10 +25,16 @@ namespace iceberg { -SortOrder::SortOrder(int64_t order_id, std::vector fields) +SortOrder::SortOrder(int32_t order_id, std::vector fields) : order_id_(order_id), fields_(std::move(fields)) {} -int64_t SortOrder::order_id() const { return order_id_; } +const std::shared_ptr& SortOrder::Unsorted() { + static const std::shared_ptr unsorted = + std::make_shared(/*order_id=*/0, std::vector{}); + return unsorted; +} + +int32_t SortOrder::order_id() const { return order_id_; } std::span SortOrder::fields() const { return fields_; } diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h index 049e86f72..de4abbae2 100644 --- a/src/iceberg/sort_order.h +++ b/src/iceberg/sort_order.h @@ -36,10 +36,15 @@ namespace iceberg { /// applied to the data. class ICEBERG_EXPORT SortOrder : public util::Formattable { public: - SortOrder(int64_t order_id, std::vector fields); + static constexpr int32_t kInitialSortOrderId = 1; + + SortOrder(int32_t order_id, std::vector fields); + + /// \brief Get an unsorted sort order singleton. + static const std::shared_ptr& Unsorted(); /// \brief Get the sort order id. - int64_t order_id() const; + int32_t order_id() const; /// \brief Get the list of sort fields. std::span fields() const; @@ -58,7 +63,7 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable { /// \brief Compare two sort orders for equality. bool Equals(const SortOrder& other) const; - int64_t order_id_; + int32_t order_id_; std::vector fields_; }; diff --git a/src/iceberg/table_metadata.cc b/src/iceberg/table_metadata.cc index 171021445..494db0c9d 100644 --- a/src/iceberg/table_metadata.cc +++ b/src/iceberg/table_metadata.cc @@ -20,8 +20,13 @@ #include "iceberg/table_metadata.h" #include +#include #include +#include "iceberg/expected.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/sort_order.h" namespace iceberg { std::string ToString(const SnapshotLogEntry& entry) { @@ -34,6 +39,45 @@ std::string ToString(const MetadataLogEntry& entry) { entry.metadata_file); } +Result> TableMetadata::Schema() const { + auto iter = std::ranges::find_if(schemas, [this](const auto& schema) { + return schema->schema_id() == current_schema_id; + }); + if (iter == schemas.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Current schema is not found"), + }); + } + return *iter; +} + +Result> TableMetadata::PartitionSpec() const { + auto iter = std::ranges::find_if(partition_specs, [this](const auto& spec) { + return spec->spec_id() == default_spec_id; + }); + if (iter == partition_specs.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Default partition spec is not found"), + }); + } + return *iter; +} + +Result> TableMetadata::SortOrder() const { + auto iter = std::ranges::find_if(sort_orders, [this](const auto& order) { + return order->order_id() == default_sort_order_id; + }); + if (iter == sort_orders.end()) { + return unexpected({ + .kind = ErrorKind::kNotFound, + .message = std::format("Default sort order is not found"), + }); + } + return *iter; +} + Result TimePointMsFromUnixMs(int64_t unix_ms) { return TimePointMs{std::chrono::milliseconds(unix_ms)}; } diff --git a/src/iceberg/table_metadata.h b/src/iceberg/table_metadata.h index 2a1afcbfc..3081d56ae 100644 --- a/src/iceberg/table_metadata.h +++ b/src/iceberg/table_metadata.h @@ -86,13 +86,9 @@ struct ICEBERG_EXPORT TableMetadata { static constexpr int8_t kDefaultTableFormatVersion = 2; static constexpr int8_t kSupportedTableFormatVersion = 3; static constexpr int8_t kMinFormatVersionRowLineage = 3; - static constexpr int32_t kInitialSpecId = 0; - static constexpr int32_t kInitialSortOrderId = 1; - static constexpr int32_t kInitialSchemaId = 0; - static constexpr int64_t kInitialRowId = 0; static constexpr int64_t kInitialSequenceNumber = 0; static constexpr int64_t kInvalidSequenceNumber = -1; - static constexpr int64_t kInvalidSnapshotId = -1; + static constexpr int64_t kInitialRowId = 0; /// An integer version number for the format int8_t format_version; @@ -109,7 +105,7 @@ struct ICEBERG_EXPORT TableMetadata { /// A list of schemas std::vector> schemas; /// ID of the table's current schema - int32_t current_schema_id; + std::optional current_schema_id; /// A list of partition specs std::vector> partition_specs; /// ID of the current partition spec that writers should use by default @@ -140,6 +136,13 @@ struct ICEBERG_EXPORT TableMetadata { std::vector> partition_statistics; /// A `long` higher than all assigned row IDs int64_t next_row_id; + + /// \brief Get the current schema, return NotFoundError if not found + Result> Schema() const; + /// \brief Get the current partition spec, return NotFoundError if not found + Result> PartitionSpec() const; + /// \brief Get the current sort order, return NotFoundError if not found + Result> SortOrder() const; }; /// \brief Returns a string representation of a SnapshotLogEntry diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b9dc6c2a7..c31c6e0e8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,12 +23,15 @@ fetchcontent_declare(googletest GTest) fetchcontent_makeavailable(googletest) +set(ICEBERG_TEST_RESOURCES "${CMAKE_SOURCE_DIR}/test/resources") + +configure_file("${CMAKE_SOURCE_DIR}/test/test_config.h.in" + "${CMAKE_BINARY_DIR}/iceberg/test/test_config.h") + add_executable(schema_test) target_sources(schema_test - PRIVATE json_internal_test.cc - schema_test.cc + PRIVATE schema_test.cc schema_field_test.cc - schema_json_test.cc type_test.cc transform_test.cc partition_field_test.cc @@ -50,6 +53,14 @@ target_link_libraries(expression_test PRIVATE iceberg_static GTest::gtest_main GTest::gmock) add_test(NAME expression_test COMMAND expression_test) +add_executable(json_serde_test) +target_include_directories(json_serde_test PRIVATE "${CMAKE_BINARY_DIR}") +target_sources(json_serde_test PRIVATE json_internal_test.cc metadata_serde_test.cc + schema_json_test.cc) +target_link_libraries(json_serde_test PRIVATE iceberg_static GTest::gtest_main + GTest::gmock) +add_test(NAME json_serde_test COMMAND json_serde_test) + if(ICEBERG_BUILD_BUNDLE) add_executable(avro_test) target_sources(avro_test PRIVATE avro_test.cc) diff --git a/test/arrow_test.cc b/test/arrow_test.cc index 85dcc91f9..52cef049a 100644 --- a/test/arrow_test.cc +++ b/test/arrow_test.cc @@ -81,11 +81,11 @@ TEST_P(ToArrowSchemaTest, PrimitiveType) { constexpr int32_t kFieldId = 1024; const auto& param = GetParam(); Schema schema( - /*schema_id=*/0, {param.optional ? SchemaField::MakeOptional(kFieldId, std::string(kFieldName), param.iceberg_type) : SchemaField::MakeRequired(kFieldId, std::string(kFieldName), - param.iceberg_type)}); + param.iceberg_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -166,9 +166,9 @@ TEST(ToArrowSchemaTest, StructType) { std::make_shared()), SchemaField::MakeOptional(kStrFieldId, std::string(kStrFieldName), std::make_shared())}); - Schema schema( - /*schema_id=*/0, {SchemaField::MakeRequired( - kStructFieldId, std::string(kStructFieldName), struct_type)}); + Schema schema({SchemaField::MakeRequired(kStructFieldId, std::string(kStructFieldName), + struct_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -199,8 +199,8 @@ TEST(ToArrowSchemaTest, ListType) { auto list_type = std::make_shared(SchemaField::MakeOptional( kElemFieldId, std::string(kElemFieldName), std::make_shared())); Schema schema( - /*schema_id=*/0, - {SchemaField::MakeRequired(kListFieldId, std::string(kListFieldName), list_type)}); + {SchemaField::MakeRequired(kListFieldId, std::string(kListFieldName), list_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); @@ -234,8 +234,8 @@ TEST(ToArrowSchemaTest, MapType) { std::make_shared())); Schema schema( - /*schema_id=*/0, - {SchemaField::MakeRequired(kFieldId, std::string(kMapFieldName), map_type)}); + {SchemaField::MakeRequired(kFieldId, std::string(kMapFieldName), map_type)}, + /*schema_id=*/0); ArrowSchema arrow_schema; ASSERT_THAT(ToArrowSchema(schema, &arrow_schema), IsOk()); diff --git a/test/json_internal_test.cc b/test/json_internal_test.cc index 6b6a22044..0ff158e5e 100644 --- a/test/json_internal_test.cc +++ b/test/json_internal_test.cc @@ -134,9 +134,10 @@ TEST(JsonPartitionTest, PartitionFieldFromJsonMissingField) { TEST(JsonPartitionTest, PartitionSpec) { auto schema = std::make_shared( - 100, std::vector{ - SchemaField(3, "region", std::make_shared(), false), - SchemaField(5, "ts", std::make_shared(), false)}); + std::vector{ + SchemaField(3, "region", std::make_shared(), false), + SchemaField(5, "ts", std::make_shared(), false)}, + /*schema_id=*/100); auto identity_transform = Transform::Identity(); PartitionSpec spec(schema, 1, diff --git a/test/metadata_serde_test.cc b/test/metadata_serde_test.cc new file mode 100644 index 000000000..d187bafb1 --- /dev/null +++ b/test/metadata_serde_test.cc @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "iceberg/json_internal.h" +#include "iceberg/partition_field.h" +#include "iceberg/partition_spec.h" +#include "iceberg/schema.h" +#include "iceberg/schema_field.h" +#include "iceberg/snapshot.h" +#include "iceberg/sort_field.h" +#include "iceberg/sort_order.h" +#include "iceberg/table_metadata.h" +#include "iceberg/test/test_config.h" +#include "iceberg/transform.h" +#include "iceberg/type.h" + +namespace iceberg { + +namespace { + +class MetadataSerdeTest : public ::testing::Test { + protected: + void SetUp() override {} + + static std::string GetResourcePath(const std::string& file_name) { + return std::string(ICEBERG_TEST_RESOURCES) + "/" + file_name; + } + + static void ReadJsonFile(const std::string& file_name, std::string* content) { + std::filesystem::path path{GetResourcePath(file_name)}; + ASSERT_TRUE(std::filesystem::exists(path)) + << "File does not exist: " << path.string(); + + std::ifstream file(path); + std::stringstream buffer; + buffer << file.rdbuf(); + *content = buffer.str(); + } + + static void ReadTableMetadata(const std::string& file_name, + std::unique_ptr* metadata) { + std::string json_content; + ReadJsonFile(file_name, &json_content); + + nlohmann::json json = nlohmann::json::parse(json_content); + auto result = TableMetadataFromJson(json); + ASSERT_TRUE(result.has_value()) << "Failed to parse table metadata from " << file_name + << ": " << result.error().message; + *metadata = std::move(result.value()); + } +}; + +} // namespace + +TEST_F(MetadataSerdeTest, DeserializeV1Valid) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV1Valid.json", &metadata)); + + EXPECT_EQ(metadata->format_version, 1); + EXPECT_EQ(metadata->table_uuid, "d20125c8-7284-442c-9aea-15fee620737c"); + EXPECT_EQ(metadata->location, "s3://bucket/test/location"); + EXPECT_EQ(metadata->last_updated_ms.time_since_epoch().count(), 1602638573874); + EXPECT_EQ(metadata->last_column_id, 3); + EXPECT_EQ(metadata->current_snapshot_id, -1); + + // Compare schema + EXPECT_EQ(metadata->current_schema_id, std::nullopt); + std::vector schema_fields; + schema_fields.emplace_back(/*field_id=*/1, "x", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/2, "y", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/3, "z", std::make_shared(), + /*optional=*/false); + auto expected_schema = + std::make_shared(schema_fields, /*schema_id=*/std::nullopt); + auto schema = metadata->Schema(); + ASSERT_TRUE(schema.has_value()); + EXPECT_EQ(*(schema.value().get()), *expected_schema); + + // Compare partition spec + std::vector partition_fields; + partition_fields.emplace_back(/*source_id=*/1, /*field_id=*/1000, /*name=*/"x", + Transform::Identity()); + auto expected_spec = + std::make_shared(expected_schema, /*spec_id=*/0, partition_fields); + auto partition_spec = metadata->PartitionSpec(); + ASSERT_TRUE(partition_spec.has_value()); + EXPECT_EQ(*(partition_spec.value().get()), *expected_spec); +} + +TEST_F(MetadataSerdeTest, DeserializeV2Valid) { + std::unique_ptr metadata; + ASSERT_NO_FATAL_FAILURE(ReadTableMetadata("TableMetadataV2Valid.json", &metadata)); + + EXPECT_EQ(metadata->format_version, 2); + EXPECT_EQ(metadata->table_uuid, "9c12d441-03fe-4693-9a96-a0705ddf69c1"); + EXPECT_EQ(metadata->location, "s3://bucket/test/location"); + EXPECT_EQ(metadata->last_updated_ms.time_since_epoch().count(), 1602638573590); + EXPECT_EQ(metadata->last_column_id, 3); + + // Compare schema + EXPECT_EQ(metadata->current_schema_id, 1); + std::vector schema_fields; + schema_fields.emplace_back(/*field_id=*/1, "x", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/2, "y", std::make_shared(), + /*optional=*/false); + schema_fields.emplace_back(/*field_id=*/3, "z", std::make_shared(), + /*optional=*/false); + auto expected_schema = + std::make_shared(std::move(schema_fields), /*schema_id=*/1); + auto schema = metadata->Schema(); + ASSERT_TRUE(schema.has_value()); + EXPECT_EQ(*(schema.value().get()), *expected_schema); + + // Compare partition spec + EXPECT_EQ(metadata->default_spec_id, 0); + std::vector partition_fields; + partition_fields.emplace_back(/*source_id=*/1, /*field_id=*/1000, /*name=*/"x", + Transform::Identity()); + auto expected_spec = std::make_shared(expected_schema, /*spec_id=*/0, + std::move(partition_fields)); + auto partition_spec = metadata->PartitionSpec(); + ASSERT_TRUE(partition_spec.has_value()); + EXPECT_EQ(*(partition_spec.value().get()), *expected_spec); + + // Compare sort order + EXPECT_EQ(metadata->default_sort_order_id, 3); + std::vector sort_fields; + sort_fields.emplace_back(/*source_id=*/2, Transform::Identity(), + SortDirection::kAscending, NullOrder::kFirst); + sort_fields.emplace_back(/*source_id=*/3, Transform::Bucket(4), + SortDirection::kDescending, NullOrder::kLast); + auto expected_sort_order = + std::make_shared(/*order_id=*/3, std::move(sort_fields)); + auto sort_order = metadata->SortOrder(); + ASSERT_TRUE(sort_order.has_value()); + EXPECT_EQ(*(sort_order.value().get()), *expected_sort_order); + + EXPECT_EQ(metadata->current_snapshot_id, 3055729675574597004); + + // Compare snapshots + std::vector expected_snapshots{{ + .snapshot_id = 3051729675574597004, + .sequence_number = 0, + .timestamp_ms = 1515100955770, + .manifest_list = "s3://a/b/1.avro", + .summary = {{"operation", "append"}}, + }, + { + .snapshot_id = 3055729675574597004, + .parent_snapshot_id = 3051729675574597004, + .sequence_number = 1, + .timestamp_ms = 1555100955770, + .manifest_list = "s3://a/b/2.avro", + .summary = {{"operation", "append"}}, + .schema_id = 1, + }}; + EXPECT_EQ(metadata->snapshots.size(), expected_snapshots.size()); + for (size_t i = 0; i < expected_snapshots.size(); ++i) { + EXPECT_EQ(*metadata->snapshots[i], expected_snapshots[i]); + } + + // Compare snapshot logs + std::vector expected_snapshot_log{ + { + .timestamp_ms = TimePointMsFromUnixMs(1515100955770).value(), + .snapshot_id = 3051729675574597004, + }, + { + .timestamp_ms = TimePointMsFromUnixMs(1555100955770).value(), + .snapshot_id = 3055729675574597004, + }}; + EXPECT_EQ(metadata->snapshot_log.size(), 2); + for (size_t i = 0; i < expected_snapshots.size(); ++i) { + EXPECT_EQ(metadata->snapshot_log[i], expected_snapshot_log[i]); + } +} + +} // namespace iceberg diff --git a/test/partition_spec_test.cc b/test/partition_spec_test.cc index 6c2cd590d..c9f4d3acd 100644 --- a/test/partition_spec_test.cc +++ b/test/partition_spec_test.cc @@ -24,6 +24,7 @@ #include #include +#include #include "iceberg/partition_field.h" #include "iceberg/schema.h" @@ -36,7 +37,8 @@ TEST(PartitionSpecTest, Basics) { { SchemaField field1(5, "ts", std::make_shared(), true); SchemaField field2(7, "bar", std::make_shared(), true); - auto const schema = std::make_shared(100, std::vector{field1, field2}); + auto const schema = + std::make_shared(std::vector{field1, field2}, 100); auto identity_transform = Transform::Identity(); PartitionField pt_field1(5, 1000, "day", identity_transform); @@ -60,7 +62,8 @@ TEST(PartitionSpecTest, Basics) { TEST(PartitionSpecTest, Equality) { SchemaField field1(5, "ts", std::make_shared(), true); SchemaField field2(7, "bar", std::make_shared(), true); - auto const schema = std::make_shared(100, std::vector{field1, field2}); + auto const schema = + std::make_shared(std::vector{field1, field2}, 100); auto identity_transform = Transform::Identity(); PartitionField pt_field1(5, 1000, "day", identity_transform); PartitionField pt_field2(7, 1001, "hour", identity_transform); diff --git a/test/resources/TableMetadataPartitionStatisticsFiles.json b/test/resources/TableMetadataPartitionStatisticsFiles.json new file mode 100644 index 000000000..df145e663 --- /dev/null +++ b/test/resources/TableMetadataPartitionStatisticsFiles.json @@ -0,0 +1,61 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 0 + } + ], + "partition-statistics": [ + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://a/b/partition-stats.parquet", + "file-size-in-bytes": 43 + } + ], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataStatisticsFiles.json b/test/resources/TableMetadataStatisticsFiles.json new file mode 100644 index 000000000..55fc67033 --- /dev/null +++ b/test/resources/TableMetadataStatisticsFiles.json @@ -0,0 +1,70 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 0, + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 0 + } + ], + "statistics": [ + { + "snapshot-id": 3055729675574597004, + "statistics-path": "s3://a/b/stats.puffin", + "file-size-in-bytes": 413, + "file-footer-size-in-bytes": 42, + "blob-metadata": [ + { + "type": "ndv", + "snapshot-id": 3055729675574597004, + "sequence-number": 1, + "fields": [1] + } + ] + } + ], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataUnsupportedVersion.json b/test/resources/TableMetadataUnsupportedVersion.json new file mode 100644 index 000000000..c40a0c9cd --- /dev/null +++ b/test/resources/TableMetadataUnsupportedVersion.json @@ -0,0 +1,36 @@ +{ + "format-version": 4, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-sequence-number": 0, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV1MissingSchemaType.json b/test/resources/TableMetadataV1MissingSchemaType.json new file mode 100644 index 000000000..dac2bbfb6 --- /dev/null +++ b/test/resources/TableMetadataV1MissingSchemaType.json @@ -0,0 +1,41 @@ +{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV1Valid.json b/test/resources/TableMetadataV1Valid.json new file mode 100644 index 000000000..9f981cc98 --- /dev/null +++ b/test/resources/TableMetadataV1Valid.json @@ -0,0 +1,42 @@ +{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} diff --git a/test/resources/TableMetadataV2CurrentSchemaNotFound.json b/test/resources/TableMetadataV2CurrentSchemaNotFound.json new file mode 100644 index 000000000..ae8858515 --- /dev/null +++ b/test/resources/TableMetadataV2CurrentSchemaNotFound.json @@ -0,0 +1,87 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 2, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingLastPartitionId.json b/test/resources/TableMetadataV2MissingLastPartitionId.json new file mode 100644 index 000000000..65af7da66 --- /dev/null +++ b/test/resources/TableMetadataV2MissingLastPartitionId.json @@ -0,0 +1,73 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingPartitionSpecs.json b/test/resources/TableMetadataV2MissingPartitionSpecs.json new file mode 100644 index 000000000..7e4b82656 --- /dev/null +++ b/test/resources/TableMetadataV2MissingPartitionSpecs.json @@ -0,0 +1,67 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingSchemas.json b/test/resources/TableMetadataV2MissingSchemas.json new file mode 100644 index 000000000..85fd03b5e --- /dev/null +++ b/test/resources/TableMetadataV2MissingSchemas.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2MissingSortOrder.json b/test/resources/TableMetadataV2MissingSortOrder.json new file mode 100644 index 000000000..52b1f8cda --- /dev/null +++ b/test/resources/TableMetadataV2MissingSortOrder.json @@ -0,0 +1,54 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2Valid.json b/test/resources/TableMetadataV2Valid.json new file mode 100644 index 000000000..3393cd7b1 --- /dev/null +++ b/test/resources/TableMetadataV2Valid.json @@ -0,0 +1,122 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} diff --git a/test/resources/TableMetadataV2ValidMinimal.json b/test/resources/TableMetadataV2ValidMinimal.json new file mode 100644 index 000000000..3976ba1ca --- /dev/null +++ b/test/resources/TableMetadataV2ValidMinimal.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ] +} diff --git a/test/resources/TableMetadataV3ValidMinimal.json b/test/resources/TableMetadataV3ValidMinimal.json new file mode 100644 index 000000000..b0411141e --- /dev/null +++ b/test/resources/TableMetadataV3ValidMinimal.json @@ -0,0 +1,73 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long", + "initial-default": 1, + "write-default": 1 + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ] +} diff --git a/test/schema_test.cc b/test/schema_test.cc index 88dde5bda..1017f17ad 100644 --- a/test/schema_test.cc +++ b/test/schema_test.cc @@ -32,7 +32,7 @@ TEST(SchemaTest, Basics) { { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); - iceberg::Schema schema(100, {field1, field2}); + iceberg::Schema schema({field1, field2}, 100); ASSERT_EQ(schema, schema); ASSERT_EQ(100, schema.schema_id()); std::span fields = schema.fields(); @@ -56,7 +56,7 @@ TEST(SchemaTest, Basics) { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(5, "bar", std::make_shared(), true); - iceberg::Schema schema(100, {field1, field2}); + iceberg::Schema schema({field1, field2}, 100); }, ::testing::ThrowsMessage( ::testing::HasSubstr("duplicate field ID 5"))); @@ -66,11 +66,11 @@ TEST(SchemaTest, Equality) { iceberg::SchemaField field1(5, "foo", std::make_shared(), true); iceberg::SchemaField field2(7, "bar", std::make_shared(), true); iceberg::SchemaField field3(5, "foobar", std::make_shared(), true); - iceberg::Schema schema1(100, {field1, field2}); - iceberg::Schema schema2(101, {field1, field2}); - iceberg::Schema schema3(101, {field1}); - iceberg::Schema schema4(101, {field3, field2}); - iceberg::Schema schema5(100, {field1, field2}); + iceberg::Schema schema1({field1, field2}, 100); + iceberg::Schema schema2({field1, field2}, 101); + iceberg::Schema schema3({field1}, 101); + iceberg::Schema schema4({field3, field2}, 101); + iceberg::Schema schema5({field1, field2}, 100); ASSERT_EQ(schema1, schema1); ASSERT_NE(schema1, schema2); diff --git a/test/test_config.h.in b/test/test_config.h.in new file mode 100644 index 000000000..0f0ec5f66 --- /dev/null +++ b/test/test_config.h.in @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#define ICEBERG_TEST_RESOURCES "@ICEBERG_TEST_RESOURCES@"