diff --git a/src/iceberg/result.h b/src/iceberg/result.h
index 99df37247..8c27579d6 100644
--- a/src/iceberg/result.h
+++ b/src/iceberg/result.h
@@ -48,6 +48,7 @@ enum class ErrorKind {
   kNotFound,
   kNotImplemented,
   kNotSupported,
+  kValidationError,
   kUnknownError,
 };

@@ -97,6 +98,7 @@ DEFINE_ERROR_FUNCTION(NotAllowed)
 DEFINE_ERROR_FUNCTION(NotFound)
 DEFINE_ERROR_FUNCTION(NotImplemented)
 DEFINE_ERROR_FUNCTION(NotSupported)
+DEFINE_ERROR_FUNCTION(ValidationError)
 DEFINE_ERROR_FUNCTION(UnknownError)

 #undef DEFINE_ERROR_FUNCTION
diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc
index 48a306611..53caf4286 100644
--- a/src/iceberg/sort_order.cc
+++ b/src/iceberg/sort_order.cc
@@ -20,9 +20,18 @@
 #include "iceberg/sort_order.h"

 #include
+#include
+#include
 #include

+#include "iceberg/exception.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/sort_field.h"
+#include "iceberg/transform.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/macros.h"

 namespace iceberg {

@@ -31,7 +40,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector fields)

 const std::shared_ptr& SortOrder::Unsorted() {
   static const std::shared_ptr unsorted =
-      std::make_shared(/*order_id=*/0, std::vector{});
+      std::make_shared(kUnsortedOrderId, std::vector{});
   return unsorted;
 }

@@ -80,4 +89,114 @@ bool SortOrder::Equals(const SortOrder& other) const {
   return order_id_ == other.order_id_ && fields_ == other.fields_;
 }

+// SortOrderBuilder implementation
+
+// Private state held behind SortOrderBuilder::impl_.
+struct SortOrderBuilder::Impl {
+  // Schema pointer exactly as handed to the builder; it is stored raw and this
+  // struct never deletes it.
+  const Schema* schema;
+  // Order ID requested through WithOrderId(); empty until that is called.
+  std::optional sort_id;
+  // Sort fields appended by the AddSortField overloads, in call order.
+  std::vector fields;
+  // Passed to Bind() when terms are resolved against the schema. The default is
+  // false, i.e. names are matched case-insensitively unless CaseSensitive(true)
+  // is called.
+  bool case_sensitive{false};
+
+  explicit Impl(const Schema* schema) : schema(schema) {}
+};
+
+// The special members are defined here, after Impl, rather than in the header.
+SortOrderBuilder::~SortOrderBuilder() = default;
+
+SortOrderBuilder::SortOrderBuilder(SortOrderBuilder&&) noexcept = default;
+
+SortOrderBuilder& SortOrderBuilder::operator=(SortOrderBuilder&&) noexcept = default;
+
+/// \brief Construct a builder whose terms are resolved against \p schema.
+///
+/// The pointer is stored as given; the builder does not take ownership of it.
+SortOrderBuilder::SortOrderBuilder(const Schema* schema)
+    : impl_(std::make_unique<Impl>(schema)) {}
+
+std::unique_ptr<SortOrderBuilder> SortOrderBuilder::BuildFromSchema(
+    const Schema* schema) {
+  // The constructor is private, so std::make_unique cannot be used here.
+  return std::unique_ptr<SortOrderBuilder>(new SortOrderBuilder(schema));  // NOLINT
+}
+
+SortOrderBuilder& SortOrderBuilder::WithOrderId(int32_t sort_id) {
+  impl_->sort_id = sort_id;
+  return *this;
+}
+
+SortOrderBuilder& SortOrderBuilder::CaseSensitive(bool case_sensitive) {
+  impl_->case_sensitive = case_sensitive;
+  return *this;
+}
+
+/// \brief Assemble a SortOrder from the collected state without consulting the
+/// schema.
+///
+/// Only the order-ID rules are enforced here: an order without fields must use
+/// SortOrder::kUnsortedOrderId, and an order with fields must not.
+///
+/// \note The misspelled name is kept because it has to match the declaration in
+/// sort_order.h.
+///
+/// \return The sort order, or InvalidArgument when the order ID breaks the rules.
+Result<std::unique_ptr<SortOrder>> SortOrderBuilder::BuildUncheckd() {
+  // An empty optional never compares equal to a value, so this is false when no
+  // order ID was requested.
+  const bool has_unsorted_id = impl_->sort_id == SortOrder::kUnsortedOrderId;
+
+  if (impl_->fields.empty()) {
+    if (impl_->sort_id.has_value() && !has_unsorted_id) {
+      return InvalidArgument("Unsorted order ID must be 0");
+    }
+    return std::make_unique<SortOrder>(SortOrder::kUnsortedOrderId,
+                                       std::vector<SortField>{});
+  }
+
+  if (has_unsorted_id) {
+    return InvalidArgument("Sort order ID 0 is reserved for unsorted order");
+  }
+
+  // Default the ID to kInitialSortOrderId because 0 is reserved for the unsorted
+  // order. The fields are copied, not moved, so the builder keeps its state:
+  // Build() can be retried after a failed compatibility check or called again
+  // without silently falling into the "no fields" branch above.
+  return std::make_unique<SortOrder>(
+      impl_->sort_id.value_or(SortOrder::kInitialSortOrderId), impl_->fields);
+}
+
+/// \brief Build the sort order and validate it against the builder's schema.
+///
+/// \return The sort order, or the first error reported by BuildUncheckd() or
+/// CheckCompatibility().
+Result<std::unique_ptr<SortOrder>> SortOrderBuilder::Build() {
+  ICEBERG_ASSIGN_OR_RAISE(auto sort_order, BuildUncheckd());
+  ICEBERG_RETURN_UNEXPECTED(CheckCompatibility(sort_order, impl_->schema));
+  return sort_order;
+}
+
+SortOrderBuilder& SortOrderBuilder::AddSortField(
+    int32_t source_id, const std::shared_ptr<Transform>& transform,
+    SortDirection direction, NullOrder null_order) {
+  // Nothing is validated here; the field is checked against the schema in Build().
+  impl_->fields.emplace_back(source_id, transform, direction, null_order);
+  return *this;
+}
+
+/// \brief Bind \p term to the schema and append the resulting sort field.
+///
+/// A named reference becomes an identity-transform field on the referenced column;
+/// an unbound transform becomes a field carrying that transform. Any other term,
+/// including a null one, is rejected with an IcebergError.
+SortOrderBuilder& SortOrderBuilder::AddSortField(const std::shared_ptr<Term>& term,
+                                                 SortDirection direction,
+                                                 NullOrder null_order) {
+  // Both Bind() calls below dereference the schema, so reject a missing one first
+  // instead of dereferencing a null pointer.
+  ICEBERG_CHECK(impl_->schema != nullptr, "Cannot bind a sort term without a schema.");
+
+  if (auto named_ref = std::dynamic_pointer_cast<NamedReference>(term)) {
+    auto bound_ref = named_ref->Bind(*impl_->schema, impl_->case_sensitive);
+    ICEBERG_CHECK(bound_ref.has_value(), "Failed to bind named reference to schema.");
+    int32_t source_id = bound_ref.value()->field().field_id();
+    impl_->fields.emplace_back(source_id, Transform::Identity(), direction, null_order);
+  } else if (auto unbound_transform =
+                 std::dynamic_pointer_cast<UnboundTransform>(term)) {
+    auto bound_transform = unbound_transform->Bind(*impl_->schema, impl_->case_sensitive);
+    ICEBERG_CHECK(bound_transform.has_value(),
+                  "Failed to bind unbound transform to schema.");
+    int32_t source_id = bound_transform.value()->reference()->field().field_id();
+    impl_->fields.emplace_back(source_id, bound_transform.value()->transform(), direction,
+                               null_order);
+  } else {
+    throw IcebergError(std::format(
+        "Invalid term: {}, expected either a named reference or an unbound transform",
+        term ? term->ToString() : "null"));
+  }
+
+  return *this;
+}
+
+/// \brief Check that every field of \p sort_order can be applied to \p schema.
+///
+/// Each field must carry a transform, reference an existing column of primitive
+/// type, and use a transform that accepts that column's type.
+///
+/// \return An empty Status on success, otherwise the first problem found.
+Status SortOrderBuilder::CheckCompatibility(const std::unique_ptr<SortOrder>& sort_order,
+                                            const Schema* schema) {
+  if (sort_order == nullptr) {
+    return InvalidArgument("Cannot check compatibility of a null sort order");
+  }
+
+  for (const auto& field : sort_order->fields()) {
+    // Checked inside the loop so that an order without fields still passes when
+    // no schema was supplied, exactly as before.
+    if (schema == nullptr) {
+      return InvalidArgument("Cannot validate sort fields without a schema");
+    }
+
+    // AddSortField(int32_t, ...) stores whatever transform pointer it is given, so
+    // guard against a null one before it is dereferenced (or formatted) below.
+    if (field.transform() == nullptr) {
+      return ValidationError("Sort field on source column {} has no transform",
+                             field.source_id());
+    }
+
+    ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema->FindFieldById(field.source_id()));
+    if (!schema_field.has_value()) {
+      return ValidationError("Cannot find source column for sort field: {}", field);
+    }
+
+    const auto& source_type = schema_field.value().get().type();
+
+    if (!source_type->is_primitive()) {
+      return ValidationError("Cannot sort by non-primitive source field: {}",
+                             *source_type);
+    }
+
+    // Only the error matters here; the computed result type itself is discarded.
+    ICEBERG_RETURN_UNEXPECTED(field.transform()->ResultType(source_type));
+  }
+  return {};
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h
index e245a74a2..226dddd47 100644
--- a/src/iceberg/sort_order.h
+++ b/src/iceberg/sort_order.h
@@ -20,11 +20,15 @@
 #pragma once

 #include
+#include
 #include
 #include

+#include "iceberg/expression/expressions.h"
+#include "iceberg/expression/term.h"
 #include "iceberg/iceberg_export.h"
 #include "iceberg/sort_field.h"
+#include "iceberg/type_fwd.h"
 #include "iceberg/util/formattable.h"

 namespace iceberg {

@@ -36,6 +40,7 @@ namespace iceberg {
 /// applied to the data.
 class ICEBERG_EXPORT SortOrder : public util::Formattable {
  public:
+  static constexpr int32_t kUnsortedOrderId = 0;
   static constexpr int32_t kInitialSortOrderId = 1;

   SortOrder(int32_t order_id, std::vector fields);
@@ -77,4 +82,85 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
   std::vector fields_;
 };

+/// \brief A builder used to create valid SortOrder instances.
+///
+/// Fields are collected through the fluent methods below; Build() then assembles
+/// the SortOrder and validates it against the schema given at creation.
+class ICEBERG_EXPORT SortOrderBuilder {
+ public:
+  /// \brief Create a builder for a new SortOrder.
+  ///
+  /// \param schema The schema that sort terms are resolved and validated against.
+  /// It is passed as a raw pointer, so the builder does not own it.
+  /// \return A new SortOrderBuilder instance initialized with the schema.
+  static std::unique_ptr BuildFromSchema(const Schema* schema);
+
+  /// \brief Add an expression term to the sort, ascending with the given null order.
+  SortOrderBuilder& Asc(const std::shared_ptr& term, NullOrder null_order) {
+    return AddSortField(term, SortDirection::kAscending, null_order);
+  }
+
+  /// \brief Add an expression term to the sort, descending with the given null order.
+  SortOrderBuilder& Desc(const std::shared_ptr& term, NullOrder null_order) {
+    return AddSortField(term, SortDirection::kDescending, null_order);
+  }
+
+  /// \brief Add a sort field on the column called \p name.
+  ///
+  /// The name is wrapped in a reference expression (Expressions::Ref) and handled
+  /// like any other term.
+  SortOrderBuilder& SortBy(std::string name, SortDirection direction,
+                           NullOrder null_order) {
+    return AddSortField(Expressions::Ref(std::move(name)), direction, null_order);
+  }
+
+  /// \brief Add an expression term to the sort with an explicit direction and null
+  /// order.
+  SortOrderBuilder& SortBy(const std::shared_ptr& term, SortDirection direction,
+                           NullOrder null_order) {
+    return AddSortField(term, direction, null_order);
+  }
+
+  /// \brief Set the order ID of the sort order being built.
+  SortOrderBuilder& WithOrderId(int32_t sort_id);
+
+  /// \brief Set whether names are matched case-sensitively when terms are bound to
+  /// the schema.
+  SortOrderBuilder& CaseSensitive(bool case_sensitive);
+
+  /// \brief Add a sort field to the sort order with the specified source field ID,
+  /// transform, direction, and null order.
+  ///
+  /// \param source_id The source field ID.
+  /// \param transform The transform to apply to the field.
+  /// \param direction The sort direction.
+  /// \param null_order The null ordering behavior (e.g., nulls first or nulls last).
+  SortOrderBuilder& AddSortField(int32_t source_id,
+                                 const std::shared_ptr& transform,
+                                 SortDirection direction, NullOrder null_order);
+
+  /// \brief Builds a SortOrder instance and checks it against the schema.
+  ///
+  /// \return A Result containing the constructed SortOrder or an error
+  Result> Build();
+
+  /// \brief Destructor
+  ~SortOrderBuilder();
+
+  // Delete copy operations (use BuildFromSchema to create a new builder)
+  SortOrderBuilder(const SortOrderBuilder&) = delete;
+  SortOrderBuilder& operator=(const SortOrderBuilder&) = delete;
+
+  // Enable move operations
+  SortOrderBuilder(SortOrderBuilder&&) noexcept;
+  SortOrderBuilder& operator=(SortOrderBuilder&&) noexcept;
+
+ private:
+  /// \brief Private constructor for building from Schema
+  explicit SortOrderBuilder(const Schema* schema);
+
+  /// \brief Shared implementation behind Asc(), Desc() and SortBy(): resolves
+  /// \p term and appends the resulting sort field.
+  SortOrderBuilder& AddSortField(const std::shared_ptr& term,
+                                 SortDirection direction, NullOrder null_order);
+
+  /// \brief Builds an unchecked SortOrder instance.
+  Result> BuildUncheckd();
+
+  /// \brief Checks that the fields of \p sort_order are valid for \p schema.
+  static Status CheckCompatibility(const std::unique_ptr& sort_order,
+                                   const Schema* schema);
+
+  /// Internal state members
+  struct Impl;
+  std::unique_ptr impl_;
+};
+
 }  // namespace iceberg
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index fd8cbc972..9a9589d6b 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -73,6 +73,7 @@ add_iceberg_test(schema_test
                  partition_field_test.cc
                  partition_spec_test.cc
                  sort_field_test.cc
+                 sort_order_builder_test.cc
                  sort_order_test.cc
                  snapshot_test.cc
                  schema_util_test.cc)
diff --git a/src/iceberg/test/meson.build b/src/iceberg/test/meson.build
index 22ed4bdde..7395f8459 100644
--- a/src/iceberg/test/meson.build
+++ b/src/iceberg/test/meson.build
@@ -38,6 +38,7 @@ iceberg_tests = {
             'schema_util_test.cc',
             'snapshot_test.cc',
             'sort_field_test.cc',
+            'sort_order_builder_test.cc',
             'sort_order_test.cc',
             'transform_test.cc',
             'type_test.cc',
diff --git a/src/iceberg/test/schema_field_test.cc b/src/iceberg/test/schema_field_test.cc
index bc0dbbdf3..1078aadf2 100644
--- a/src/iceberg/test/schema_field_test.cc
+++ b/src/iceberg/test/schema_field_test.cc
@@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) {
   iceberg::SchemaField field1(1, "foo", iceberg::int32(), false);
   iceberg::SchemaField field2(2, "foo", iceberg::int32(), false);
   iceberg::SchemaField field3(1, "bar", iceberg::int32(), false);
-  iceberg::SchemaField field4(1, "foo", std::make_shared(), false);
+  iceberg::SchemaField field4(1, "foo", iceberg::int64(), false);
   iceberg::SchemaField field5(1, "foo", iceberg::int32(), true);
   iceberg::SchemaField field6(1, "foo", iceberg::int32(), false);

diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc
index d99e9b3c3..89a8d54b5 100644
--- a/src/iceberg/test/schema_test.cc
+++ b/src/iceberg/test/schema_test.cc
@@ -25,7 +25,6 @@
 #include
 #include

-#include "gtest/gtest.h"
 #include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/test/matchers.h"
diff --git a/src/iceberg/test/sort_order_builder_test.cc b/src/iceberg/test/sort_order_builder_test.cc
new file mode 100644
index 000000000..0d04fdeb1
--- /dev/null
+++ b/src/iceberg/test/sort_order_builder_test.cc
@@ -0,0 +1,174 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include
+#include
+
+#include
+
+#include "iceberg/expression/expressions.h"
+#include "iceberg/schema.h"
+#include "iceberg/schema_field.h"
+#include "iceberg/sort_field.h"
+#include "iceberg/sort_order.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/transform.h"
+#include "iceberg/type.h"
+#include "iceberg/util/formatter.h"  // IWYU pragma: keep
+
+namespace iceberg {
+
+// Fixture providing a three-column schema (x: int32 with id 1, y: string with
+// id 2, time: timestamp with id 3) and a few terms that refer to those columns.
+class SortOrderBuilderTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    field1_ = std::make_unique(1, "x", int32(), true);
+    field2_ = std::make_unique(2, "y", string(), true);
+    field3_ = std::make_unique(3, "time", timestamp(), true);
+
+    schema_ = std::make_unique(
+        std::vector{*field1_, *field2_, *field3_}, 1);
+
+    term1_ = Expressions::Ref("x");
+    term2_ = Expressions::Ref("y");
+    term3_ = Expressions::Bucket("x", 10);
+    term4_ = Expressions::Day("time");
+  }
+
+  std::unique_ptr schema_;
+  std::unique_ptr field1_;
+  std::unique_ptr field2_;
+  std::unique_ptr field3_;
+
+  // NamedReference
+  std::shared_ptr term1_;
+  std::shared_ptr term2_;
+
+  // UnboundTransform
+  std::shared_ptr term3_;
+  std::shared_ptr term4_;
+};
+
+// Asserts the source id, direction and transform of a built sort field. The null
+// order is not compared.
+void AssertSortField(const SortField& field, int source_id, SortDirection direction,
+                     const Transform& transform) {
+  ASSERT_EQ(field.source_id(), source_id);
+  ASSERT_EQ(field.direction(), direction);
+  ASSERT_EQ(*field.transform(), transform);
+}
+
+// Asc() on a reference to "x" yields an ascending identity field on column 1.
+TEST_F(SortOrderBuilderTest, Asc) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->Asc(term1_, NullOrder::kFirst).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 1, SortDirection::kAscending,
+                  *Transform::Identity());
+}
+
+// Desc() on a reference to "x" yields a descending identity field on column 1.
+TEST_F(SortOrderBuilderTest, Desc) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->Desc(term1_, NullOrder::kFirst).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 1, SortDirection::kDescending,
+                  *Transform::Identity());
+}
+
+// SortBy() with a column name resolves "y" to column 2.
+TEST_F(SortOrderBuilderTest, SortBy) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->SortBy("y", SortDirection::kAscending, NullOrder::kFirst).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 2, SortDirection::kAscending,
+                  *Transform::Identity());
+}
+
+// SortBy() with a reference term behaves like the name-based overload.
+TEST_F(SortOrderBuilderTest, SortByTerm) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->SortBy(term2_, SortDirection::kAscending, NullOrder::kFirst).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 2, SortDirection::kAscending,
+                  *Transform::Identity());
+}
+
+// With case sensitivity switched off, "Y" still resolves to column "y" (id 2).
+TEST_F(SortOrderBuilderTest, CaseSensitive) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->CaseSensitive(false)
+      .SortBy("Y", SortDirection::kAscending, NullOrder::kFirst)
+      .WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 2, SortDirection::kAscending,
+                  *Transform::Identity());
+}
+
+// AddSortField() with an explicit source id and transform is kept as given.
+TEST_F(SortOrderBuilderTest, AddSortField) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder
+      ->AddSortField(3, Transform::Month(), SortDirection::kAscending, NullOrder::kFirst)
+      .WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0],
3, SortDirection::kAscending,
+                  *Transform::Month());
+}
+
+TEST_F(SortOrderBuilderTest, BucketTransform) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->SortBy(term3_, SortDirection::kAscending, NullOrder::kFirst).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 1, SortDirection::kAscending,
+                  *Transform::Bucket(10));
+}
+
+TEST_F(SortOrderBuilderTest, DayTransform) {
+  auto builder = SortOrderBuilder::BuildFromSchema(schema_.get());
+  builder->SortBy(term4_, SortDirection::kDescending, NullOrder::kLast).WithOrderId(1);
+
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, builder->Build());
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  AssertSortField(sort_order->fields()[0], 3, SortDirection::kDescending,
+                  *Transform::Day());
+}
+
+}  // namespace iceberg
diff --git a/src/iceberg/test/sort_order_test.cc b/src/iceberg/test/sort_order_test.cc
index bb407afae..fe58faf1c 100644
--- a/src/iceberg/test/sort_order_test.cc
+++ b/src/iceberg/test/sort_order_test.cc
@@ -24,7 +24,6 @@

 #include

-#include "iceberg/schema.h"
 #include "iceberg/sort_field.h"
 #include "iceberg/transform.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc
index 4724cc18e..fd2d46ad4 100644
--- a/src/iceberg/transform.cc
+++ b/src/iceberg/transform.cc
@@ -23,6 +23,7 @@
 #include
 #include

+#include "iceberg/result.h"
 #include "iceberg/transform_function.h"
 #include "iceberg/type.h"

@@ -125,6 +126,80 @@ Result> Transform::Bind(
   }
 }

+// Computes the type this transform produces for `source_type`, or an
+// InvalidArgument error when the transform does not accept that source type.
+Result<std::shared_ptr<Type>> Transform::ResultType(
+    const std::shared_ptr<Type>& source_type) const {
+  // Most branches below dereference the source type, so reject a missing one
+  // up front instead of dereferencing a null pointer.
+  if (source_type == nullptr) [[unlikely]] {
+    return InvalidArgument("Cannot determine the result type of {} transform: "
+                           "source type is null",
+                           this->ToString());
+  }
+
+  switch (transform_type_) {
+    case TransformType::kIdentity:
+      if (!source_type->is_primitive()) [[unlikely]] {
+        return InvalidArgument("{} is not a valid input type of identity transform",
+                               source_type->ToString());
+      }
+      return source_type;
+    case TransformType::kVoid:
+      return source_type;
+    case TransformType::kUnknown:
+      return string();
+    case TransformType::kBucket:
+      // The Iceberg spec allows bucketing string, uuid, fixed and binary values
+      // in addition to the numeric and temporal types.
+      switch (source_type->type_id()) {
+        case TypeId::kInt:
+        case TypeId::kLong:
+        case TypeId::kDecimal:
+        case TypeId::kDate:
+        case TypeId::kTime:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+        case TypeId::kString:
+        case TypeId::kUuid:
+        case TypeId::kFixed:
+        case TypeId::kBinary:
+          return int32();
+        default:
+          return InvalidArgument("{} is not a valid input type of bucket transform",
+                                 source_type->ToString());
+      }
+    case TransformType::kTruncate:
+      // Truncation keeps the source type.
+      switch (source_type->type_id()) {
+        case TypeId::kInt:
+        case TypeId::kLong:
+        case TypeId::kString:
+        case TypeId::kBinary:
+        case TypeId::kDecimal:
+          return source_type;
+        default:
+          return InvalidArgument("{} is not a valid input type of truncate transform",
+                                 source_type->ToString());
+      }
+    case TransformType::kYear:
+    case TransformType::kMonth:
+      switch (source_type->type_id()) {
+        case TypeId::kDate:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return int32();
+        default:
+          return InvalidArgument("{} is not a valid input type of {} transform",
+                                 source_type->ToString(), this->ToString());
+      }
+    case TransformType::kDay:
+      switch (source_type->type_id()) {
+        case TypeId::kDate:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return date();
+        default:
+          return InvalidArgument("{} is not a valid input type of day transform",
+                                 source_type->ToString());
+      }
+    case TransformType::kHour:
+      switch (source_type->type_id()) {
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return int32();
+        default:
+          return InvalidArgument("{} is not a valid input type of hour transform",
+                                 source_type->ToString());
+      }
+    default:
+      std::unreachable();
+  }
+}
+
 bool Transform::PreservesOrder() const {
   switch (transform_type_) {
     case TransformType::kUnknown:
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
index 21483e0c4..4c0fb1923 100644
--- a/src/iceberg/transform.h
+++ b/src/iceberg/transform.h
@@ -150,6 +150,10 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
   Result> Bind(
       const std::shared_ptr& source_type) const;

+  /// \brief Returns the Type produced by this transform given a source type.
+  ///
+  /// \param source_type The type of the source column the transform is applied to.
+  /// \return The result type, or an InvalidArgument error when this transform
+  /// cannot be applied to \p source_type.
+  Result<std::shared_ptr<Type>> ResultType(
+      const std::shared_ptr<Type>& source_type) const;
+
   /// \brief Whether the transform preserves the order of values (is monotonic).
   bool PreservesOrder() const;