Skip to content

Commit e759807

Browse files
authored
feat: add SortOrder::Make and SortOrder::Validate (#300)
1 parent 9b0626c commit e759807

File tree

8 files changed

+345
-3
lines changed

8 files changed

+345
-3
lines changed

src/iceberg/sort_order.cc

Lines changed: 63 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@
2020
#include "iceberg/sort_order.h"
2121

2222
#include <format>
23+
#include <optional>
2324
#include <ranges>
2425

26+
#include "iceberg/result.h"
27+
#include "iceberg/schema.h"
28+
#include "iceberg/sort_field.h"
29+
#include "iceberg/transform.h"
2530
#include "iceberg/util/formatter.h" // IWYU pragma: keep
31+
#include "iceberg/util/macros.h"
2632

2733
namespace iceberg {
2834

@@ -31,7 +37,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector<SortField> fields)
3137

3238
const std::shared_ptr<SortOrder>& SortOrder::Unsorted() {
3339
static const std::shared_ptr<SortOrder> unsorted =
34-
std::make_shared<SortOrder>(/*order_id=*/0, std::vector<SortField>{});
40+
std::make_shared<SortOrder>(kUnsortedOrderId, std::vector<SortField>{});
3541
return unsorted;
3642
}
3743

@@ -80,4 +86,60 @@ bool SortOrder::Equals(const SortOrder& other) const {
8086
return order_id_ == other.order_id_ && fields_ == other.fields_;
8187
}
8288

89+
Status SortOrder::Validate(const Schema& schema) const {
90+
for (const auto& field : fields_) {
91+
auto schema_field = schema.FindFieldById(field.source_id());
92+
if (!schema_field.has_value() || schema_field.value() == std::nullopt) {
93+
return InvalidArgument("Cannot find source column for sort field: {}", field);
94+
}
95+
96+
const auto& source_type = schema_field.value().value().get().type();
97+
98+
if (!field.transform()->CanTransform(*source_type)) {
99+
return InvalidArgument("Invalid source type {} for transform {}",
100+
source_type->ToString(), field.transform()->ToString());
101+
}
102+
}
103+
return {};
104+
}
105+
106+
Result<std::unique_ptr<SortOrder>> SortOrder::Make(const Schema& schema, int32_t sort_id,
107+
std::vector<SortField> fields) {
108+
if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
109+
return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId);
110+
}
111+
112+
if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
113+
return InvalidArgument("Sort order must have at least one sort field");
114+
}
115+
116+
for (const auto& field : fields) {
117+
ICEBERG_ASSIGN_OR_RAISE(auto schema_field, schema.FindFieldById(field.source_id()));
118+
if (schema_field == std::nullopt) {
119+
return InvalidArgument("Cannot find source column for sort field: {}", field);
120+
}
121+
122+
const auto& source_type = schema_field.value().get().type();
123+
if (field.transform()->CanTransform(*source_type) == false) {
124+
return InvalidArgument("Invalid source type {} for transform {}",
125+
source_type->ToString(), field.transform()->ToString());
126+
}
127+
}
128+
129+
return std::make_unique<SortOrder>(sort_id, std::move(fields));
130+
}
131+
132+
Result<std::unique_ptr<SortOrder>> SortOrder::Make(int32_t sort_id,
133+
std::vector<SortField> fields) {
134+
if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
135+
return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId);
136+
}
137+
138+
if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
139+
return InvalidArgument("Sort order must have at least one sort field");
140+
}
141+
142+
return std::make_unique<SortOrder>(sort_id, std::move(fields));
143+
}
144+
83145
} // namespace iceberg

src/iceberg/sort_order.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@
2020
#pragma once
2121

2222
#include <cstdint>
23+
#include <memory>
2324
#include <span>
2425
#include <vector>
2526

2627
#include "iceberg/iceberg_export.h"
2728
#include "iceberg/sort_field.h"
29+
#include "iceberg/type_fwd.h"
2830
#include "iceberg/util/formattable.h"
2931

3032
namespace iceberg {
@@ -36,6 +38,7 @@ namespace iceberg {
3638
/// applied to the data.
3739
class ICEBERG_EXPORT SortOrder : public util::Formattable {
3840
public:
41+
static constexpr int32_t kUnsortedOrderId = 0;
3942
static constexpr int32_t kInitialSortOrderId = 1;
4043

4144
SortOrder(int32_t order_id, std::vector<SortField> fields);
@@ -69,6 +72,28 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
6972
return lhs.Equals(rhs);
7073
}
7174

75+
/// \brief Validates the sort order against a schema.
76+
/// \param schema The schema to validate against.
77+
/// \return Error status if the sort order has any invalid transform.
78+
Status Validate(const Schema& schema) const;
79+
80+
/// \brief Create a SortOrder.
81+
/// \param schema The schema to bind the sort order to.
82+
/// \param sort_id The sort order id.
83+
/// \param fields The sort fields.
84+
/// \return A Result containing the SortOrder or an error.
85+
static Result<std::unique_ptr<SortOrder>> Make(const Schema& schema, int32_t sort_id,
86+
std::vector<SortField> fields);
87+
88+
/// \brief Create a SortOrder without binding to a schema.
89+
/// \param sort_id The sort order id.
90+
/// \param fields The sort fields.
91+
/// \return A Result containing the SortOrder or an error.
92+
/// \note This method does not check whether the sort fields are valid for any schema.
93+
/// Use IsBoundToSchema to check if the sort order is valid for a given schema.
94+
static Result<std::unique_ptr<SortOrder>> Make(int32_t sort_id,
95+
std::vector<SortField> fields);
96+
7297
private:
7398
/// \brief Compare two sort orders for equality.
7499
bool Equals(const SortOrder& other) const;

src/iceberg/test/schema_field_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) {
6363
iceberg::SchemaField field1(1, "foo", iceberg::int32(), false);
6464
iceberg::SchemaField field2(2, "foo", iceberg::int32(), false);
6565
iceberg::SchemaField field3(1, "bar", iceberg::int32(), false);
66-
iceberg::SchemaField field4(1, "foo", std::make_shared<iceberg::LongType>(), false);
66+
iceberg::SchemaField field4(1, "foo", iceberg::int64(), false);
6767
iceberg::SchemaField field5(1, "foo", iceberg::int32(), true);
6868
iceberg::SchemaField field6(1, "foo", iceberg::int32(), false);
6969

src/iceberg/test/schema_test.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
#include <gmock/gmock.h>
2626
#include <gtest/gtest.h>
2727

28-
#include "gtest/gtest.h"
2928
#include "iceberg/result.h"
3029
#include "iceberg/schema_field.h"
3130
#include "iceberg/test/matchers.h"

src/iceberg/test/sort_order_test.cc

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,40 @@
2626

2727
#include "iceberg/schema.h"
2828
#include "iceberg/sort_field.h"
29+
#include "iceberg/test/matchers.h"
2930
#include "iceberg/transform.h"
3031
#include "iceberg/util/formatter.h" // IWYU pragma: keep
3132

3233
namespace iceberg {
3334

35+
class SortOrderMakeTest : public ::testing::Test {
36+
protected:
37+
void SetUp() override {
38+
field1_ = std::make_unique<SchemaField>(1, "x", int32(), true);
39+
field2_ = std::make_unique<SchemaField>(2, "y", string(), true);
40+
field3_ = std::make_unique<SchemaField>(3, "time", timestamp(), true);
41+
42+
schema_ = std::make_unique<Schema>(
43+
std::vector<SchemaField>{*field1_, *field2_, *field3_}, 1);
44+
45+
sort_field1_ = std::make_unique<SortField>(
46+
1, Transform::Identity(), SortDirection::kAscending, NullOrder::kFirst);
47+
sort_field2_ = std::make_unique<SortField>(
48+
2, Transform::Bucket(10), SortDirection::kDescending, NullOrder::kLast);
49+
sort_field3_ = std::make_unique<SortField>(
50+
3, Transform::Day(), SortDirection::kAscending, NullOrder::kFirst);
51+
}
52+
53+
std::unique_ptr<Schema> schema_;
54+
std::unique_ptr<SchemaField> field1_;
55+
std::unique_ptr<SchemaField> field2_;
56+
std::unique_ptr<SchemaField> field3_;
57+
58+
std::unique_ptr<SortField> sort_field1_;
59+
std::unique_ptr<SortField> sort_field2_;
60+
std::unique_ptr<SortField> sort_field3_;
61+
};
62+
3463
TEST(SortOrderTest, Basics) {
3564
{
3665
SchemaField field1(5, "ts", iceberg::timestamp(), true);
@@ -148,4 +177,84 @@ TEST(SortOrderTest, Satisfies) {
148177
EXPECT_FALSE(sort_order2.Satisfies(sort_order4));
149178
}
150179

180+
TEST_F(SortOrderMakeTest, MakeValidSortOrder) {
181+
ICEBERG_UNWRAP_OR_FAIL(
182+
auto sort_order,
183+
SortOrder::Make(*schema_, 1, std::vector<SortField>{*sort_field1_, *sort_field2_}));
184+
ASSERT_NE(sort_order, nullptr);
185+
186+
EXPECT_TRUE(sort_order->is_sorted());
187+
ASSERT_EQ(sort_order->fields().size(), 2);
188+
EXPECT_EQ(sort_order->fields()[0], *sort_field1_);
189+
EXPECT_EQ(sort_order->fields()[1], *sort_field2_);
190+
}
191+
192+
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderEmptyFields) {
193+
auto sort_order = SortOrder::Make(*schema_, 1, std::vector<SortField>{});
194+
EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
195+
EXPECT_THAT(sort_order,
196+
HasErrorMessage("Sort order must have at least one sort field"));
197+
}
198+
199+
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderUnsortedId) {
200+
auto sort_order = SortOrder::Make(*schema_, SortOrder::kUnsortedOrderId,
201+
std::vector<SortField>{*sort_field1_});
202+
EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
203+
EXPECT_THAT(sort_order,
204+
HasErrorMessage(std::format("{} is reserved for unsorted sort order",
205+
SortOrder::kUnsortedOrderId)));
206+
}
207+
208+
TEST_F(SortOrderMakeTest, MakeValidUnsortedSortOrder) {
209+
ICEBERG_UNWRAP_OR_FAIL(auto sort_order, SortOrder::Make(SortOrder::kUnsortedOrderId,
210+
std::vector<SortField>{}));
211+
ASSERT_NE(sort_order, nullptr);
212+
213+
EXPECT_TRUE(sort_order->is_unsorted());
214+
EXPECT_EQ(sort_order->fields().size(), 0);
215+
}
216+
217+
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderNonPrimitiveField) {
218+
auto struct_field = std::make_unique<SchemaField>(
219+
4, "struct_field",
220+
std::make_shared<StructType>(std::vector<SchemaField>{
221+
SchemaField::MakeRequired(41, "inner_field", iceberg::int32()),
222+
}),
223+
true);
224+
225+
Schema schema_with_struct(
226+
std::vector<SchemaField>{*field1_, *field2_, *field3_, *struct_field}, 1);
227+
228+
SortField sort_field_invalid(4, Transform::Identity(), SortDirection::kAscending,
229+
NullOrder::kFirst);
230+
231+
auto sort_order = SortOrder::Make(
232+
schema_with_struct, 1, std::vector<SortField>{*sort_field1_, sort_field_invalid});
233+
EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
234+
EXPECT_THAT(sort_order, HasErrorMessage("Invalid source type"));
235+
}
236+
237+
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderFieldNotInSchema) {
238+
SortField sort_field_invalid(999, Transform::Identity(), SortDirection::kAscending,
239+
NullOrder::kFirst);
240+
241+
auto sort_order = SortOrder::Make(
242+
*schema_, 1, std::vector<SortField>{*sort_field1_, sort_field_invalid});
243+
EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
244+
EXPECT_THAT(sort_order, HasErrorMessage("Cannot find source column for sort field"));
245+
}
246+
247+
TEST_F(SortOrderMakeTest, MakeUnboundSortOrder) {
248+
SortField sort_field_invalid(999, Transform::Identity(), SortDirection::kAscending,
249+
NullOrder::kFirst);
250+
251+
auto sort_order =
252+
SortOrder::Make(1, std::vector<SortField>{*sort_field1_, sort_field_invalid});
253+
ASSERT_THAT(sort_order, IsOk());
254+
auto validate_status = sort_order.value()->Validate(*schema_);
255+
EXPECT_THAT(validate_status, IsError(ErrorKind::kInvalidArgument));
256+
EXPECT_THAT(validate_status,
257+
HasErrorMessage("Cannot find source column for sort field"));
258+
}
259+
151260
} // namespace iceberg

src/iceberg/test/transform_test.cc

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <gtest/gtest.h>
2828

2929
#include "iceberg/expression/literal.h"
30+
#include "iceberg/schema_field.h"
3031
#include "iceberg/test/matchers.h"
3132
#include "iceberg/test/temporal_test_helper.h"
3233
#include "iceberg/type.h"
@@ -841,4 +842,76 @@ TEST(TransformSatisfiesOrderOfTest, SatisfiesOrderOf) {
841842
}
842843
}
843844

845+
TEST(TransformCanTransformTest, CanTransform) {
846+
struct Case {
847+
std::string transform_str;
848+
std::shared_ptr<Type> source_type;
849+
bool expected;
850+
};
851+
852+
const std::vector<Case> cases = {
853+
// Identity can transform all primitive types
854+
{.transform_str = "identity", .source_type = int32(), .expected = true},
855+
{.transform_str = "identity", .source_type = string(), .expected = true},
856+
{.transform_str = "identity", .source_type = boolean(), .expected = true},
857+
{.transform_str = "identity",
858+
.source_type = list(SchemaField(123, "element", int32(), false)),
859+
.expected = false},
860+
861+
// Void can transform any type
862+
{.transform_str = "void", .source_type = iceberg::int32(), .expected = true},
863+
{.transform_str = "void",
864+
.source_type = iceberg::map(SchemaField(123, "key", iceberg::string(), false),
865+
SchemaField(124, "value", iceberg::int32(), true)),
866+
.expected = true},
867+
868+
// Bucket can transform specific types
869+
{.transform_str = "bucket[16]", .source_type = iceberg::int32(), .expected = true},
870+
{.transform_str = "bucket[16]", .source_type = iceberg::string(), .expected = true},
871+
{.transform_str = "bucket[16]",
872+
.source_type = iceberg::float32(),
873+
.expected = false},
874+
875+
// Truncate can transform specific types
876+
{.transform_str = "truncate[32]",
877+
.source_type = iceberg::int32(),
878+
.expected = true},
879+
{.transform_str = "truncate[32]",
880+
.source_type = iceberg::string(),
881+
.expected = true},
882+
{.transform_str = "truncate[32]",
883+
.source_type = iceberg::boolean(),
884+
.expected = false},
885+
886+
// Year can transform date and timestamp types
887+
{.transform_str = "year", .source_type = iceberg::date(), .expected = true},
888+
{.transform_str = "year", .source_type = iceberg::timestamp(), .expected = true},
889+
{.transform_str = "year", .source_type = iceberg::string(), .expected = false},
890+
891+
// Month can transform date and timestamp types
892+
{.transform_str = "month", .source_type = iceberg::date(), .expected = true},
893+
{.transform_str = "month", .source_type = iceberg::timestamp(), .expected = true},
894+
{.transform_str = "month", .source_type = iceberg::binary(), .expected = false},
895+
896+
// Day can transform date and timestamp types
897+
{.transform_str = "day", .source_type = iceberg::date(), .expected = true},
898+
{.transform_str = "day", .source_type = iceberg::timestamp(), .expected = true},
899+
{.transform_str = "day", .source_type = iceberg::uuid(), .expected = false},
900+
901+
// Hour can transform timestamp types
902+
{.transform_str = "hour", .source_type = iceberg::timestamp(), .expected = true},
903+
{.transform_str = "hour", .source_type = iceberg::timestamp_tz(), .expected = true},
904+
{.transform_str = "hour", .source_type = iceberg::int32(), .expected = false},
905+
};
906+
907+
for (const auto& c : cases) {
908+
auto transform = TransformFromString(c.transform_str);
909+
ASSERT_TRUE(transform.has_value()) << "Failed to parse: " << c.transform_str;
910+
911+
EXPECT_EQ(transform.value()->CanTransform(*c.source_type), c.expected)
912+
<< "Unexpected result for transform: " << c.transform_str
913+
<< " and source type: " << c.source_type->ToString();
914+
}
915+
}
916+
844917
} // namespace iceberg

0 commit comments

Comments
 (0)