Skip to content

Commit 8ac5ab2

Browse files
committed
feat: add SortOrder::Make and SortOrder::IsBoundToSchema
Signed-off-by: Junwang Zhao <[email protected]>
1 parent 1c431b6 commit 8ac5ab2

File tree

7 files changed

+284
-3
lines changed

7 files changed

+284
-3
lines changed

src/iceberg/sort_order.cc

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@
2020
#include "iceberg/sort_order.h"
2121

2222
#include <format>
23+
#include <optional>
2324
#include <ranges>
2425

26+
#include "iceberg/result.h"
27+
#include "iceberg/schema.h"
28+
#include "iceberg/sort_field.h"
29+
#include "iceberg/transform.h"
2530
#include "iceberg/util/formatter.h" // IWYU pragma: keep
31+
#include "iceberg/util/macros.h"
2632

2733
namespace iceberg {
2834

@@ -31,7 +37,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector<SortField> fields)
3137

3238
const std::shared_ptr<SortOrder>& SortOrder::Unsorted() {
3339
static const std::shared_ptr<SortOrder> unsorted =
34-
std::make_shared<SortOrder>(/*order_id=*/0, std::vector<SortField>{});
40+
std::make_shared<SortOrder>(kUnsortedOrderId, std::vector<SortField>{});
3541
return unsorted;
3642
}
3743

@@ -80,4 +86,66 @@ bool SortOrder::Equals(const SortOrder& other) const {
8086
return order_id_ == other.order_id_ && fields_ == other.fields_;
8187
}
8288

89+
bool SortOrder::IsBoundToSchema(const Schema& schema) const {
90+
for (const auto& field : fields_) {
91+
auto schema_field = schema.FindFieldById(field.source_id());
92+
if (!schema_field.has_value() || schema_field.value() == std::nullopt) {
93+
return false;
94+
}
95+
96+
const auto& source_type = schema_field.value().value().get().type();
97+
if (!source_type->is_primitive()) {
98+
return false;
99+
}
100+
101+
auto result = field.transform()->ResultType(source_type);
102+
if (!result) {
103+
return false;
104+
}
105+
}
106+
return true;
107+
}
108+
109+
/// \brief Creates a SortOrder after validating it against `schema`.
///
/// Validation steps, in order:
///  1. `kUnsortedOrderId` may only be used with an empty field list, and an empty
///     field list may only be used with `kUnsortedOrderId`.
///  2. Each sort field's source id must resolve to a field of `schema`.
///  3. The resolved field's type must be primitive.
///  4. The sort field's transform must produce a result type for that source type.
///
/// \param schema The schema to bind the sort order to.
/// \param sort_id The sort order id.
/// \param fields The sort fields.
/// \return The new SortOrder, or an InvalidArgument error for rule 1-3 violations;
/// errors from the schema lookup and from Transform::ResultType are passed through
/// the ICEBERG_* macros.
Result<std::unique_ptr<SortOrder>> SortOrder::Make(const Schema& schema, int32_t sort_id,
                                                   std::vector<SortField> fields) {
  const bool uses_unsorted_id = (sort_id == kUnsortedOrderId);

  if (uses_unsorted_id && !fields.empty()) [[unlikely]] {
    return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId);
  }

  if (!uses_unsorted_id && fields.empty()) [[unlikely]] {
    return InvalidArgument("Sort order must have at least one sort field");
  }

  for (const auto& sort_field : fields) {
    ICEBERG_ASSIGN_OR_RAISE(auto source_column,
                            schema.FindFieldById(sort_field.source_id()));
    if (!source_column.has_value()) {
      return InvalidArgument("Cannot find source column for sort field: {}", sort_field);
    }

    const auto& column_type = source_column->get().type();
    if (!column_type->is_primitive()) {
      return InvalidArgument("Cannot sort by non-primitive source field: {}",
                             *column_type);
    }

    // Only the success/failure of the transform check matters here; the resulting
    // type itself is not used.
    ICEBERG_RETURN_UNEXPECTED(sort_field.transform()->ResultType(column_type));
  }

  return std::make_unique<SortOrder>(sort_id, std::move(fields));
}
137+
138+
/// \brief Creates a SortOrder without checking its fields against any schema.
///
/// Only the pairing of id and field list is validated: `kUnsortedOrderId` requires
/// an empty field list, and any other id requires at least one field.
///
/// \param sort_id The sort order id.
/// \param fields The sort fields.
/// \return The new SortOrder, or an InvalidArgument error when the id and the field
/// list do not match as described above.
Result<std::unique_ptr<SortOrder>> SortOrder::Make(int32_t sort_id,
                                                   std::vector<SortField> fields) {
  const bool uses_unsorted_id = (sort_id == kUnsortedOrderId);

  if (uses_unsorted_id && !fields.empty()) [[unlikely]] {
    return InvalidArgument("{} is reserved for unsorted sort order", kUnsortedOrderId);
  }

  if (!uses_unsorted_id && fields.empty()) [[unlikely]] {
    return InvalidArgument("Sort order must have at least one sort field");
  }

  return std::make_unique<SortOrder>(sort_id, std::move(fields));
}
150+
83151
} // namespace iceberg

src/iceberg/sort_order.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@
2020
#pragma once
2121

2222
#include <cstdint>
23+
#include <memory>
2324
#include <span>
2425
#include <vector>
2526

2627
#include "iceberg/iceberg_export.h"
2728
#include "iceberg/sort_field.h"
29+
#include "iceberg/type_fwd.h"
2830
#include "iceberg/util/formattable.h"
2931

3032
namespace iceberg {
@@ -36,6 +38,7 @@ namespace iceberg {
3638
/// applied to the data.
3739
class ICEBERG_EXPORT SortOrder : public util::Formattable {
3840
public:
41+
static constexpr int32_t kUnsortedOrderId = 0;
3942
static constexpr int32_t kInitialSortOrderId = 1;
4043

4144
SortOrder(int32_t order_id, std::vector<SortField> fields);
@@ -69,6 +72,28 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
6972
return lhs.Equals(rhs);
7073
}
7174

75+
/// \brief Checks whether the sort order is bound to the given schema.
76+
/// \param schema The schema to check against.
77+
/// \return true if the sort order is valid for the given schema.
78+
bool IsBoundToSchema(const Schema& schema) const;
79+
80+
/// \brief Create a SortOrder.
81+
/// \param schema The schema to bind the sort order to.
82+
/// \param sort_id The sort order id.
83+
/// \param fields The sort fields.
84+
/// \return A Result containing the SortOrder or an error.
85+
static Result<std::unique_ptr<SortOrder>> Make(const Schema& schema, int32_t sort_id,
86+
std::vector<SortField> fields);
87+
88+
/// \brief Create a SortOrder without binding to a schema.
89+
/// \param sort_id The sort order id.
90+
/// \param fields The sort fields.
91+
/// \return A Result containing the SortOrder or an error.
92+
/// \note This method does not check whether the sort fields are valid for any schema.
93+
/// Use IsBoundToSchema to check if the sort order is valid for a given schema.
94+
static Result<std::unique_ptr<SortOrder>> Make(int32_t sort_id,
95+
std::vector<SortField> fields);
96+
7297
private:
7398
/// \brief Compare two sort orders for equality.
7499
bool Equals(const SortOrder& other) const;

src/iceberg/test/schema_field_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) {
6363
iceberg::SchemaField field1(1, "foo", iceberg::int32(), false);
6464
iceberg::SchemaField field2(2, "foo", iceberg::int32(), false);
6565
iceberg::SchemaField field3(1, "bar", iceberg::int32(), false);
66-
iceberg::SchemaField field4(1, "foo", std::make_shared<iceberg::LongType>(), false);
66+
iceberg::SchemaField field4(1, "foo", iceberg::int64(), false);
6767
iceberg::SchemaField field5(1, "foo", iceberg::int32(), true);
6868
iceberg::SchemaField field6(1, "foo", iceberg::int32(), false);
6969

src/iceberg/test/schema_test.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
#include <gmock/gmock.h>
2626
#include <gtest/gtest.h>
2727

28-
#include "gtest/gtest.h"
2928
#include "iceberg/result.h"
3029
#include "iceberg/schema_field.h"
3130
#include "iceberg/test/matchers.h"

src/iceberg/test/sort_order_test.cc

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,40 @@
2626

2727
#include "iceberg/schema.h"
2828
#include "iceberg/sort_field.h"
29+
#include "iceberg/test/matchers.h"
2930
#include "iceberg/transform.h"
3031
#include "iceberg/util/formatter.h" // IWYU pragma: keep
3132

3233
namespace iceberg {
3334

35+
class SortOrderMakeTest : public ::testing::Test {
36+
protected:
37+
void SetUp() override {
38+
field1_ = std::make_unique<SchemaField>(1, "x", int32(), true);
39+
field2_ = std::make_unique<SchemaField>(2, "y", string(), true);
40+
field3_ = std::make_unique<SchemaField>(3, "time", timestamp(), true);
41+
42+
schema_ = std::make_unique<Schema>(
43+
std::vector<SchemaField>{*field1_, *field2_, *field3_}, 1);
44+
45+
sort_field1_ = std::make_unique<SortField>(
46+
1, Transform::Identity(), SortDirection::kAscending, NullOrder::kFirst);
47+
sort_field2_ = std::make_unique<SortField>(
48+
2, Transform::Bucket(10), SortDirection::kDescending, NullOrder::kLast);
49+
sort_field3_ = std::make_unique<SortField>(
50+
3, Transform::Day(), SortDirection::kAscending, NullOrder::kFirst);
51+
}
52+
53+
std::unique_ptr<Schema> schema_;
54+
std::unique_ptr<SchemaField> field1_;
55+
std::unique_ptr<SchemaField> field2_;
56+
std::unique_ptr<SchemaField> field3_;
57+
58+
std::unique_ptr<SortField> sort_field1_;
59+
std::unique_ptr<SortField> sort_field2_;
60+
std::unique_ptr<SortField> sort_field3_;
61+
};
62+
3463
TEST(SortOrderTest, Basics) {
3564
{
3665
SchemaField field1(5, "ts", iceberg::timestamp(), true);
@@ -148,4 +177,81 @@ TEST(SortOrderTest, Satisfies) {
148177
EXPECT_FALSE(sort_order2.Satisfies(sort_order4));
149178
}
150179

180+
// A non-reserved id with fields that all resolve against the schema yields a sorted
// order that keeps the fields in the order they were passed.
TEST_F(SortOrderMakeTest, MakeValidSortOrder) {
  std::vector<SortField> requested{*sort_field1_, *sort_field2_};
  ICEBERG_UNWRAP_OR_FAIL(auto made,
                         SortOrder::Make(*schema_, 1, std::move(requested)));
  ASSERT_NE(made, nullptr);

  EXPECT_TRUE(made->is_sorted());
  ASSERT_EQ(made->fields().size(), 2);
  EXPECT_EQ(made->fields()[0], *sort_field1_);
  EXPECT_EQ(made->fields()[1], *sort_field2_);
}
191+
192+
// A non-reserved id combined with an empty field list must be rejected.
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderEmptyFields) {
  std::vector<SortField> no_fields;
  auto made = SortOrder::Make(*schema_, 1, std::move(no_fields));

  EXPECT_THAT(made, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(made, HasErrorMessage("Sort order must have at least one sort field"));
}
198+
199+
// The reserved unsorted id must be rejected when sort fields are supplied.
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderUnsortedId) {
  std::vector<SortField> requested{*sort_field1_};
  auto made =
      SortOrder::Make(*schema_, SortOrder::kUnsortedOrderId, std::move(requested));

  EXPECT_THAT(made, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(made,
              HasErrorMessage(std::format("{} is reserved for unsorted sort order",
                                          SortOrder::kUnsortedOrderId)));
}
207+
208+
// The reserved unsorted id with an empty field list is accepted by the schema-less
// overload and produces an unsorted order.
TEST_F(SortOrderMakeTest, MakeValidUnsortedSortOrder) {
  std::vector<SortField> no_fields;
  ICEBERG_UNWRAP_OR_FAIL(
      auto made, SortOrder::Make(SortOrder::kUnsortedOrderId, std::move(no_fields)));
  ASSERT_NE(made, nullptr);

  EXPECT_TRUE(made->is_unsorted());
  EXPECT_EQ(made->fields().size(), 0);
}
216+
217+
// Sorting by a struct-typed column must be rejected, even when the other sort
// fields are valid.
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderNonPrimitiveField) {
  // Column id 4 is a struct with a single required int32 member.
  auto struct_type = std::make_shared<StructType>(std::vector<SchemaField>{
      SchemaField::MakeRequired(41, "inner_field", iceberg::int32()),
  });
  SchemaField struct_column(4, "struct_field", struct_type, true);

  std::vector<SchemaField> columns{*field1_, *field2_, *field3_, struct_column};
  Schema schema_with_struct(std::move(columns), 1);

  SortField sort_by_struct(4, Transform::Identity(), SortDirection::kAscending,
                           NullOrder::kFirst);

  auto made = SortOrder::Make(schema_with_struct, 1,
                              std::vector<SortField>{*sort_field1_, sort_by_struct});
  EXPECT_THAT(made, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(made, HasErrorMessage("Cannot sort by non-primitive source field"));
}
236+
237+
// A sort field whose source id is absent from the schema must be rejected by the
// schema-bound overload.
TEST_F(SortOrderMakeTest, MakeInvalidSortOrderFieldNotInSchema) {
  SortField missing_column(999, Transform::Identity(), SortDirection::kAscending,
                           NullOrder::kFirst);
  std::vector<SortField> requested{*sort_field1_, missing_column};

  auto made = SortOrder::Make(*schema_, 1, std::move(requested));
  EXPECT_THAT(made, IsError(ErrorKind::kInvalidArgument));
  EXPECT_THAT(made, HasErrorMessage("Cannot find source column for sort field:"));
}
246+
247+
// The schema-less overload accepts a field with an unknown source id; the mismatch
// only surfaces when the order is checked against the schema afterwards.
TEST_F(SortOrderMakeTest, MakeUnboundSortOrder) {
  SortField missing_column(999, Transform::Identity(), SortDirection::kAscending,
                           NullOrder::kFirst);
  std::vector<SortField> requested{*sort_field1_, missing_column};

  auto made = SortOrder::Make(1, std::move(requested));
  ASSERT_THAT(made, IsOk());
  ASSERT_FALSE(made.value()->IsBoundToSchema(*schema_));
}
256+
151257
} // namespace iceberg

src/iceberg/transform.cc

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <regex>
2424
#include <utility>
2525

26+
#include "iceberg/result.h"
2627
#include "iceberg/transform_function.h"
2728
#include "iceberg/type.h"
2829

@@ -125,6 +126,84 @@ Result<std::shared_ptr<TransformFunction>> Transform::Bind(
125126
}
126127
}
127128

129+
/// \brief Returns the type produced by applying this transform to `source_type`.
///
/// Per transform:
///  - identity: the source type itself; the source must be primitive.
///  - void: the source type itself, without any check.
///  - unknown: string, regardless of the source type.
///  - bucket: int32 for int, long, decimal, date, time, timestamp, timestamptz,
///    string, uuid, fixed and binary sources.
///  - truncate: the source type itself for int, long, string, binary and decimal.
///  - year / month: int32 for date, timestamp and timestamptz sources.
///  - day: date for date, timestamp and timestamptz sources.
///  - hour: int32 for timestamp and timestamptz sources.
///
/// \param source_type The type of the values fed into the transform.
/// \return The result type, or an InvalidArgument error when the transform does not
/// accept `source_type`.
Result<std::shared_ptr<Type>> Transform::ResultType(
    const std::shared_ptr<Type>& source_type) const {
  // True when the source's type id is one of `accepted`.
  const auto source_is_any_of = [&source_type](std::initializer_list<TypeId> accepted) {
    const auto source_id = source_type->type_id();
    for (const auto candidate : accepted) {
      if (candidate == source_id) {
        return true;
      }
    }
    return false;
  };

  // All rejections share one sentence; only the transform name varies.
  const auto reject =
      [&source_type](const std::string& transform_name) -> Result<std::shared_ptr<Type>> {
    return InvalidArgument("{} is not a valid input type of {} transform",
                           source_type->ToString(), transform_name);
  };

  switch (transform_type_) {
    case TransformType::kIdentity:
      if (source_type->is_primitive()) {
        return source_type;
      }
      return reject("identity");

    case TransformType::kVoid:
      return source_type;

    case TransformType::kUnknown:
      return string();

    case TransformType::kBucket:
      if (source_is_any_of({TypeId::kInt, TypeId::kLong, TypeId::kDecimal,
                            TypeId::kDate, TypeId::kTime, TypeId::kTimestamp,
                            TypeId::kTimestampTz, TypeId::kString, TypeId::kUuid,
                            TypeId::kFixed, TypeId::kBinary})) {
        return int32();
      }
      return reject("bucket");

    case TransformType::kTruncate:
      if (source_is_any_of({TypeId::kInt, TypeId::kLong, TypeId::kString,
                            TypeId::kBinary, TypeId::kDecimal})) {
        return source_type;
      }
      return reject("truncate");

    case TransformType::kYear:
    case TransformType::kMonth:
      if (source_is_any_of(
              {TypeId::kDate, TypeId::kTimestamp, TypeId::kTimestampTz})) {
        return int32();
      }
      // Year and month share this branch, so the name comes from the transform.
      return reject(this->ToString());

    case TransformType::kDay:
      if (source_is_any_of(
              {TypeId::kDate, TypeId::kTimestamp, TypeId::kTimestampTz})) {
        return date();
      }
      return reject("day");

    case TransformType::kHour:
      if (source_is_any_of({TypeId::kTimestamp, TypeId::kTimestampTz})) {
        return int32();
      }
      return reject("hour");

    default:
      std::unreachable();
  }
}
206+
128207
bool Transform::PreservesOrder() const {
129208
switch (transform_type_) {
130209
case TransformType::kUnknown:

src/iceberg/transform.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,10 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
150150
Result<std::shared_ptr<TransformFunction>> Bind(
151151
const std::shared_ptr<Type>& source_type) const;
152152

153+
/// \brief Returns the Type produced by this transform given a source type.
154+
Result<std::shared_ptr<Type>> ResultType(
155+
const std::shared_ptr<Type>& source_type) const;
156+
153157
/// \brief Whether the transform preserves the order of values (is monotonic).
154158
bool PreservesOrder() const;
155159

0 commit comments

Comments
 (0)