Skip to content

Commit 185515a

Browse files
authored
feat: transform function (#61)
1 parent 57d0d3e commit 185515a

17 files changed

+698
-141
lines changed

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ set(ICEBERG_SOURCES
3131
statistics_file.cc
3232
table_metadata.cc
3333
transform.cc
34+
transform_function.cc
3435
type.cc)
3536

3637
set(ICEBERG_STATIC_BUILD_INTERFACE_LIBS)

src/iceberg/json_internal.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ Result<std::unique_ptr<SortField>> SortFieldFromJson(const nlohmann::json& json)
117117
ICEBERG_ASSIGN_OR_RAISE(auto source_id, GetJsonValue<int32_t>(json, kSourceId));
118118
ICEBERG_ASSIGN_OR_RAISE(
119119
auto transform,
120-
GetJsonValue<std::string>(json, kTransform).and_then(TransformFunctionFromString));
120+
GetJsonValue<std::string>(json, kTransform).and_then(TransformFromString));
121121
ICEBERG_ASSIGN_OR_RAISE(
122122
auto direction,
123123
GetJsonValue<std::string>(json, kDirection).and_then(SortDirectionFromString));
@@ -401,7 +401,7 @@ Result<std::unique_ptr<PartitionField>> PartitionFieldFromJson(
401401
ICEBERG_ASSIGN_OR_RAISE(auto field_id, GetJsonValue<int32_t>(json, kFieldId));
402402
ICEBERG_ASSIGN_OR_RAISE(
403403
auto transform,
404-
GetJsonValue<std::string>(json, kTransform).and_then(TransformFunctionFromString));
404+
GetJsonValue<std::string>(json, kTransform).and_then(TransformFromString));
405405
ICEBERG_ASSIGN_OR_RAISE(auto name, GetJsonValue<std::string>(json, kName));
406406
return std::make_unique<PartitionField>(source_id, field_id, name,
407407
std::move(transform));

src/iceberg/partition_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
namespace iceberg {
2929

3030
PartitionField::PartitionField(int32_t source_id, int32_t field_id, std::string name,
31-
std::shared_ptr<TransformFunction> transform)
31+
std::shared_ptr<Transform> transform)
3232
: source_id_(source_id),
3333
field_id_(field_id),
3434
name_(std::move(name)),
@@ -40,9 +40,7 @@ int32_t PartitionField::field_id() const { return field_id_; }
4040

4141
/// \brief Accessor for the partition field name.
/// \return A non-owning view over the name stored in this field.
std::string_view PartitionField::name() const {
  return name_;
}
4242

43-
std::shared_ptr<TransformFunction> const& PartitionField::transform() const {
44-
return transform_;
45-
}
43+
/// \brief Accessor for the transform attached to this partition field.
/// \return A const reference to the stored shared pointer (no copy is made).
const std::shared_ptr<Transform>& PartitionField::transform() const {
  return transform_;
}
4644

4745
std::string PartitionField::ToString() const {
4846
return std::format("{} ({} {}({}))", name_, field_id_, *transform_, source_id_);

src/iceberg/partition_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
4343
/// \param[in] name The partition field name.
4444
/// \param[in] transform The transform function.
4545
PartitionField(int32_t source_id, int32_t field_id, std::string name,
46-
std::shared_ptr<TransformFunction> transform);
46+
std::shared_ptr<Transform> transform);
4747

4848
/// \brief Get the source field ID.
4949
int32_t source_id() const;
@@ -55,7 +55,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
5555
std::string_view name() const;
5656

5757
/// \brief Get the transform type.
58-
std::shared_ptr<TransformFunction> const& transform() const;
58+
std::shared_ptr<Transform> const& transform() const;
5959

6060
std::string ToString() const override;
6161

@@ -74,7 +74,7 @@ class ICEBERG_EXPORT PartitionField : public util::Formattable {
7474
int32_t source_id_;
7575
int32_t field_id_;
7676
std::string name_;
77-
std::shared_ptr<TransformFunction> transform_;
77+
std::shared_ptr<Transform> transform_;
7878
};
7979

8080
} // namespace iceberg

src/iceberg/sort_field.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
namespace iceberg {
2929

30-
SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
30+
SortField::SortField(int32_t source_id, std::shared_ptr<Transform> transform,
3131
SortDirection direction, NullOrder null_order)
3232
: source_id_(source_id),
3333
transform_(std::move(transform)),
@@ -36,9 +36,7 @@ SortField::SortField(int32_t source_id, std::shared_ptr<TransformFunction> trans
3636

3737
/// \brief Accessor for the source field ID.
/// \return The source ID this sort field was constructed with.
int32_t SortField::source_id() const {
  return source_id_;
}
3838

39-
std::shared_ptr<TransformFunction> const& SortField::transform() const {
40-
return transform_;
41-
}
39+
/// \brief Accessor for the transform attached to this sort field.
/// \return A const reference to the stored shared pointer (no copy is made).
const std::shared_ptr<Transform>& SortField::transform() const {
  return transform_;
}
4240

4341
/// \brief Accessor for the sort direction.
/// \return The direction this sort field was constructed with.
SortDirection SortField::direction() const {
  return direction_;
}
4442

src/iceberg/sort_field.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,14 +97,14 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
9797
/// \param[in] transform The transform function.
9898
/// \param[in] direction The sort direction.
9999
/// \param[in] null_order The null order.
100-
SortField(int32_t source_id, std::shared_ptr<TransformFunction> transform,
100+
SortField(int32_t source_id, std::shared_ptr<Transform> transform,
101101
SortDirection direction, NullOrder null_order);
102102

103103
/// \brief Get the source field ID.
104104
int32_t source_id() const;
105105

106106
/// \brief Get the transform type.
107-
const std::shared_ptr<TransformFunction>& transform() const;
107+
const std::shared_ptr<Transform>& transform() const;
108108

109109
/// \brief Get the sort direction.
110110
SortDirection direction() const;
@@ -127,7 +127,7 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
127127
[[nodiscard]] bool Equals(const SortField& other) const;
128128

129129
int32_t source_id_;
130-
std::shared_ptr<TransformFunction> transform_;
130+
std::shared_ptr<Transform> transform_;
131131
SortDirection direction_;
132132
NullOrder null_order_;
133133
};

src/iceberg/transform.cc

Lines changed: 173 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,66 +20,206 @@
2020
#include "iceberg/transform.h"
2121

2222
#include <charconv>
#include <format>
#include <regex>
#include <system_error>
2324

24-
namespace iceberg {
25+
#include "iceberg/transform_function.h"
26+
#include "iceberg/type.h"
2527

28+
namespace iceberg {
2629
namespace {
// Canonical textual names of the transform kinds. They are shared by the
// enum-to-string conversion and by the string parser in this file, so both
// directions agree on spelling.
constexpr std::string_view kUnknownName{"unknown"};
constexpr std::string_view kIdentityName{"identity"};
constexpr std::string_view kBucketName{"bucket"};
constexpr std::string_view kTruncateName{"truncate"};
constexpr std::string_view kYearName{"year"};
constexpr std::string_view kMonthName{"month"};
constexpr std::string_view kDayName{"day"};
constexpr std::string_view kHourName{"hour"};
constexpr std::string_view kVoidName{"void"};
}  // namespace
40+
41+
/// \brief Map a TransformType to its canonical lowercase name.
///
/// \param type The transform kind to name.
/// \return The matching name constant, or "invalid" when `type` holds a value
///         that is not one of the enumerators handled below.
constexpr std::string_view TransformTypeToString(TransformType type) {
  // Deliberately no `default:` label, so a compiler that warns on unhandled
  // enumerators can still flag a newly added TransformType missing here.
  switch (type) {
    case TransformType::kUnknown:
      return kUnknownName;
    case TransformType::kIdentity:
      return kIdentityName;
    case TransformType::kBucket:
      return kBucketName;
    case TransformType::kTruncate:
      return kTruncateName;
    case TransformType::kYear:
      return kYearName;
    case TransformType::kMonth:
      return kMonthName;
    case TransformType::kDay:
      return kDayName;
    case TransformType::kHour:
      return kHourName;
    case TransformType::kVoid:
      return kVoidName;
  }
  // Reached only for an out-of-range value (for example one produced by a
  // cast). Without this return, control would flow off the end of a
  // value-returning function, which is undefined behavior.
  return "invalid";
}
52-
} // namespace
5363

54-
TransformFunction::TransformFunction(TransformType type) : transform_type_(type) {}
64+
std::shared_ptr<Transform> Transform::Identity() {
65+
static auto instance =
66+
std::shared_ptr<Transform>(new Transform(TransformType::kIdentity));
67+
return instance;
68+
}
69+
70+
std::shared_ptr<Transform> Transform::Year() {
71+
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kYear));
72+
return instance;
73+
}
5574

56-
TransformType TransformFunction::transform_type() const { return transform_type_; }
75+
std::shared_ptr<Transform> Transform::Month() {
76+
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kMonth));
77+
return instance;
78+
}
79+
80+
std::shared_ptr<Transform> Transform::Day() {
81+
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kDay));
82+
return instance;
83+
}
84+
85+
std::shared_ptr<Transform> Transform::Hour() {
86+
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kHour));
87+
return instance;
88+
}
89+
90+
std::shared_ptr<Transform> Transform::Void() {
91+
static auto instance = std::shared_ptr<Transform>(new Transform(TransformType::kVoid));
92+
return instance;
93+
}
94+
95+
std::shared_ptr<Transform> Transform::Bucket(int32_t num_buckets) {
96+
return std::shared_ptr<Transform>(new Transform(TransformType::kBucket, num_buckets));
97+
}
98+
99+
std::shared_ptr<Transform> Transform::Truncate(int32_t width) {
100+
return std::shared_ptr<Transform>(new Transform(TransformType::kTruncate, width));
101+
}
102+
103+
Transform::Transform(TransformType transform_type) : transform_type_(transform_type) {}
104+
105+
Transform::Transform(TransformType transform_type, int32_t param)
106+
: transform_type_(transform_type), param_(param) {}
107+
108+
/// \brief Accessor for the kind of this transform.
/// \return The TransformType given at construction.
TransformType Transform::transform_type() const {
  return transform_type_;
}
109+
110+
/// \brief Create the concrete transform function for a source type.
///
/// \param source_type The type handed to the created transform function.
/// \return The function object matching this transform's kind. Returns
///         ErrorKind::kInvalidArgument when a bucket/truncate transform has no
///         int32 parameter stored, and ErrorKind::kNotSupported for any kind
///         without a dedicated case below.
Result<std::unique_ptr<TransformFunction>> Transform::Bind(
    const std::shared_ptr<Type>& source_type) const {
  // Only the error paths use the name.
  const auto type_str = TransformTypeToString(transform_type_);

  switch (transform_type_) {
    case TransformType::kIdentity:
      return std::make_unique<IdentityTransform>(source_type);
    case TransformType::kYear:
      return std::make_unique<YearTransform>(source_type);
    case TransformType::kMonth:
      return std::make_unique<MonthTransform>(source_type);
    case TransformType::kDay:
      return std::make_unique<DayTransform>(source_type);
    case TransformType::kHour:
      return std::make_unique<HourTransform>(source_type);
    case TransformType::kVoid:
      return std::make_unique<VoidTransform>(source_type);

    case TransformType::kBucket: {
      const auto* num_buckets = std::get_if<int32_t>(&param_);
      if (num_buckets == nullptr) {
        return unexpected<Error>({
            .kind = ErrorKind::kInvalidArgument,
            .message = std::format(
                "Bucket requires int32 param, none found in transform '{}'", type_str),
        });
      }
      return std::make_unique<BucketTransform>(source_type, *num_buckets);
    }

    case TransformType::kTruncate: {
      const auto* width = std::get_if<int32_t>(&param_);
      if (width == nullptr) {
        return unexpected<Error>({
            .kind = ErrorKind::kInvalidArgument,
            .message = std::format(
                "Truncate requires int32 param, none found in transform '{}'", type_str),
        });
      }
      return std::make_unique<TruncateTransform>(source_type, *width);
    }

    default:
      return unexpected<Error>({
          .kind = ErrorKind::kNotSupported,
          .message = std::format("Unsupported transform type: '{}'", type_str),
      });
  }
}
61158

62159
bool TransformFunction::Equals(const TransformFunction& other) const {
63-
return transform_type_ == other.transform_type_;
160+
return transform_type_ == other.transform_type_ && *source_type_ == *other.source_type_;
161+
}
162+
163+
/// \brief Render this transform as text.
///
/// Bucket and truncate transforms render as "name[param]"; every other kind
/// renders as its bare name. If a bucket/truncate transform has no int32
/// parameter stored, the bare name is returned instead of throwing.
///
/// \return The textual form of the transform.
std::string Transform::ToString() const {
  const std::string_view name = TransformTypeToString(transform_type_);

  switch (transform_type_) {
    case TransformType::kBucket:
    case TransformType::kTruncate:
      // std::get_if (as in Bind) instead of std::get, which would throw
      // std::bad_variant_access when no int32 parameter is stored.
      if (const auto* param = std::get_if<int32_t>(&param_)) {
        return std::format("{}[{}]", name, *param);
      }
      break;
    case TransformType::kIdentity:
    case TransformType::kYear:
    case TransformType::kMonth:
    case TransformType::kDay:
    case TransformType::kHour:
    case TransformType::kVoid:
    case TransformType::kUnknown:
      break;
  }
  // Single exit for all parameterless cases, so every path returns a value.
  return std::string(name);
}
65179

66-
IdentityTransformFunction::IdentityTransformFunction()
67-
: TransformFunction(TransformType::kIdentity) {}
180+
TransformFunction::TransformFunction(TransformType transform_type,
181+
std::shared_ptr<Type> source_type)
182+
: transform_type_(transform_type), source_type_(std::move(source_type)) {}
68183

69-
expected<ArrowArray, Error> IdentityTransformFunction::Transform(
70-
const ArrowArray& input) {
71-
return unexpected<Error>({.kind = ErrorKind::kNotSupported,
72-
.message = "IdentityTransformFunction::Transform"});
184+
/// \brief Accessor for the kind of this transform function.
/// \return The TransformType given at construction.
TransformType TransformFunction::transform_type() const {
  return transform_type_;
}
185+
186+
std::shared_ptr<Type> const& TransformFunction::source_type() const {
187+
return source_type_;
188+
}
189+
190+
bool Transform::Equals(const Transform& other) const {
191+
return transform_type_ == other.transform_type_ && param_ == other.param_;
73192
}
74193

75-
expected<std::unique_ptr<TransformFunction>, Error> TransformFunctionFromString(
76-
std::string_view str) {
77-
if (str == "identity") {
78-
return std::make_unique<IdentityTransformFunction>();
194+
/// \brief Parse the textual form of a transform.
///
/// Accepted inputs are the parameterless names ("identity", "year", "month",
/// "day", "hour", "void") and the parameterized forms "bucket[N]" and
/// "truncate[N]", where N is one or more decimal digits.
///
/// \param transform_str The text to parse.
/// \return The matching Transform, or an ErrorKind::kInvalidArgument error
///         when the text is not recognized or N does not fit in an int32_t.
Result<std::shared_ptr<Transform>> TransformFromString(std::string_view transform_str) {
  if (transform_str == kIdentityName) return Transform::Identity();
  if (transform_str == kYearName) return Transform::Year();
  if (transform_str == kMonthName) return Transform::Month();
  if (transform_str == kDayName) return Transform::Day();
  if (transform_str == kHourName) return Transform::Hour();
  if (transform_str == kVoidName) return Transform::Void();

  // Match bucket[16] or truncate[4]: a known name, '[', digits, ']'.
  const auto open = transform_str.find('[');
  if (open != std::string_view::npos && transform_str.back() == ']') {
    const std::string_view name = transform_str.substr(0, open);
    // Text strictly between '[' and the trailing ']'.
    const std::string_view digits =
        transform_str.substr(open + 1, transform_str.size() - open - 2);

    const bool is_bucket = (name == kBucketName);
    const bool is_truncate = (name == kTruncateName);
    const bool only_digits =
        !digits.empty() &&
        digits.find_first_not_of("0123456789") == std::string_view::npos;

    if ((is_bucket || is_truncate) && only_digits) {
      // std::from_chars reports overflow through its result instead of
      // throwing (std::stoi throws std::out_of_range), so an oversized
      // parameter becomes the regular "invalid" error below.
      int32_t param = 0;
      const char* const last = digits.data() + digits.size();
      const auto [ptr, ec] = std::from_chars(digits.data(), last, param);
      if (ec == std::errc{} && ptr == last) {
        return is_bucket ? Transform::Bucket(param) : Transform::Truncate(param);
      }
    }
  }

  return unexpected<Error>({
      .kind = ErrorKind::kInvalidArgument,
      .message = std::format("Invalid Transform string: {}", transform_str),
  });
}
84224

85225
} // namespace iceberg

0 commit comments

Comments
 (0)